
    h&                         S r SSKrSSKrSSKrSSKJr  SSKJr  SSKJ	r	J
r
Jr  SSKJrJrJr  SSKJrJrJrJrJrJrJrJrJr  SSKJr  \R6                  R9                  S	S
5        \S:X  Ga  \" \\\45      r\R?                  5       u  r r!r"SS0\!l#        \"RH                  S;   a  \"RH                  O\%" \\"RH                  5      r$\&" \"RN                  \"RP                  \$\!RR                  (       a  SOSS9r*\" \"5      r+\+b  \" 5       \*S'   \+\*S'   \RX                  " \"RZ                  5      r.\%" \\.R^                  S   5      r0\0RX                  " \"RZ                  4S\"Rb                  0\*D6r2Sr3Sr4\RX                  " \"RZ                  \"Rb                  S9r5\6" \5S5      (       aM  S\5Rn                  l8        \5Rn                  Rr                  c%  \5Rn                  Rt                  \5Rn                  l;        \" SSS9r<\<R{                  SSS9r<Sr>S r?\<R                  \?5      r<S rA\<R                  \A5      r<S  rC\<R                  \C5      r<\<S   rD\!R                  S!:w  a  \<S"   OSrFS#\G\H   4S$ jrI\" \2\\I/\!\D\F\5\" \"5      S%9rJ\!R                  S!:w  a5  \" \!R                  S\!R                  S&9rM\" \J\MS'S(9rN\JR                  \N5        \JR                  5         \JR                  \!R                  5        \!R                  (       a  \JR                  SS)9  ggg)*a  
pip install math_verify

# For Qwen/Qwen2.5-VL-3B-Instruct
accelerate launch     --config_file examples/accelerate_configs/deepspeed_zero3.yaml     examples/scripts/online_dpo_vlm.py     --model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct     --reward_model_path Qwen/Qwen2.5-VL-3B-Instruct     --output_dir online-dpo-Qwen2.5-VL-3B-Instruct     --learning_rate 1e-5     --gradient_checkpointing     --dtype bfloat16     --max_length 1536     --max_new_tokens 1024     --use_vllm     --vllm_mode server     --use_peft     --lora_target_modules "q_proj", "v_proj"     --per_device_train_batch_size 1     --gradient_accumulation_steps 2

# For HuggingFaceTB/SmolVLM2-2.2B-Instruct
pip install num2words

accelerate launch     --config_file examples/accelerate_configs/deepspeed_zero3.yaml     examples/scripts/online_dpo_vlm.py     --model_name_or_path HuggingFaceTB/SmolVLM2-2.2B-Instruct     --reward_model_path HuggingFaceTB/SmolVLM2-2.2B-Instruct     --output_dir online-dpo-SmolVLM2-2.2B-Instruct     --learning_rate 1e-5     --dtype bfloat16     --max_length 1536     --max_new_tokens 1024     --use_peft     --lora_target_modules "q_proj", "v_proj"     --per_device_train_batch_size 1     --gradient_accumulation_steps 2

# Single GPU test command:
python examples/scripts/online_dpo_vlm.py     --model_name_or_path HuggingFaceTB/SmolVLM2-2.2B-Instruct     --reward_model_path HuggingFaceTB/SmolVLM2-2.2B-Instruct     --output_dir online-dpo-SmolVLM2-2.2B-Instruct-test     --learning_rate 1e-5     --dtype bfloat16     --max_length 1536     --max_new_tokens 128     --use_peft     --lora_target_modules "q_proj", "v_proj"     --per_device_train_batch_size 1     --gradient_accumulation_steps 1     --max_steps 2     --logging_steps 1     --trust_remote_code
    N)load_dataset)NormalizationConfig)LatexExtractionConfigparseverify)
AutoConfigAutoProcessorGenerationConfig)	LogCompletionsCallbackModelConfigOnlineDPOConfigOnlineDPOTrainerScriptArguments	TrlParserget_kbit_device_mapget_peft_configget_quantization_config)think_format_rewardTRACKIO_SPACE_IDztrl-trackio__main__use_reentrantT)autoNF)revisionattn_implementationdtype	use_cache
device_mapquantization_configtrust_remote_code)r   	tokenizerleftz'lmms-lab/multimodal-open-r1-8k-verifiedtrain)splitd   *   )	test_sizeseeda[  A conversation between user and assistant. The user asks a question, and the assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think></think> tags, i.e., <think>
This is my reasoning.
</think>
This is my answer.c                 2    S[         S.SU S   S./nXS   S.$ )Nsystem)rolecontentuserproblemimage)promptr.   )SYSTEM_PROMPT)exampler/   s     Y/home/james-whalen/.local/lib/python3.13/site-packages/examples/scripts/online_dpo_vlm.pymake_conversationr3      s0     -8	(:;
 !7+;<<    c                 b    U S   nUR                   S   S:  =(       a    UR                   S   S:  $ )Nr.   r   i      )sizer1   r.   s     r2   filter_big_imagesr9      s1     zz!}s":uzz!}s'::r4   c                 Z    U S   nUR                   S:w  a  UR                  S5      nXS'   U $ )Nr.   RGB)modeconvertr8   s     r2   convert_to_rgbr>      s3     ::MM%(E r4   notestsolutionc                 6   / nU  Vs/ s H
  oDS   S   PM     nn[        XQ5       H  u  pg [        USS9n[        U5      S:w  a6   [        U[	        [        SSSSSS9SSS	9/SS
9n	[        [        X5      5      n
OF[        UR                  5       R                  5       UR                  5       R                  5       :H  5      n
UR                  U
5        M     U$ s  snf ! [         a    / n Nf = f! [         a   n[        SU SU SU 35        Sn
 SnANSSnAff = f)u   Reward function that checks if the completion matches the ground truth.
- If both gold and prediction are parseable → use math verification.
- If not parseable → compare as normalized text.
r   r+   first_match)extraction_modeFTall)nitsmalformed_operatorsbasic_latexboxedunits)normalization_configboxed_match_prioritytry_extract_without_anchor)extraction_configrD   zverify failed: z
, answer: z, gold: N)zipr   	Exceptionlenr   r   floatr   printstriplowerappend)completionsrA   kwargsrewards
completioncontentsr+   solgold_parsedanswer_parsedrewardes               r2   accuracy_rewardra      s;   
 ?JK{qM),{K3LG!#CG ;1$"$)15H).8=04*/*.6" 67;@
+ )6%M" #6+#EFF w}}446#))+:K:K:MMNNN6"E 4H K L  ! !0 ! "OA3j	#OP!F"s.   C
C4C.C+*C+.
D8DD)modelreward_funcsargstrain_dataseteval_datasetprocessing_classpeft_config)max_new_tokens	do_sampletemperature   )num_prompts)dataset_name)T__doc__ostorchtransformersdatasetsr   latex2sympy2_extendedr   math_verifyr   r   r   r   r	   r
   trlr   r   r   r   r   r   r   r   r   trl.rewardsr   environ
setdefault__name__parserparse_args_and_configscript_argstraining_args
model_argsgradient_checkpointing_kwargsr   getattrdictmodel_revisionr   gradient_checkpointingmodel_kwargsr   from_pretrainedmodel_name_or_pathconfigarchitecturesarchitecturer   rb   reward_modelreward_processor	processorhasattrr    padding_sidepad_token_id	eos_token	pad_tokendatasettrain_test_splitr0   r3   mapr9   filterr>   re   eval_strategyrf   liststrra   trainerri   rk   generation_configcompletions_callbackadd_callbackr"   
save_model
output_dirpush_to_hub r4   r2   <module>r      sK  68t 
   ! 5 < < D D
 
 
 , 

  (- 8 z+FGF-3-I-I-K*K
3BD2IM/ * 0 0N BJPUWaWgWgHhE**&::(??%T	L 2*=&%8%:\".A*+ ''
(E(EFF<)=)=a)@AL((%%9C9U9UYeE L --%%$66I y+&&+1	(++3,5,?,?,I,II)
 DGTG&&2&>G	3 = kk+,G; nn./G kk.)GG$M&3&A&AT&I76?tL
+tCy +` )?;#!"#J/G ""d*,(774UbUnUn
  6g?P^_`12MMO }//0  )RS !y r4   