
    h              
          S r SSKrSSKrSSKJr  SSKJr  SSKJrJ	r	J
r
  SSKJrJrJrJrJrJrJrJr  SSKJr  \R,                  R/                  SS	5        \S
:X  Ga  \" \\\45      r\R5                  5       u  rrr\R<                  S;   a  \R<                  O\" \\R<                  5      r\ " \RB                  \RD                  \S9\l#        \" \5      r$\$b#  \" 5       \RF                  S'   \$\RF                  S'   \" SSS9r%\%RM                  SSS9r%Sr'S r(\%RS                  \(5      r%S r*\%RW                  \*5      r%S r,\%RS                  \,5      r%\%S   r-\R\                  S:w  a  \%S   OSr/S\0\1   4S jr2\" \Rf                  \\\2/\-\/\" \5      S9r4\4Rk                  5         \4Rm                  \Rn                  5        \Rp                  (       a  \4Rq                  \Rr                  S9  ggg)aZ  
pip install math_verify

# For Qwen/Qwen2.5-VL-3B-Instruct
accelerate launch     --config_file examples/accelerate_configs/deepspeed_zero3.yaml     examples/scripts/grpo_vlm.py     --model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct     --output_dir grpo-Qwen2.5-VL-3B-Instruct     --learning_rate 1e-5     --gradient_checkpointing     --dtype bfloat16     --max_prompt_length 2048     --max_completion_length 1024     --use_vllm     --vllm_mode colocate     --use_peft     --lora_target_modules "q_proj", "v_proj"     --log_completions

# For HuggingFaceTB/SmolVLM2-2.2B-Instruct
pip install num2words

accelerate launch     --config_file examples/accelerate_configs/deepspeed_zero3.yaml     examples/scripts/grpo_vlm.py     --model_name_or_path HuggingFaceTB/SmolVLM2-2.2B-Instruct     --output_dir grpo-SmolVLM2-2.2B-Instruct     --learning_rate 1e-5     --dtype bfloat16     --max_prompt_length 2048     --max_completion_length 1024     --use_peft     --lora_target_modules "q_proj", "v_proj"     --log_completions     --per_device_train_batch_size 1     --gradient_accumulation_steps 2     --num_generations 2

    N)load_dataset)NormalizationConfig)LatexExtractionConfigparseverify)
GRPOConfigGRPOTrainerModelConfigScriptArguments	TrlParserget_kbit_device_mapget_peft_configget_quantization_config)think_format_rewardTRACKIO_SPACE_IDztrl-trackio__main__)autoN)revisionattn_implementationdtype
device_mapquantization_configz'lmms-lab/multimodal-open-r1-8k-verifiedtrain)splitd   *   )	test_sizeseeda[  A conversation between user and assistant. The user asks a question, and the assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think></think> tags, i.e., <think>
This is my reasoning.
</think>
This is my answer.c                 ,    S[         S.SU S   S./nSU0$ )Nsystem)rolecontentuserproblemprompt)SYSTEM_PROMPT)exampler%   s     S/home/james-whalen/.local/lib/python3.13/site-packages/examples/scripts/grpo_vlm.pymake_conversationr)   |   s+    -8	(:;
 &!!    c                 b    U S   nUR                   S   S:  =(       a    UR                   S   S:  $ )Nimager   i      )sizer'   r,   s     r(   filter_big_imagesr0      s1     zz!}s":uzz!}s'::r*   c                 Z    U S   nUR                   S:w  a  UR                  S5      nXS'   U $ )Nr,   RGB)modeconvertr/   s     r(   convert_to_rgbr5      s3     ::MM%(E r*   notestsolutionc                 6   / nU  Vs/ s H
  oDS   S   PM     nn[        XQ5       H  u  pg [        USS9n[        U5      S:w  a6   [        U[	        [        SSSSSS9SSS	9/SS
9n	[        [        X5      5      n
OF[        UR                  5       R                  5       UR                  5       R                  5       :H  5      n
UR                  U
5        M     U$ s  snf ! [         a    / n Nf = f! [         a   n[        SU SU SU 35        Sn
 SnANSSnAff = f)u   Reward function that checks if the completion matches the ground truth.
- If both gold and prediction are parseable → use math verification.
- If not parseable → compare as normalized text.
r   r"   first_match)extraction_modeFTall)nitsmalformed_operatorsbasic_latexboxedunits)normalization_configboxed_match_prioritytry_extract_without_anchor)extraction_configr;   zverify failed: z
, answer: z, gold: N)zipr   	Exceptionlenr   r   floatr   printstriplowerappend)completionsr8   kwargsrewards
completioncontentsr"   solgold_parsedanswer_parsedrewardes               r(   accuracy_rewardrX      s;   
 ?JK{qM),{K3LG!#CG ;1$"$)15H).8=04*/*.6" 67;@
+ )6%M" #6+#EFF w}}446#))+:K:K:MMNNN6"E 4H K L  ! !0 ! "OA3j	#OP!F"s.   C
C4C.C+*C+.
D8DD)modelargsreward_funcstrain_dataseteval_datasetpeft_config)dataset_name):__doc__ostorchdatasetsr   latex2sympy2_extendedr   math_verifyr   r   r   trlr   r	   r
   r   r   r   r   r   trl.rewardsr   environ
setdefault__name__parserparse_args_and_configscript_argstraining_args
model_argsr   getattrdictmodel_revisionr   model_init_kwargsr   datasettrain_test_splitr&   r)   mapr0   filterr5   r\   eval_strategyr]   liststrrX   model_name_or_pathtrainerr   
save_model
output_dirpush_to_hubr_    r*   r(   <module>r      s  8'R 
  ! 5 < <	 	 	 , 

  (- 8 z[ABF-3-I-I-K*K
 !+ 0 0N BJPUWaWgWgHhE&***&::'M#
 2*=&8K8M''5AT''(=>
 DGTG&&2&>G	3 " kk+,G; nn./G kk.)GG$M&3&A&AT&I76?tL
+tCy +` ++)?;#!#J/G MMO }//0  )A)AB !w r*   