
    h2                         S r SSKrSSKrSSKJr  SSKJr  SSKJrJ	r	J
r
  SSKJr  SSKJrJr  SSKJr  \R$                  R'                  S	S
5        S r\S:X  a  \" 5         gg)z
pip install math_verify num2words peft trackio vllm
export TRACKIO_PROJECT="RLOO-NuminaMath-TIR"
accelerate launch --config_file examples/accelerate_configs/deepspeed_zero3.yaml examples/scripts/rloo.py
    N)load_dataset)NormalizationConfig)LatexExtractionConfigparseverify)
LoraConfig)
RLOOConfigRLOOTrainer)think_format_rewardTRACKIO_SPACE_IDztrl-trackioc                    ^ [        SSS/S9u  pSmU4S jnU R                  USS/S	9n UR                  USS/S	9nS
[        [           4S jn[	        SS[
        R                  0S[        SS9SSSSSSSSSSS9n[        SU[        U/U U[        5       S9nUR                  5         UR                  UR                  5        UR                  SS9  g )NzAI-MO/NuminaMath-TIRz
train[:5%]z	test[:5%])splita[  A conversation between user and assistant. The user asks a question, and the assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think></think> tags, i.e., <think>
This is my reasoning.
</think>
This is my answer.c                 "   > SSTS.SU S   S./0$ )Npromptsystem)rolecontentuserproblem )exampleSYSTEM_PROMPTs    O/home/james-whalen/.local/lib/python3.13/site-packages/examples/scripts/rloo.pymake_conversationmain.<locals>.make_conversation<   s*    !m<GI,>?
 	
    messagesr   )remove_columnssolutionc                 6   / nU  Vs/ s H
  oDS   S   PM     nn[        XQ5       H  u  pg [        USS9n[        U5      S:w  a6   [        U[	        [        SSSSSS9SSS	9/SS
9n	[        [        X5      5      n
OF[        UR                  5       R                  5       UR                  5       R                  5       :H  5      n
UR                  U
5        M     U$ s  snf ! [         a    / n Nf = f! [         a   n[        SU SU SU 35        Sn
 SnANSSnAff = f)u   Reward function that checks if the completion matches the ground truth.
- If both gold and prediction are parseable → use math verification.
- If not parseable → compare as normalized text.
r   r   first_match)extraction_modeFTall)nitsmalformed_operatorsbasic_latexboxedunits)normalization_configboxed_match_prioritytry_extract_without_anchor)extraction_configr"   zverify failed: z
, answer: z, gold: N)zipr   	Exceptionlenr   r   floatr   printstriplowerappend)completionsr   kwargsrewards
completioncontentsr   solgold_parsedanswer_parsedrewardes               r   accuracy_rewardmain.<locals>.accuracy_rewardH   s;   
 ?JK{qM),{K3LG!#CG ;1$"$)15H).8=04*/*.6" 67;@
+ )6%M" #6+#EFF w}}446#))+:K:K:MMNNN6"E 4H K L  ! !0 ! "OA3j	#OP!F"s.   C
C4C.C+*C+.
D8DDzQwen3-0.6B-RLOOdtypegh㈵>F)use_reentrantT   i   i      colocateg      ?zQwen3-0.6B-RLOO-NuminaMath-TIR)
output_dirmodel_init_kwargslearning_rategradient_checkpointing_kwargslog_completionsnum_completions_to_printmax_prompt_lengthmax_completion_lengthgradient_accumulation_stepssteps_per_generationuse_vllm	vllm_modevllm_gpu_memory_utilizationrun_namezQwen/Qwen3-0.6B)modelargsreward_funcstrain_dataseteval_datasetpeft_config)dataset_name)r   mapliststrr	   torchbfloat16dictr
   r   r   train
save_modelrF   push_to_hub)rW   rX   r   r?   training_argstrainerr   s         @r   mainrf   1   s   "./El\gMh"iM	3 
 "%%&7U^H_%`M##$5zS\F]#^L+tCy +\ $"ENN3&*&?!""$%$'1M" )?;#!LG MMO }//0%;<r   __main__)__doc__osr^   datasetsr   latex2sympy2_extendedr   math_verifyr   r   r   peftr   trlr	   r
   trl.rewardsr   environ
setdefaultrf   __name__r   r   r   <module>rs      sY   6 
  ! 5 < <  ' + 

  (- 8c=L zF r   