
    f:i             
          S r SSKJr  SSKrSSKJr  SSKJr  SSKJrJ	r	J
r
JrJrJrJrJr  SSKJrJrJrJrJrJrJrJrJrJrJrJrJ
r
JrJrJrJrJ r J!r!J"r"J#r#J$r$J%r%J&r&JrJ'r'J(r(J)r)J*r*J+r+J,r,J-r-J.r.J/r/J0r0J1r1J2r2J3r3J4r4J5r5J6r6J7r7J8r8J9r9J:r:J;r;J<r<J=r=J>r>J?r?J@r@JArAJBrBJCrCJDrDJrJErEJFrFJGrGJHrHJIrIJJrJJKrKJLrLJMrMJNrNJOrOJPrPJQrQJRrRJSrSJTrTJUrUJVrVJWrWJrJXrXJYrYJZrZJ[r[JrJrJrJ)r)J*r*J+r+J1r1J2r2J7r7J9r9J@r@JArAJBrBJCrCJDrDJErEJFrFJGrGJKrKJNrNJPrPJrJXrXJYrYJ[r[J0r0JFrFJGrGJPrPJRrRJrJXrXJPrPJrJrJOrOJPrPJTrTJUrUJVrVJrJZrZJ
r
JrJ%r%J?r?JFrFJPrPJrJrJrJFrFJPrPJrJrJPrPJrJrJ%r%J1r1JFrFJPrPJr  SSKFrFSSK7  SSK\J]r]J^r^  SS	K_J`r`  SSKrSSKarbSS
KcJErE  SSKJr  SSKXJdrdJerf  SSKgJhrh  SSKiriSSKjJkrk  S rl SSSSSS.rm\R                  " SS\mS9S 5       roS\R                  S\pS\pS\R                  4S jrqS\R                  S\R                  S\pS\pS\R                  4
S jrrS\R                  S\pS\R                  4S jrsS  rt " S! S"\R                  R                  5      rw S0S# jrx\R                  " SS\mS9S$ 5       ryS% rz\] " S& S'\5      5       r{  " S( S)\%5      r| " S* S+\|5      r} \~" \?S,5      (       a4  SSK@r@ " S- S.\@R                  5      r \?GR                  " \" S/5      5        gg)1z;
2025.10.10
2025.10.9
4.56.2
0.23.0
__UNSLOTH_VERSIONING__
    )TensorN)
functional)AnyListOptionalTupleUnionDictSetCallable)r   
AutoConfig"AutoModelForSequenceClassificationAutoProcessorAutoTokenizer
DataLoaderDatasetFSDP
GRPOConfigGRPOTrainerGenerationConfigIterableDatasetr   Path
PeftConfigPreTrainedModelPreTrainedTokenizerBaseProcessorMixinRepeatSampler
RewardFuncSamplerSyncRefModelCallbackTrainerTrainerCallbackr	   
VLLMClient_ForwardRedirectionapply_chat_templatebroadcast_object_listcopydatasetsdefaultdictdequedisable_dropout_in_modelentropy_from_logitsgathergather_objectgenerate_model_cardget_comet_experiment_urlidentityinspectis_conversationalis_datasets_availableis_flash_attn_2_availableis_liger_kernel_availableis_peft_modelis_rich_availableis_vllm_availableis_wandb_availableloggerloggingmaybe_apply_chat_templatenanmaxnanminnanstdnnnullcontextospadpartialprepare_deepspeedprepare_fsdpprepare_multimodal_messagesprepare_peft_modelprint_prompt_completions_sampleprofiling_contextprofiling_decoratorreseed_workerselective_log_softmaxset_seedshuffle_sequence_dictsplit_pixel_values_by_gridsplit_tensor_dicttextwraptorchtransformerstruncate_with_protected_tokensunsplit_pixel_values_by_gridunwrap_model_for_generationr   r   r	   r%   r&   r'   r-   r.   r3   r5   r<   r=   r>   r?   r@   rB   rC   rD   rH   rK   rM   rU   rV   rW   rY   r,   rC   rD   rM   rO   rU   rV   rM   r   r	   rL   rM   rQ   rR   rS   rU   rX   r   r   r!   r;   rC   rM   rU   r   rA   rC   rM   r   rA   rM   rU   r   r!   r-   rC   rM   rU   )*)	dataclassfield)Version)rB   )DataCollatorForSeq2SeqDataCollatorForLanguageModeling)ParallelMode)
MethodTypec                 F   ^  [         R                  " T 5      U 4S j5       nU$ )Nc                 8  > [        U S5      (       a5  [        U R                  S5      (       a  U R                  R                  5         T" U /UQ70 UD6n[        U S5      (       a5  [        U R                  S5      (       a  U R                  R                  5         U$ )Nmodelfor_trainingfor_inference)hasattrrd   re   rf   )selfargskwargsoutputfs       ?/home/james-whalen/unsloth_compiled_cache/UnslothGRPOTrainer.pywrapper*prepare_for_training_mode.<locals>.wrapper0   sx     4!!gdjj.&I&IJJ##%4)$)&)4!!gdjj/&J&JJJ$$&    )	functoolswraps)rl   rn   s   ` rm   prepare_for_training_moders   /   s%    __Q  Nrp   TF)epilogue_fusionmax_autotuneshape_paddingztrace.enabledztriton.cudagraphs)dynamic	fullgraphoptionsc                 d   [         R                  " U R                  SU R                  S   5      SSS9n[         R                  " UR                  S5      SSS9n/ n[	        X#5       H  u  pVUR                  [         R                  5      n[         R                  " USUR                  S5      S9R                  S5      n[         R                  " USS9nXx-
  n	UR                  U	5        M      [         R                  " U5      nUR                  U R                  S   U R                  S   45      nU$ )N   r   chunksdimr   indexr      )rU   chunkreshapeshapeziptofloat32r-   	unsqueezesqueeze	logsumexpappendconcat)
logitsr   chunked_logitschunked_indexall_per_token_logpschunk_logitschunk_indexselected_logitslogsumexp_valuesper_token_logpss
             rm   chunked_selective_log_softmaxr   E   s    [[FLL4D!EPQYZ[N[[r!2QaHM%(%G!#u}}5,,|2{G\G\]_G`aiijlm ??<rB)<""?3 &H 	,,':;-55v||AUV6XYrp   	input_idslogits_to_keeppad_token_idreturnc                 ~    XR                   S   :  a  [        S5      eU SS2SU* 24   nX2:H  nUR                  SS9nU$ )zr
Given prompt tensor, it returns all the left padded tokens in that sequence. so [pad, pad, pad, cat] = 3 tokens 
r   z8logits_to_keep must be smaller than the sequence length.Nr   )r   
ValueErrorsum)r   r   r   prompt_sectionpadding_maskpad_token_countss         rm   calculate_pad_tokens_in_promptr   W   sX     ++STTq"2N?"223N"2L#''A'.rp   completion_input_idsleft_pad_tokens_per_promptmax_left_padc                     U R                   u  pEU R                  nX!-
  n[        R                  " XVS9R	                  S5      nXR	                  S5      :  n	X:g  n
X-  nU$ )a)  
Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad]

Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens
and pad are pad tokens, this function would make a completion mask that would 0 out the pad
and p tokens. so in this example [0,0,0,1,1,1,0,0,0]
devicer   r   )r   r   rU   aranger   )r   r   r   r   
batch_sizecompletion_lenr   num_tokens_to_maskindices
shift_masknon_padding_mask
final_masks               rm    create_completion_attention_maskr   j   si     "6!;!;J!((F%Bll>9CCAFG88;;J,<.Jrp   tensorpad_idc                 l    X:g  n[         R                  " USSSS9n[         R                  " U SU5      nU$ )zD
Moves all padding tokens in each sequence of a batch to the right.
r   T)r   
descendingstable)rU   argsortr-   )r   r   masksorted_indicespacked_tensors        rm   left_pack_paddingr      s8     D]]4Q4MNLLN;Mrp   c                 
  ^^$ UR                  SS5      nUR                  SS5      n	UR                  SS5      n
UR                  SS5      nUR                  SS 5      nUR                  S	S
5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  S5      nUS:w  a  X-  nUS:w  a  X-  nUS:w  a  U[        R                  " UU-  5      -  nUR	                  [        R
                  5      nUS
:w  a  X-  n[        R                  " USUS9R                  S5      nU[        R                  " USS9-
  n[        R                  " 5          US:w  a  U c   S5       eUS:w  a  X-  n US:w  a  X-  n US:w  a  U [        R                  " U U-  5      -  n U R	                  [        R
                  5      n US
:w  a  X-  n [        R                  " U SUS9R                  S5      nU[        R                  " U SS9-
  n Ub  US:w  a  X.-  nUS:w  a  X/-  nUS:w  a  U[        R                  " UU-  5      -  nUR	                  [        R
                  5      nUS
:w  a  X--  n[        R                  " USUS9R                  S5      nU[        R                  " USS9-
  n S S S 5         US:w  a#  [        R                  " WU-
  5      UU-
  -
  S
-
  nO>US:X  a"  UR                  UR                  S5      S5      nO[        R                  " U5      nUb  UW-
  nOUUR                  5       -
  nUS:X  a  UnOZUS:X  aE  UT-  R                  S5      TR                  S5      R!                  S
S9-  nUR                  S5      nO[#        SU S35      e[        R                  " U5      n[        R                   " USU	-
  SU
-   5      nUb(  [        R                   " UUS9UR                  S5      -  nOUUR                  S5      -  n UUR                  S5      -  n[        R$                  " UU5      * nUS:w  a  UUU-  -   nTR	                  [        R
                  5      mTR                  S5      m$US:X  aB  UT-  R                  S5      TR                  S5      R!                  S
S9-  R'                  5       n OuUS:X  a2  UT-  R                  5       TR                  5       R!                  S
S9-  n O=US:X  a)  UT-  R                  5       UR                  S5      U-  -  n O[#        SU 35      eUU$4S jn!U!" U5      u  n"n#U U"U#4$ ! , (       d  f       GN= f) N	loss_typegrpoepsilon_low皙?epsilon_highmax_completion_length    deltatemperature      ?logit_scale_multiply        logit_scale_dividelogit_softcappingimportance_sampling_leveltokenr{   r   r   r   .ref_logits should not be None when beta != 0.0sequencer   min#Unknown importance sampling level: -. Possible values are 'token' and 'sequence'.maxbnpodr_grpoUnknown loss type: c                 2  > [         R                  " 5          TR                  5       nU R                  S   S:X  a  XR                  5       4sS S S 5        $ U T-  R	                  S5      T-  nUR                  5       nX4sS S S 5        $ ! , (       d  f       g = fNr   rU   inference_modemeanr   r   xcompletion_lengthmean_kl_per_rewardmean_klr   n_mask_per_rewards       rm   masked_batch_mean,grpo_compute_loss.<locals>.masked_batch_mean  |    !!# 1 6 6 8wwqzQ(&&(2 $#
 '($h^^A%69J%J",113(1 $##   4B)B
Bgetr   rU   tanhr   r   r-   r   r   no_gradexp	new_zerossize
zeros_likedetachr   clampr   r   r   %
ref_logits
new_logits
old_logitsr   r   beta
advantagesrj   r   r   r   r   r   r   r   r   r   r   new_xnewref_xrefold_xoldkl_i	log_ratiolog_importance_weightscoef_1coef_2loss_1loss_2loss_ilossr   r   r   r   s%       `                               @rm   grpo_compute_lossr     sR    

;/I**]C0K::nc2L"JJ'>EJJw%E**]C0K!::&<cB!::&:C@!::&93? &

+F P##B'I q z/P*q z/N*q zEJJzTeGe<f/f*u}}-Jc
(@:LL2yAII"ME
%//*B7
7C	3;)[+[[) $q(z7X*!q(z7V* q(zEJJz\mOmDn7n*#u}}5Jc!
0H:LL2yIQQRTUE%//*B??C!#q(z7X*!q(z7V* q(zEJJz\mOmDn7n*#u}}5Jc!
0H:LL2yIQQRTUE%//*B??C7 
8 	 s{yys#sSy1C7 %
2==!a0D##C(D #I	#**,&	 G+!*	"j	0"+d"2!7!7!;dhhrl>P>PUX>P>Y!Y!7!A!A"!E12K1L M 
 	

 ii./F[[[!l2BCFV/*2F2Fq2II*..q11 j**1--Fii''Fs{$+%775==!D F$##B'$((2,*<*<*<*EEKKM	f	""$txxz'7'7C'7'@@	i	""$A9N(NO.yk:;;
2 "34!8w"G++W 
   9EU((
U7c                   :    \ rS rSr\SS j5       r \S 5       rSrg)UnslothEfficientGRPOi  Nc                   ^^^^^^ ^! Tc  0 mUUU4S jm! UR                   n[        R                  " U5      n[        R                  " SUS9m[        R                  " SUS9m[        R                  " SUS9m UUU U!4S jn [        R                  " USS[
        S9n[        R                  " XSS9n[        R                  " XSS9nUb  [        R                  " X*SS9nOS /U
-  nUb  [        R                  " X:SS9nOS /U
-  n[        R                  " XZSS9n[        R                  " XjSS9n[        R                  " XzSS9nU	b  U	R                  5       OS	n[        UUUUUUU5       H  u  nnnnnnnU" UUUUUUUU5        M      UR                  U
5        TR                  U
5        TR                  U
5        T R                  U
5        U R                  U5        TTT 4$ )
Nc           	        > [         R                  " U R                  TR                  5      TR	                  5       5      nUS S 2S S2S S 24   n[         R
                  " 5          TS:w  aM  [         R                  " UR                  TR                  5      TR	                  5       5      nUS S 2S S2S S 24   nOS nUbM  [         R                  " UR                  TR                  5      TR	                  5       5      n	U	S S 2S S2S S 24   n	OS n	S S S 5        [        WUW	UUTU40 TD6u  pnX-  nXR                  5       X44$ ! , (       d  f       N:= f)Nr{   r   )rU   matmulr   dtypetr   r  r   )new_hidden_statesold_hidden_statesref_hidden_statesr   r   r   scalingr   r   r   r  r   r   scaled_lossr   extra_kwargslm_heads                 rm   compute_loss2UnslothEfficientGRPO.forward.<locals>.compute_loss%  s5   &7&:&:7==&I799;WJ#AssAI.J3;!&.?.B.B7==.QSZS\S\S^!_J!+AssAI!6J!%J$0!&.?.B.B7==.QSZS\S\S^!_J!+AssAI!6J!%J !" 0A	0 	0,DW .K0A LLL? !s   #B(D??
Er   r   c           	         > [         R                  R                  TSSS9" XX#XEU5      u  u  nu  n	u  pnTR                  U
5        TR                  U5        TR                  U5        XS S & g )N)r   T)argnumshas_aux)rU   funcgrad_and_valueadd_)new_hidden_states_jold_hidden_states_jref_hidden_states_jinput_ids_jmask_jadvantages_jr  grad_inputs_jchunk_grad_input
chunk_lossunscaled_losschunk_completion_lengthchunk_mean_klaccumulated_completion_lengthaccumulated_lossaccumulated_mean_klr  s                rm   accumulate_chunk6UnslothEfficientGRPO.forward.<locals>.accumulate_chunkP  s     kpjtjt  kD  kD kD k "8KZ`pw	kyg!g*.f}Wd
 ..}=)../FG..}=/!rp   T)rx   rw   ry   r   r}   r   )r   rU   
empty_likezeroscompiletorch_compile_optionsr   	get_scaler   div_save_for_backward)"ctx_new_hidden_states_old_hidden_states_ref_hidden_statesr  
_input_ids_mask_advantagesr   scalern_chunksr  r   grad_inputsr.  grad_inputs_chunksr  r  r  r   r   r   r  r%  r  r   r!  r"  r#  r$  r+  r,  r-  r  s"       `   `  `                  @@@@rm   forwardUnslothEfficientGRPO.forward!  s   L"	MF 	"))&&'9:(-A(G(-A(G%(-A(G	0 	0& 	 ==+
 #[[VWX"[[);VWX)!&-?Z[!\!% 1)!&-?Z[!\!% 1"[[VWX	"[[VWX"[[VWX
 )/(:&""$ "$57HJ[]fhlnxy @]/1DFY\gioq} ###	 z* 	**84**84%**84**84k*)
 	
rp   c                 8    U R                   u  nUS S S S S S S S S S 4$ N)saved_tensors)r7  grad_outputdcompletion_lengthdmean_kl
grad_inputs        rm   backwardUnslothEfficientGRPO.backward  s-    ))D$dD$dDRVWWrp    )Nr   N)__name__
__module____qualname____firstlineno__staticmethodrB  rK  __static_attributes__rM  rp   rm   r
  r
    s5    @
 @
B 	X X 	rp   r
  c	                 N   UR                   u  pU	R                  SS 5      nU	R                  SS 5      nU	R                  SS 5      nU	R                  SS 5      n[        SU
S-   5       Vs/ s H  nU
U-  S:X  d  M  UPM     nnUS:X  a  U
nU[        [        R
                  " UU5      [        U5      S-
  5         n[        U S5      (       dt  [        R                  R                  S	S
5      S
:X  a  [        R                  O[        R                  U l        [        R                  R                  SS5      S:X  a  S U l         S[        R                  S'   U R                  R                  5       R                   nUc  [#        XU R$                  R&                  5      n[)        U5      R+                  5       n[-        XR$                  R&                  5      nUS S 2UU-   * S 24   n[/        UUUU R$                  R&                  5      R1                  UR2                  5      nXR$                  R&                  :g  nUR1                  UR2                  5      nOUS S 2U* S 24   nU R4                  R7                  U R                  SS9nUR9                  5        HD  n[        US5      (       d  M  [        UR:                  S5      (       d  M3  SUR:                  l        MF      U R                  c  [?        5       nOF[        R@                  RC                  U R                  RD                  RF                  U R                  S9nU   Uc[  U" UUUUUUS9RH                  nUS S 2UW-   S-   * S 2S S 24   nUb  US S 2UU-   S-   * S 2S S 24   nUb  US S 2UU-   S-   * S 2S S 24   nOU" UUUUUUUS-   S9RH                  nS S S 5        [J        RM                  WUUUUUUU RN                  U R4                  RP                  UU	5      u  nnnS[        R                  S'   UUU4$ s  snf ! , (       d  f       Nl= f)Npixel_valuesimage_grid_thwpixel_attention_maskimage_sizesr   r   r{   _autocast_dtypeACCELERATE_MIXED_PRECISIONfp16UNSLOTH_FORCE_FLOAT3201UNSLOTH_RETURN_HIDDEN_STATESFkeep_fp32_wrapper_hf_hookio_same_decicedevice_typer  r   attention_maskrU  rV  rW  rX  r   rg  rU  rV  rW  rX  r   ),r   r   ranger   npsearchsortedlenrg   rC   environrU   float16bfloat16rY  rd   get_output_embeddingsweightr   processing_classr   r   itemr   r   r   r  acceleratorunwrap_modelmodulesrb  rc  rB   ampautocastr   typer   r
  applyr   r>  r  r  r  )trainerr   rg  r   completion_maskr   r  r  r?  rj   bszqlenrU  rV  rW  rX  ifactorsr  r   r   r   unwrapped_modelmodule
autocasterr  r  r   r   r   r   s                                  rm   grpo_accumulated_lossr    sF    IC::nT2LZZ 06N!::&<TB**]40K37+<+QsQw!|q+G<2~#xs2??7H=s7|A~NOH7-..35::>>B^`f3gkq3q%--w|  xF  xF::>>1373>Z^@W14BJJ-.mm113::G%CI_f_w_w  `E  `E  &F"56;;=%i1I1I1V1VW	(nl.J,K,L)LM:;OQkmy  |C  |T  |T  |a  |a  b  e  e  ft  fz  fz  {#'?'?'L'LL'**>+?+?@(^O,<)<=))66w}}Z_6`O "))+6:&&76??DT+U+U-2FOO* , 	 & ]
YY''gmm6J6J6O6OY`YpYp'q
	 /%!/+!/';)! f  !2!~|7STU7U5V5WZ[2[ \ ,$5a><;WXY;Y9Z9[^_6_$`! ,$5a><;WXY;Y9Z9[^_6_$`! /%!/+!/';)!/!!3! f ' 
: (<'A'A""($D
W 25BJJ-."G++m =P 
s   (P9P8A8P
P$c                 
  ^^$ UR                  SS5      nUR                  SS5      n	UR                  SS5      n
UR                  SS5      nUR                  SS 5      nUR                  S	S
5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  S5      nUS:w  a  X-  nUS:w  a  X-  nUS:w  a  U[        R                  " UU-  5      -  nUR	                  [        R
                  5      nUS
:w  a  X-  n[        R                  " USUS9R                  S5      nU[        R                  " USS9-
  n[        R                  " 5          US:w  a  U c   S5       eUS:w  a  X-  n US:w  a  X-  n US:w  a  U [        R                  " U U-  5      -  n U R	                  [        R
                  5      n US
:w  a  X-  n [        R                  " U SUS9R                  S5      nU[        R                  " U SS9-
  n Ub  US:w  a  X.-  nUS:w  a  X/-  nUS:w  a  U[        R                  " UU-  5      -  nUR	                  [        R
                  5      nUS
:w  a  X--  n[        R                  " USUS9R                  S5      nU[        R                  " USS9-
  n S S S 5         US:w  a#  [        R                  " WU-
  5      UU-
  -
  S
-
  nO>US:X  a"  UR                  UR                  S5      S5      nO[        R                  " U5      nUb  UW-
  nOUUR                  5       -
  nUS:X  a  UnOZUS:X  aE  UT-  R                  S5      TR                  S5      R!                  S
S9-  nUR                  S5      nO[#        SU S35      e[        R                  " U5      n[        R                   " USU	-
  SU
-   5      nUb(  [        R                   " UUS9UR                  S5      -  nOUUR                  S5      -  n UUR                  S5      -  n[        R$                  " UU5      * nUS:w  a  UUU-  -   nTR	                  [        R
                  5      mTR                  S5      m$US:X  aB  UT-  R                  S5      TR                  S5      R!                  S
S9-  R'                  5       n OuUS:X  a2  UT-  R                  5       TR                  5       R!                  S
S9-  n O=US:X  a)  UT-  R                  5       UR                  S5      U-  -  n O[#        SU 35      eUU$4S jn!U!" U5      u  n"n#U U"U#4$ ! , (       d  f       GN= f) Nr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r{   r   r   r   r   r   r   r   r   r   r   r   r   r   c                 2  > [         R                  " 5          TR                  5       nU R                  S   S:X  a  XR                  5       4sS S S 5        $ U T-  R	                  S5      T-  nUR                  5       nX4sS S S 5        $ ! , (       d  f       g = fr   r   r   s       rm   r   1grpo_compute_loss_slow.<locals>.masked_batch_mean  r   r   r   r   s%       `                               @rm   grpo_compute_loss_slowr  &  sR    

;/I**]C0K::nc2L"JJ'>EJJw%E**]C0K!::&<cB!::&:C@!::&93? &

+F P##B'I q z/P*q z/N*q zEJJzTeGe<f/f*u}}-Jc
(@:LL2yAII"ME
%//*B7
7C	3;)[+[[) $q(z7X*!q(z7V* q(zEJJz\mOmDn7n*#u}}5Jc!
0H:LL2yIQQRTUE%//*B??C!#q(z7X*!q(z7V* q(zEJJz\mOmDn7n*#u}}5Jc!
0H:LL2yIQQRTUE%//*B??C7 
8 	 s{yys#sSy1C7 %
2==!a0D##C(D #I	#**,&	 G+!*	"j	0"+d"2!7!7!;dhhrl>P>PUX>P>Y!Y!7!A!A"!E12K1L M 
 	

 ii./F[[[!l2BCFV/*2F2Fq2II*..q11 j**1--Fii''Fs{$+%775==!D F$##B'$((2,*<*<*<*EEKKM	f	""$txxz'7'7C'7'@@	i	""$A9N(NO.yk:;;
2 "34!8w"G++W 
r  c                  .    SSK Jn  U" S0 U D6nXl        U$ )Nr   )SamplingParamsrM  )vllmr  _set_kwargs)rj   r  sampling_paramss      rm   vLLMSamplingParamsr    s    #$.v.O"(rp   c                     ^  \ rS rSr% Sr\" SSS0S9r\\   \	S'   \" SSS	0S9r
\\   \	S
'   SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS S SSSSS!S"SSSSSSSS#SSSSSSSSSSSSSS#SSSSSSS$S%SSSS&SSSSSSSSSSSS'SSSSSSSSS"SSSS#SSSS(S)SSSSSSSSSSSSSSSS*S+S,SSSSSSSS0 SSSSS-S.SSSS/S0S1S2SS3SS4SSS5SS6S7SSS8S*SSSS9SSSSS4U 4S: jjrS;rU =r$ )<UnslothGRPOConfigi  u&@  
    
Configuration class for the [`GRPOTrainer`].

This class includes only the parameters that are specific to GRPO training. For a full list of training arguments,
please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
differ from those in [`~transformers.TrainingArguments`].

Using [`~transformers.HfArgumentParser`] we can turn this class into
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
command line.

Parameters:
    > Parameters that control the model and reference model

    model_init_kwargs (`str`, `dict[str, Any]` or `None`, *optional*, defaults to `None`):
        Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model`
        argument of the [`GRPOTrainer`] is provided as a string.
    disable_dropout (`bool`, *optional*, defaults to `False`):
        Whether to disable dropout in the model. This is useful for training with a reference model, as it prevents
        the model from generating different logprobs for the same input.

    > Parameters that control the data preprocessing

    remove_unused_columns (`bool`, *optional*, defaults to `False`):
        Whether to only keep the column `"prompt"` in the dataset. If you use a custom reward function that
        requires any column other than `"prompts"` and `"completions"`, you should keep this to `False`.
    max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
        Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left.
    num_generations (`int` or `None`, *optional*, defaults to `8`):
        Number of generations per prompt to sample. The effective batch size (num_processes * per_device_batch_size
        * gradient_accumulation_steps) must be evenly divisible by this value.
    max_completion_length (`int` or `None`, *optional*, defaults to `256`):
        Maximum length of the generated completion.
    ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
        This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
        improving generation speed. However, disabling this option allows training models that exceed the VRAM
        capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible
        with vLLM generation.
    shuffle_dataset (`bool`, *optional*, defaults to `True`):
        Whether to shuffle the training dataset.

    > Parameters that control generation

    generation_batch_size: (`int` or `None`, *optional*, defaults to `None`):
        Batch size to use for generation. If `None`, it defaults to the effective training batch size:
        `per_device_train_batch_size * num_processes * steps_per_generation`. In other words, there is one
        generation batch processed per optimization step. Mutually exclusive with `steps_per_generation`.
    steps_per_generation: (`int` or `None`, *optional*, defaults to `None`):
        Number of steps per generation. If `None`, it defaults to `gradient_accumulation_steps`. Mutually exclusive
        with `generation_batch_size`.
    temperature (`float`, defaults to `1.0`):
        Temperature for sampling. The higher the temperature, the more random the completions.
    top_p (`float`, *optional*, defaults to `1.0`):
        Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to
        `1.0` to consider all tokens.
    top_k (`int` or `None`, *optional*, defaults to `None`):
        Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is
        disabled and all tokens are considered.
    min_p (`float` or `None`, *optional*, defaults to `None`):
        Minimum token probability, which will be scaled by the probability of the most likely token. It must be a
        value between `0.0` and `1.0`. Typical values are in the `0.01-0.2` range.
    repetition_penalty (`float`, *optional*, defaults to `1.0`):
        Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far.
        Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat
        tokens.
    use_transformers_paged (`bool`, *optional*, defaults to `False`):
        Whether to use the `transformers` paged implementation for generation. If set to `True`, the `transformers`
        paged implementation will be used for generation instead of the default padded implementation. This
        parameter is only effective when `use_vllm` is set to `False`.
    cache_implementation (`str` or `None`, *optional*, defaults to `None`):
        Implementation of the cache method for faster generation when `use_vllm` is set to `False`.
    generation_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
        Additional keyword arguments to pass to `GenerationConfig` (if using transformers) or `SamplingParams` (if
        using vLLM) when sampling completions. This can be used to further customize the generation behavior, such
        as setting `suppress_tokens`, `num_beams`, etc. If it contains keys that conflict with the other generation
        parameters (like `min_p`, `top_p`, etc.), they will override them.

    > Parameters that control generation acceleration powered by vLLM

    use_vllm (`bool`, *optional*, defaults to `False`):
        Whether to use vLLM for generating completions. If set to `True`, the trainer will use vLLM for generation
        instead of the default model.generate(). Requires `vllm` to be installed.
    vllm_mode (`str`, *optional*, defaults to `"server"`):
        Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `"server"` or
        `"colocate"`.

        - `"server"`: The trainer will send generation requests to a separate vLLM server. Make sure a TRL vLLM
          server is running (start with `trl vllm-serve`).
        - `"colocate"`: vLLM will run in the same process and share the training GPUs. This avoids the need for a
          separate server but may cause resource contention with training.
    vllm_model_impl (`str`, *optional*, defaults to `"vllm"`):
        Model implementation to use for vLLM. Must be one of `"transformers"` or `"vllm"`. `"transformers"`: Use
        the `transformers` backend for model implementation. `"vllm"`: Use the `vllm` library for model
        implementation.
    vllm_guided_decoding_regex (`str` or `None`, *optional*, defaults to `None`):
        Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled.

    > Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`)

    vllm_server_base_url (`str` or `None`, *optional*, defaults to `None`):
        Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `vllm_server_host` and
        `vllm_server_port` are ignored.
    vllm_server_host (`str`, *optional*, defaults to `"0.0.0.0"`):
        Host of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
    vllm_server_port (`int`, *optional*, defaults to `8000`):
        Port of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
    vllm_server_timeout (`float`, *optional*, defaults to `240.0`):
        Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up after the
        timeout, a `ConnectionError` is raised.

    > Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`)

    vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.3`):
        Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set to
        `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
        launching the vLLM server via the `--vllm_gpu_memory_utilization` flag.
    vllm_tensor_parallel_size (`int`, *optional*, defaults to `1`):
        Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set to
        `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
        launching the vLLM server via the `--vllm_tensor_parallel_size` flag.
    vllm_enable_sleep_mode (`bool`, *optional*, defaults to `False`):
        Whether to enable sleep mode for vLLM. If `True`, vLLM will sleep during the optimization step and woken
        for weight sync and generation.

    > Parameters that control the training

    beta (`float`, *optional*, defaults to `0.0`):
        KL coefficient. If `0.0` (default), the reference model is not loaded, reducing memory usage and improving
        training speed.
    num_iterations (`int`, *optional*, defaults to `1`):
        Number of iterations per batch (denoted as μ in the algorithm).
    epsilon (`float`, *optional*, defaults to `0.2`):
        Epsilon value for clipping.
    delta (`float` or `None`, *optional*, defaults to `None`):
        Enables the upper clipping bound in two-sided GRPO loss when set to a float. If `None` (default), standard
        GRPO clipping is used. Recommended to be greater than `1 + ε` when enabled. This method is introduced in
        the [INTELLECT-2 tech report](https://huggingface.co/papers/2505.07291).
    epsilon_high (`float` or `None`, *optional*, defaults to `None`):
        Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the lower-bound
        specified in argument `epsilon`. Paper [DAPO](https://huggingface.co/papers/2503.14476) recommends `0.28`.
    importance_sampling_level (`str`, *optional*, defaults to `"token"`):
        Controls whether importance sampling ratios are computed at the `"token"` or `"sequence"` level. `"token"`
        keeps the raw per-token log-probability ratios (one weight per token). `"sequence"` averages the
        log-probability ratios across valid tokens to produce a single ratio per sequence. The [GSPO
        paper](https://huggingface.co/papers/2507.18071) shows that sequence-level sampling often yields more
        stable training and better alignment with sequence-level rewards.
    reward_weights (`list[float]` or `None`, *optional*, defaults to `None`):
        Weights for each reward function. Must match the number of reward functions. If `None`, all rewards are
        weighted equally with weight `1.0`.
    scale_rewards (`str` or `bool`, *optional*, defaults to `"group"`):
        Specifies the scaling strategy for rewards. Supported values are:

        - `True` or `"group"` (default): rewards are scaled by the standard deviation within each group, ensuring
          unit variance within a group.
        - `"batch"`: rewards are scaled by the standard deviation across the entire batch, as recommended in the
          [PPO Lite paper](https://huggingface.co/papers/2508.08221).
        - `False` or `"none"`: no scaling is applied. The [Dr. GRPO
          paper](https://huggingface.co/papers/2503.20783) recommends not scaling rewards, as scaling by the
          standard deviation introduces a question-level difficulty bias.
    loss_type (`str`, *optional*, defaults to `"dapo"`):
        Specifies the loss formulation to use. Supported values are:

        - `"grpo"`: Aggregates token-level losses by normalizing over sequence length. Not recommended due to
          length bias—this approach tends to prefer shorter completions with positive advantages and longer ones
          with negative advantages.
        - `"dr_grpo"`: Aggregates token-level losses by normalizing with a global constant. This method was
          introduced in the [Dr. GRPO paper](https://huggingface.co/papers/2503.20783) to eliminate length bias.
          The value of the constant corresponds to `max_completion_length`.
        - `"dapo"` (default): Aggregates token-level losses by normalizing with the number of active token in the
          global accumulated batch. This method was introduced in the [DAPO
          paper](https://huggingface.co/papers/2503.14476) to eliminate length bias.
        - `"bnpo"`: Aggregates token-level losses by normalizing with the number of active token in the local
          batch. Note that normalization is performed over the local batch only, so results may slightly vary
          depending on the local batch size, despite a constant effective batch size. When using
          `per_device_train_batch_size==1`, the loss is equivalent to the GRPO loss.
    mask_truncated_completions (`bool`, *optional*, defaults to `False`):
        When enabled, truncated completions are excluded from the loss calculation, preventing them from being
        incorrectly penalized and introducing noise during training. According to the
        [DAPO](https://huggingface.co/papers/2503.14476) paper, this is a good practice for training stability.
    sync_ref_model (`bool`, *optional*, defaults to `False`):
        Whether to synchronize the reference model with the active model every `ref_model_sync_steps` steps, using
        the `ref_model_mixup_alpha` parameter. This synchronization originates from the
        [TR-DPO](https://huggingface.co/papers/2404.09656) paper.
    ref_model_mixup_alpha (`float`, *optional*, defaults to `0.6`):
        α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix
        between the current policy and the previous reference policy during updates. The reference policy is
        updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you
        must set `sync_ref_model=True`.
    ref_model_sync_steps (`int`, *optional*, defaults to `512`):
        τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how
        frequently the current policy is synchronized with the reference policy. To use this parameter, you must
        set `sync_ref_model=True`.
    top_entropy_quantile (`float`, *optional*, defaults to `1.0`):
        ρ parameter from [Beyond the 80/20 Rule](https://huggingface.co/papers/2506.01939). Keeps in the policy
        loss term only the top-ρ quantile of tokens by entropy of the probability distribution at each sequence
        position, improving results. Range: `[0.0-1.0]`. A value of `0.0` masks all but the highest entropy token;
        `1.0` keeps all tokens. The paper recommends a value of `0.2`. If used with
        `mask_truncated_completions=True`, only tokens from non-truncated completions are considered.
    use_liger_loss (`bool`, *optional*, defaults to `False`):
        Whether to use the Liger GRPO loss.
    vllm_importance_sampling_correction (`bool`, *optional*, defaults to `True`):
        Whether to apply Truncated Importance Sampling (TIS) between vLLM completion logprobs and recomputed
        logprobs. [Your Efficient RL Framework Secretly Brings You Off-Policy RL
        Training](https://fengyao.notion.site/off-policy-rl) highlights that using a separate generation framework
        (such as vLLM) can introduce off-policy effects due to subtle implementation differences between generation
        and training backends. TIS is proposed as a remedy for this issue.
    vllm_importance_sampling_cap (`float`, *optional*, defaults to `2.0`):
        Truncation parameter C for Truncated Importance Sampling (TIS). This sets an upper bound on the importance
        sampling ratio, improving training stability.

    > Parameters that control the logging

    log_completions (`bool`, *optional*, defaults to `False`):
        Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is installed,
        it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`.
    num_completions_to_print (`int` or `None`, *optional*, defaults to `None`):
        Number of completions to print with `rich`. If `None`, all completions are logged.
    wandb_log_unique_prompts (`bool`, *optional*, defaults to `False`):
        Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, all prompts
        are logged.

    NhelpzvLLM SamplingParams)defaultmetadatavllm_sampling_paramsr{   z8Chunk size to reduce memory usage. -1 is most efficient.unsloth_num_chunksFnor|      r      g-C6
?g{Gz?g?g+?g:0yE>r   g      @linear皙?passivewarningTstepsr     iO  O1auto r   
adamw_8bitlength
every_savelasti           colocater  z0.0.0.0i@  g      n@g333333?gMbP?r   r   groupr   g333333?g       @c                   > US:  a  [        SU S35        US:  a  [        SU S35        Uc  U#S:X  a
  U$S:X  a  SnS	n#WR                  5       S
:X  a  S
nOWR                  5       S:X  a  SnWR                  5       S
:X  a  WS :X  a  SnOcWS:X  a  [        S5        SnOOWR                  5       S:X  a;  WS:w  a  [        S5        WS:w  a  [        S5        WS:w  a  [        S5        SnSnSnSnUW-  U-  U:w  a(  [        S[        U5      -   S-   [        W5      -   5        UnWS::  a  [        S5      eWS:  a  [        S5      e[        TU ]  " S0 SU_SU_SU_SU_SU_S U_S!U_S"U_S#U	_S$U
_S%U_S&U_S'U_S(U_S)U_S*U_S+U_S,U_S-U_S.U_S/U_S0U_S1U_S2U_S3U_S4U_S5U_S6U_S7U_S8U_S9U_S:U _S;U!_S<U"_S=U#_S>U$_S?U%_S@U&_SAU'_SBU(_SCU)_SDU*_SEU+_SFU,_SGU-_SHU._SIU/_SJU0_SKU1_SLU2_SMU3_SNU4_SOU5_SPU6_SQU7_SRU8_SSU9_STU:_SUU;_SVU<_SWU=_SXU>_SYU?_SZW@_S[WA_S\WB_S]WC_S^WD_S_WE_S`WF_SaWG_SbWH_ScWI_SdWJ_SeWK_SfWL_SgWM_ShWN_SiWO_SjWP_SkWQ_SlWR_SmWS_SnWT_SoWU_SpWV_SqWW_SrWX_SsWY_StWZ_SuW[_SvW\_SwW]_SxW^_SyW__SzW`_S{Wa_S|Wb_S}Wc_S~Wd_SWe_SWf_SWg_SWh_SWi_SWj_SWk_SWl_SWm_SWn_SWo_SWp_SWq_SWr_SWs_SWt_SWu_SWv_SWw_SWx_SWy_SWz_SW{_SW|_SW}_SW~_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_WD6  WU l        WU l        g )NgHz>z Unsloth: Your learning rate of `zi` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!r   za` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!r  r  unsloth_training_checkpointsr  r   dapoTzwUnsloth: The Dr GRPO paper recommends setting `scale_rewards` to False! Will override. Set it to `None` to force False.FzFUnsloth: The DAPO paper recommends `mask_truncated_completions = True`gQ?z8Unsloth: The DAPO paper recommends `epsilon_high = 0.28`r   zMUnsloth: The DAPO paper recommends setting `beta = 0.0` to remove the KL termr   z}Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of z to the `num_generations` of r   zUUnsloth: Please set a positive non-zero temperature since your results will be wrong.
   zgUnsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.
output_diroverwrite_output_dirdo_traindo_eval
do_predicteval_strategyprediction_loss_onlyper_device_train_batch_sizeper_device_eval_batch_sizeper_gpu_train_batch_sizeper_gpu_eval_batch_sizegradient_accumulation_stepseval_accumulation_steps
eval_delaytorch_empty_cache_stepslearning_rateweight_decay
adam_beta1
adam_beta2adam_epsilonmax_grad_normnum_train_epochs	max_stepslr_scheduler_typewarmup_ratiowarmup_steps	log_levellog_level_replicalog_on_each_nodelogging_dirlogging_strategylogging_first_steplogging_stepslogging_nan_inf_filtersave_strategy
save_stepssave_total_limitsave_safetensorssave_on_each_nodesave_only_model'restore_callback_states_from_checkpointno_cudause_cpuuse_mps_deviceseed	data_seedjit_mode_evaluse_ipexbf16r[  fp16_opt_levelhalf_precision_backendbf16_full_evalfp16_full_evaltf32
local_rankddp_backendtpu_num_corestpu_metrics_debugdebugdataloader_drop_last
eval_stepsdataloader_num_workersdataloader_prefetch_factor
past_indexrun_namedisable_tqdmremove_unused_columnslabel_namesload_best_model_at_endmetric_for_best_modelgreater_is_betterignore_data_skipfsdpfsdp_min_num_paramsfsdp_config"fsdp_transformer_layer_cls_to_wrapaccelerator_configparallelism_config	deepspeedlabel_smoothing_factoroptim
optim_args	adafactorgroup_by_lengthlength_column_name	report_toddp_find_unused_parametersddp_bucket_cap_mbddp_broadcast_buffersdataloader_pin_memorydataloader_persistent_workersskip_memory_metricsuse_legacy_prediction_looppush_to_hubresume_from_checkpointhub_model_idhub_strategy	hub_tokenhub_private_repohub_always_pushhub_revisiongradient_checkpointinggradient_checkpointing_kwargsinclude_inputs_for_metricseval_do_concat_batchesfp16_backendpush_to_hub_model_idpush_to_hub_organizationpush_to_hub_tokenmp_parametersauto_find_batch_sizefull_determinismtorchdynamo	ray_scopeddp_timeouttorch_compiletorch_compile_backendtorch_compile_modeinclude_tokens_per_secondinclude_num_input_tokens_seenneftune_noise_alphaoptim_target_modulesbatch_eval_metricseval_on_startuse_liger_kernelliger_kernel_configeval_use_gather_objectaverage_tokens_across_devicesmodel_init_kwargsdisable_dropoutmax_prompt_lengthnum_generationsr   ds3_gather_for_generationshuffle_datasetgeneration_batch_sizesteps_per_generationr   top_ptop_kmin_pgeneration_kwargsrepetition_penaltyuse_transformers_pagedcache_implementationuse_vllm	vllm_modevllm_model_implvllm_enable_sleep_modevllm_guided_decoding_regexvllm_server_base_urlvllm_server_hostvllm_server_portvllm_server_timeoutvllm_gpu_memory_utilizationvllm_tensor_parallel_sizer   num_iterationsepsilonr   r   r   reward_weightsscale_rewardsr   mask_truncated_completionssync_ref_modelref_model_mixup_alpharef_model_sync_stepstop_entropy_quantileuse_liger_loss#vllm_importance_sampling_correctionvllm_importance_sampling_caplog_completionsnum_completions_to_printwandb_log_unique_promptsrM  )printlowerstr	MathErrorsuper__init__r  r  )rh   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r[  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r	  r
  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r!  r"  r#  r$  r%  r&  r'  r(  r)  r*  r+  r,  r-  r.  r/  r0  r1  r2  r   r3  r4  r5  r6  r   r7  r8  r9  r:  r;  r<  r=  r>  r?  r@  rA  rB  rC  rD  rE  rF  rG  rH  r   rI  rJ  r   r   r   rK  rL  r   rM  rN  rO  rP  rQ  rR  rS  rT  rU  rV  rW  r  r  rj   	__class__s                                                                                                                                                                                        rm   r]  UnslothGRPOConfig.__init__  sk   p 4)I-  YB  (C  "D1e&F}o  Vw  %x  y-7":zS?P7J M??	)!I__&(I??	)$ $$&  P  Q %__&()T1^_t#PQs{ef)-&LDI'?:oMQll  S  VY  Zu  Vv  v  yX  X  [^  _n  [o  o  p*9'!sttB  F  G  G 	 q	J#q	J#7q	J  q	J 	q	J
 $q	J *q	J $8q	J +Fq	J *Dq	J (@q	J '>q	J +Fq	J '>q	J $q	J '>q	J  *!q	J" (#q	J$ $%q	J& $'q	J( ()q	J* *+q	J,  0-q	J. "/q	J0 !21q	J2 (3q	J4 (5q	J6 "7q	J8 !29q	J:  0;q	J< &=q	J>  0?q	J@ "4Aq	JB *Cq	JD &<Eq	JF *Gq	JH $Iq	JJ  0Kq	JL  0Mq	JN !2Oq	JP .Qq	JR 7^Sq	JT Uq	JV Wq	JX ,Yq	JZ [q	J\ "]q	J^ *_q	J`  aq	Jb cq	Jd eq	Jf ,gq	Jh &<iq	Jj ,kq	Jl ,mq	Jn oq	Jp $qq	Jr &sq	Jt *uq	Jv !2wq	Jx yq	Jz $8{q	J| $}q	J~ &<q	J@ *DAq	JB $Cq	JD  Eq	JF (Gq	JH %:Iq	JJ &Kq	JL &<Mq	JN %:Oq	JP !2Qq	JR  0Sq	JT Uq	JV #6Wq	JX &Yq	JZ 2T[q	J\ "4]q	J^ "4_q	J` "aq	Jb &<cq	Jd eq	Jf $gq	Jh "iq	Jj .kq	Jl "4mq	Jn "oq	Jp *Dqq	Jr !2sq	Jt %:uq	Jv %:wq	Jx -Jyq	Jz #6{q	J| *D}q	J~ &q	J@ &<Aq	JB (Cq	JD (Eq	JF "Gq	JH  0Iq	JJ .Kq	JL (Mq	JN &<Oq	JP -JQq	JR *DSq	JT &<Uq	JV (Wq	JX $8Yq	JZ (@[q	J\ !2]q	J^ *_q	J` $8aq	Jb  0cq	Jd &eq	Jf "gq	Jh &iq	Jj *kq	Jl %:mq	Jn "4oq	Jp )Bqq	Jr -Jsq	Jt #6uq	Jv $8wq	Jx "4yq	Jz *{q	J|  0}q	J~ #6q	J@ &<Aq	JB -JCq	JD !2Eq	JF .Gq	JH !2Iq	JJ .Kq	JL %:Mq	JN )BOq	JP .Qq	JR %:Sq	JT $8Uq	JV &Wq	JX Yq	JZ [q	J\ ]q	J^ !2_q	J` "4aq	Jb &<cq	Jd $8eq	Jf  gq	Jh "iq	Jj .kq	Jl &<mq	Jn *Doq	Jp $8qq	Jr  0sq	Jt  0uq	Jv #6wq	Jx +Fyq	Jz )B{q	J| }q	J~ ,q	J@ Aq	JB Cq	JD (Eq	JF )BGq	JH ,Iq	JJ *Kq	JL "Mq	JN *DOq	JP ,Qq	JR %:Sq	JT $8Uq	JV $8Wq	JX ,Yq	JZ 3V[q	J\ ,H]q	J^ ._q	J` (@aq	Jb (@&cq	Jd %9!"4rp   )r  r  )rN  rO  rP  rQ  __doc__r\   r  r   r   __annotations__r  intr]  rS  __classcell__r^  s   @rm   r  r    sz   ^~ +012+(3-  */VW*#  #$&'%&#'"&&'"#"%$%""!&!27!'!$!"%) %!& $  -1!!!$%%)  $ $(-"%*!%#!%(,%*!%##' $  $!$)(-"#" "!&(,  #$( $# !&# !&%)#$#&)$%$+%* #"".3'*#'#(#iP5 P5rp   r  c                   @  ^  \ rS rSrSrSS/r        S8S\\\4   S\\	\
\	   4   S\\   S	\\\\4      S
\\\\\\\\\4   4   4      S\\\\4      S\\\\
\   4      S\\
\      S\\\R*                  R,                     \\R*                  R.                  R0                     4   S\S   4U 4S jjjrS rS rS9S\\   S\4S jjrS\4S jr\    S:S j5       r S\RB                  S\RB                  S\"S\RB                  4S jr#  S;S jr$S9S\\
\      4S jjr%S<S \&RN                  S!\4S" jjr(S \&RN                  4S# jr)S$ r*\S%\\\\RB                  \+4   4   S\\\\RB                  \+4   4   4S& j5       r,\U 4S' j5       r-S(\
\\\\RB                  \+4   4      S\\\\RB                  \+4   4   4U 4S) jjr.S* r/S=S+ jr0S, r1S9S-\\
\      4S. jjr2S9S/\\\"4   S0\\"   SS4U 4S1 jjjr3U 4S2 jr4   S>S3\\   S4\\   S5\\\
\   S4   4S6 jjr5S7r6U =r7$ )?_UnslothGRPOTraineri;  a  
Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the
paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language
Models](https://huggingface.co/papers/2402.03300).

Example:

```python
from datasets import load_dataset
from trl import GRPOTrainer

dataset = load_dataset("trl-lib/tldr", split="train")
def reward_func(completions, **kwargs):
    # Dummy reward function that rewards completions with more unique letters.
    return [float(len(set(completion))) for completion in completions]
trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",
    reward_funcs=reward_func,
    train_dataset=dataset,
)

trainer.train()
```

Args:
    model (`Union[str, PreTrainedModel]`):
        Model to be trained. Can be either:

        - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
          path to a *directory* containing model weights saved using
          [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
          using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
          `args.model_init_kwargs`.
        - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
    reward_funcs (`Union[RewardFunc, list[RewardFunc]]`):
        Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
        functions with the prompts and completions and sum the rewards. Can be either:

        - A single reward function, such as:
            - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
            path to a *directory* containing model weights saved using
            [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
            using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the
            keyword arguments in `args.model_init_kwargs`.
            - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
            - A custom reward function: The function is provided with the prompts and the generated completions,
              plus any additional columns in the dataset. It should return a list of rewards. Custom reward
              functions can also return `None` when the reward is not applicable to those samples. This is useful
              for multi-task training where different reward functions apply to different types of samples. When a
              reward function returns `None` for a sample, that reward function is excluded from the reward
              calculation for that sample. For more details, see [Using a custom reward
              function](#using-a-custom-reward-function).

              The trainer's state is also passed to the reward function. The trainer's state is an instance of
              [`~transformers.TrainerState`] and can be accessed by accessing the `trainer_state` argument to the
              reward function's signature.
        - A list of reward functions, where each item can independently be any of the above types. Mixing different
        types within the list (e.g., a string model ID and a custom reward function) is allowed.
    args ([`GRPOConfig`], *optional*, defaults to `None`):
        Configuration for this trainer. If `None`, a default configuration is used.
    train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
        Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset is
        ignored. The format of the samples can be either:

        - [Standard](dataset_formats#standard): Each sample contains plain text.
        - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
          and content).
    eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
        Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`] or `None`, *optional*, defaults to `None`):
        Processing class used to process the data. The padding side must be set to "left". If `None`, the
        processing class is loaded from the model's name with [`~transformers.AutoProcessor.from_pretrained`]. A
        padding token, `tokenizer.pad_token`, must be set. If the processing class has not set a padding token,
        `tokenizer.eos_token` will be used as the default.
    reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
        Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:

        - A single processing class: Used when `reward_funcs` contains only one reward function.
        - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
        If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is
        `None`, the tokenizer for the model is automatically loaded using
        [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward
        functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes`
        are ignored.
    callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`):
        List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
        in [here](https://huggingface.co/docs/transformers/main_classes/callback).

        If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
        method.
    optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
        A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
        model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
    peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
        PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
trlr   Nrd   reward_funcsri   train_dataseteval_datasetrr  reward_processing_classes	callbacks
optimizerspeft_configr   c                 "  >^ [        US5      (       a0  [        TS5      (       a  [        TSS5      S:X  a  STl        STl        TcO  [	        U[
        5      (       a  UOUR                  R                  nUR                  S5      S   n[        U S35      mTR                  =(       d    0 n[	        U[
        5      (       a  UnUR                  S	5      n[	        U[        R                  5      (       d	  US
:X  d  Uc  O9[	        U[
        5      (       a  [        [        U5      nXS	'   O[        SU S35      e[        R                   " U5      n[        ["        UR$                  S   5      nUR                   " U40 UD6nO9UR                  R                  nTR                  b  [&        R(                  " S5        [        US5      (       d8  [*        R,                  " UR.                  5      R0                  R3                  5       OE[*        R,                  " UR5                  5       R.                  5      R0                  R3                  5       U l         Uc*  [:        R                   " UR                  R                  5      n[	        U[<        5      (       a  UR>                  nO#[	        U[@        5      (       a  UnO[C        S5      eURD                  c  URF                  Ul"        URD                  U l"        URH                  U l$        URJ                  U l%        [        USS 5      U l&        [        USS 5      U l'        [        UR                  SS 5      U l(        [        UR                  SS 5      U l)        [	        U[T        5      (       d  U/n/ U l+        [Y        U5       H  u  nn[	        U[
        5      (       a  [Z        R                   " U4SS0UD6UU'   [	        UU   [\        R^                  5      (       aF  U RV                  Ra                  UU   R                  R                  R                  S5      S   5        M  U RV                  Ra                  UU   Rb                  5        M     X l2        TRf                  b  [i        TRf                  5      [i        U5      :w  a.  [        S[i        TRf                  5       S[i        U5       S35      e[        Rj                  " TRf                  [        Rl                  S9U l3        O1[        Rn                  " [i        U5      [        Rl                  S9U l3        Uc  S /[i        U5      -  nO[	        U[T        5      (       d  U/n[i        U5      [i        U5      :w  a$  [        S[i        U5       S[i        U5       S35      e[Y        [q        Xr5      5       H  u  nu  nn[	        U[r        5      (       d  M   Uc*  [t        R                   " UR                  R                  5      nURH                  c  URF                  Ul"        URH                  UR                  l$        UUU'   M     Xpl;        TRx                  U l<        TRz                  U l=        TR|                  U l>        TR~                  U l?        TR                  U l@        TR                  U lA        TR                  U lB        TR                  U lC        TR                  U lD        TR                  U l        TR                  U l        TR                  U lE        TR                  U lF        TR                  U lG        TR                  U lH        TR                  U lI        TR                  U lJ        TR                  U lK        TR                  U lL        TR                  U lM        TR                  U lN        U R                  (       a  U R                  S:  a  [        S5      eU R                  (       a  U R                  S :X  d  [        S!5      eTR                  U lP        [	        U[        5      (       dO  [	        U[        5      (       d:  [	        U[        5      (       a0  [        S" UR                  5        5       5      (       a  [        S#5      eTR                  U lU        TR                  U lW        TR                  b  TR                  OTR                  U lX        SU lY        S U lZ        SUR                  S$'   [        TU G]u  UT[        UUUUU	S%S&9	  TR                  U l_        U R                  S':X  a  S U l`        Oc[        U5      (       a  S U l`        OK[        R                   " U5      n[        ["        UR$                  S   5      nUR                   " U40 UD6U l`        TR                  (       a-  [        U5        U R                  b  [        U R                  5        U R                  (       a  [        5       (       d  [        S(5      e[        5       U lg        [        U R                  U R                  U R                  U R~                  U R                  S':g  U R                  U Rz                  S)9U li        [        [T        5      [        [T        5      S*.U lk        SU ll        TR                  U lm        TR                  U ln        TR                  U lo        [        TR                  S+9[        TR                  S+9[        TR                  S+9[        U4S, j5      [        TR                  S+9S-.U lr        [        TR                  SS.9  U R                  (       Ga  [        5       (       d  [        S/5      eU R                  S0:X  a  U R                  R                  (       a  TR                  b  TR                  nOS1TR                   S2TR                   3n[        UTR                  S39U l}        U R                  R                  [        R                  GR                  5       S49  GOU R                  S:X  Ga  U R                  GR                  U R                  -  S:X  d1  [        S5U R                   S6U R                  GR                   S735      eU R                  S:  a  [        GR                  GR                  G[	        U R                  GR                  U R                  -  5       Vs/ s H6  n[U        G[	        UU R                  -  US-   U R                  -  5      5      PM8     sn5      u  U l        n[        U R                  GR                  5      G[        GR                  S8'   [        U R                  GR                  5      G[        GR                  S9'   [        U R                  GR                  5      G[        GR                  S:'   G[        GR                  R                  S;S<5      G[        GR                  S;'   G[        GR                  R                  S=S>5      G[        GR                  S='   U Rx                  b'  U Rz                  b  U Rx                  U Rz                  -   nOS nUGR                  U l        U GR                  GR                  (       a  U GR                  GR                  SS?9  O[        S@U R                   SA35      eTGR                  U l        SU l        U R                  GR%                  5         OU Rz                  SURH                  UGR&                  URJ                  U R~                  U R                  U R                  U R                  U R                  TGR(                  SB.nTR                  (       a  SCUSD'   SEUSF'   SGUSH'   TGR*                  b  UGR-                  TGR*                  5        G[/        SL0 UD6U l        SU l        U GR4                  GR7                  U GR8                  5        U R                  b  U GR:                  (       a'  G[=        U R                  U R                  5      U l`        OcU GR>                  (       a'  G[A        U R                  U R                  5      U l`        O*U R                  GRC                  U R                  SSI9U l`        TGRD                  (       a/  U GRG                  G[I        U R                  U R                  SJ95        [Y        U Rd                  5       H~  u  nn[	        U[r        5      (       d  M  U GR:                  (       a&  G[=        UU R                  5      U Rd                  U'   MU  U R                  GRC                  USSSK9U Rd                  U'   M     g s  snf )MNvllm_enginer>  FTr  /r{   z-GRPOr  r  zInvalid `dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing a `torch.dtype` (e.g., 'float32'), but got .r   zYou passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. The `model_init_kwargs` will be ignored.get_base_modelzWThe `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`image_tokenimage_token_idvision_start_token_idvision_end_token_id
num_labelsr   zNumber of reward weights (z)) must match number of reward functions ()r  z)The number of reward processing classes (z-) must match the number of reward functions (z).r   zOLiger Kernels don't currently support masking token positions based on entropy.r   zwLiger Kernels currently only support token-level importance sampling. Please set`importance_sampling_level` to 'token'.c              3   B   #    U  H  n[        U[        5      v   M     g 7frE  )
isinstancer   ).0dss     rm   	<genexpr>/_UnslothGRPOTrainer.__init__.<locals>.<genexpr>N  s     6wav[]z"o7V7Vavs   z^Iterable datasets are not yet supported in GRPOTrainer. Please use a standard dataset instead.estimate_tokensz!non-None value to disable scaling)	rd   ri   data_collatorri  rj  rr  rl  rm  compute_loss_funcr   zWLiger is required to use `liger_loss` as the GRPO loss. Run `pip install liger-kernel`.)r   r   r   r   use_ref_modelr   r   )trainevalmaxlenc                  *   > [        T R                  S9$ )Nr  )r*   r5  )ri   s   rm   <lambda>._UnslothGRPOTrainer.__init__.<locals>.<lambda>  s    58R8R+Srp   )imageprompt
completionrewardsr   )device_specificzkvLLM is not available and `use_vllm` is set to True. Please install vLLM with `pip install vllm` to use it.serverzhttp://:)base_urlconnection_timeoutr   zvllm_tensor_parallel_size (z) must divide world size (z	) evenly.RANK
LOCAL_RANK
WORLD_SIZEMASTER_ADDR	localhostMASTER_PORT12345levelz6vllm_mode must be either 'server' or 'colocate', got 'z'.)max_new_tokens	do_sampler   bos_token_ideos_token_idr   r7  r8  r9  r;  r=  r  max_batch_tokensi   
num_blocks   
block_size)evaluation_mode)	ref_modelrt  )r  device_placementrM  )rg   getattrr>  r?  r|  rZ  config_name_or_pathsplitr   r/  r   rU   r  r   r   from_pretrainedrV   architecturesr;   r  r2   	signaturerB  
parameterskeysrs  model_kwarg_keysrI   r   r   	tokenizerr   	TypeError	pad_token	eos_tokenr   r  rt  ru  rv  rw  listreward_func_names	enumerater   rA   Moduler   rN  rh  rK  rl  r   r   onesr   r   r   rk  r1  r   r2  r   r7  r8  r9  r;  r<  rG  rH  rS  rT  rR  r   rL  r   rM  rQ  NotImplementedErrorr4  r   dictanyvaluesrI  rJ  r   r   _step_buffered_inputswarnings_issuedr\  r]  r1   r   r  r7   r0  r+   r6   ImportErrorr$   _forward_redirectionLigerFusedLinearGRPOLossliger_grpo_lossr)   _metrics_total_train_tokensrU  rW  rV  r*   r5  _logsrP   r  r9   rt  is_main_processrC  rD  rE  r#   rF  vllm_clientinit_communicatorcudacurrent_devicenum_processesdistributednew_subgroups_by_enumerationri  tp_groupprocess_indexrC   rm  local_process_indexrp  llmri   rA  sleeprB  guided_decoding_regex_last_loaded_stepwait_for_everyoner  r=  r:  updater   generation_configmodel_accepts_loss_kwargsrd   add_model_tags
_tag_namesis_deepspeed_enabledrF   is_fsdp_enabledrG   prepare_modelrN  add_callbackr    )rh   rd   rh  ri   ri  rj  rr  rk  rl  rm  rn  
model_namer/  model_idr  r  architecturer  r  reward_funcreward_processing_classr  _max_model_lenr:  r^  s      `                     rm   r]  _UnslothGRPOTrainer.__init__  ss    5-((WT:-F-Fj%0E9 $%DN<",UC"8"8ell>X>XJ#))#.r2JE23D !228beS!!H%))'2E%--&EME3''u--2'* BBGK 
  //9F"<1E1Ea1HIL 00O=NOE||11H%%1? 5"233 emm,77<<>""5#7#7#9#A#ABMMRRT 	  #,<<U\\=W=WX &77(22I(*ABB(Iuvv&"+"5"5I",,%22%22"#3]DI%&68H$O%,U\\;RTX%Y"#*5<<9NPT#U  ,--(>L!#'5NA{+s++"D"T"T#,-#1B#Q ,q/29955&&--l1o.D.D.R.R.X.XY\.]^`.ab&&--l1o.F.FG 6 ) *4&&'3|+<< 0T5H5H1I0J K""%l"3!4A7  #(,,t/B/B%--"XD"'**S->emm"TD %,)-\1B(B%5t<<)B(C%()S->>;C@Y<Z;[ \%%(%6$7r; 
 :C3G`Co9p5A5'+77*2.;.K.KKL^L^LlLl.m+*77?8O8Y8Y+5 3J2V2V""//F)!, :q *C& "&!7!7%)%?%?"#33++ZZ
ZZ
ZZ
"&"9"9&*&A&A#+/+K+K()-)G)G&373[3[0,0,M,M)"11!//)-)G)G&*.*I*I'$($=$=!4#<#<s#B%a  t'E'E'P%:   $33 }o66,88<..36wamatatav6w3w3w &p 
 #11<<151B1B1ND--TXT`T`
 !% 48/0"'%-! B 	 	
$ II	99!DN5!! "DN  //9F"<1E1Ea1HIL)99(XFWXDN $U+~~)(8 ,..!m  )<(=D%#;YY ,,!.. ,,"ii3...&*&@&@$D  #.d"3[=NO#$ #33(,(E(E%(,(E(E% $"<"<=4#=#=>t'A'AB"#STt'A'AB

 	D1===$&&!4 
 ~~)##3300<#'#<#<%,T-B-B,C1TEZEZD[#\'18X\XpXp'qD$$$66ejj>W>W>Y6Z:-''558V8VVZ[[$5d6T6T5U V ,,::;9F 
 11A5','8'8'U'U &+4+;+;+I+ITMkMk+k%l%l !q4+I+I'IAPQEUYUsUsKs!tu%l($DM1 &))9)9)G)G%H

6"+.t/?/?/S/S+T

<(+.t/?/?/M/M+N

<(,.JJNN=+,V

=),.JJNN=',R

=)))5$:T:T:`$($:$:T=W=W$WM$(M ,,99333HHNNN+ #YZ^ZhZhYiik!lmm)-)H)HD&%'D"..0 #'"<"<! ) 6 6 ) 6 6 ) 6 6#//&*&=&=(,(A(A! **8;!"4526!,/25!,/%%1!(()?)?@%5%J8I%JD"
 */& 	

!!$//2>>%(((!24>>4CSCS!T%%%!-dnnd>N>N!O!%!1!1!?!?`d!?!e2T^^Y]YiYijk'(9(9:NA{+77,,,+<[$JZJZ+[D%%a( ,0+;+;+I+I#TD ,J ,D%%a( ;Cs   	=AE(c                 2    U R                   c
  SS/U l         g g )Nr  r  )_signature_columns)rh   s    rm    _set_signature_columns_if_needed4_UnslothGRPOTrainer._set_signature_columns_if_needed  s"    
 ""*'/&9D# +rp   c                 x   U R                   c  [        S5      eU R                   nU R                  n[        5       (       a0  [	        U[
        R                  5      (       a  U R                  USS9nOU R                  USS9nU R                  U R                  R                  -  UU R                  R                  U R                  R                  U R                  R                  S.n[	        U[        R                   R"                  R$                  5      (       d  U R'                  5       US'   U R                  R(                  US'   [+        [,        U R                  R                  U R                  R.                  S9US'   U R                  R0                  US	'   U R2                  R5                  [7        U40 UD65      $ )
Nz+Trainer: training requires a train_dataset.training)description)r   
collate_fnnum_workers
pin_memorypersistent_workerssampler	drop_last)r  rankworker_init_fnprefetch_factor)ri  r   r  r4   r|  r(   r   _remove_unused_columns"_get_collator_with_removed_columns_train_batch_sizeri   r6  r  r  r	  rU   utilsdatar   _get_train_samplerr  rE   rN   r  r  rt  preparer   )rh   ri  r  dataloader_paramss       rm   get_train_dataloader(_UnslothGRPOTrainer.get_train_dataloader+  st   %JKK**** ""z-AQAQ'R'R 77S]7^M CCM_iCjM 004993Q3QQ'99;;))99"&))"I"I
 -)9)9)I)IJJ+/+B+B+Di(-1YY-K-Kk*29)I)IPTPYPYPgPg3./ 48993W3W/0''
=(VDU(VWWrp   datasetr   c           	         Uc  U R                   n[        UU R                  U R                  R                  U R                  -  U R
                  U R                  R                  -  U R                  U R                  R                  S9$ )N)data_sourcemini_repeat_countr   repeat_countshuffler  )	ri  r   r2  ri   r5  rI  r6  r4  r  )rh   r  s     rm   r   &_UnslothGRPOTrainer._get_train_samplerI  sq    2 ?((G"22yy66$:N:NN,,tyy/M/MM((
 	
rp   c                 T    [        UU R                  U R                  R                  S9$ )N)r  r  r  )r   r2  ri   r  )rh   rj  s     rm   _get_eval_sampler%_UnslothGRPOTrainer._get_eval_samplerm  s&    $"22
 	
rp   c	                 D   [        U5      (       a  UR                  R                  nX#S.n	Ub  Ub  XiS'   Ub  XYS'   Ub  XyS'   Ub  XS'   SU R                  ;   a  US-   U	S'   SU	S	'   UR                  " S0 U	D6R                  n
U
S S 2S S
2S S 24   n
U
S S 2U* S 2S S 24   n
U
$ )N)r   rg  rV  rU  rW  rX  r   r   F	use_cacher{   rM  )r7   
base_modelrd   r  last_hidden_state)rh   r  r   rg  r   rU  rV  rW  rX  model_inputsr  s              rm   _get_last_hidden_state*_UnslothGRPOTrainer._get_last_hidden_stateu  s     ))-88>>O &/Q %,*B-;)*#+7(+3G/0"*5' t444-;a-?L)*$)[!+11ALASS-a"ai8-a.1A1.DE  rp   	entropiesr   	thresholdc                 j   XR                  5          R                  5       nUR                  5       S:X  a#  [        R                  " U[        R                   S9$ [        R
                  " UR                  5       /UR                  S9nU R                  R                  U5      R                  5       R                  5       n[        R                  " XdR                  5       -
  UR                  S9n[        R                  " XG/5      n[        R                  " [        R                  " U5      U/5      n	U R                  R                  U5      n
U R                  R                  U	5      nXR                  5          n[        R                  " X5      nXR                  5       -  nX:  nXR                  5       -  $ )ah  
Returns a binary mask identifying tokens whose entropy exceeds a given quantile threshold.

Args:
    entropies (`torch.Tensor`):
        Tensor of shape (batch_size, seq_len) with per-token entropy values.
    mask (`torch.Tensor`):
        Binary mask of the same shape as `entropies`, where `1` indicates valid tokens and `0` padding.
    threshold (`float`):
        Quantile threshold between `0.0` and `1.0` to select high-entropy tokens.

Returns:
    `torch.Tensor`:
        Boolean mask of shape (batch_size, seq_len), where `True` indicates tokens with entropy >= threshold
        and `False` otherwise.
r   rz  r   )boolfloatnumelrU   r   r   r   rt  r-   r   rs  r1  cat	ones_likequantile)rh   r  r   r  non_pad_entropiesnon_pad_entropies_seq_length max_non_pad_entropies_seq_lengthpaddingpadded_entropiespadded_entropies_maskall_padded_entropiesall_padded_entropies_maskall_non_padded_entropiesentropy_thresholdmasked_entropiesentropy_masks                   rm   get_high_entropy_mask)_UnslothGRPOTrainer.get_high_entropy_mask  sf   " &iik288:""$)##IUZZ@@
 (-||5F5L5L5N4OXaXhXh'i$+/+;+;+B+BC_+`+d+d+f+k+k+m(++,/F/F/HHQbQiQi
 !99&7%AB %		5??;L+Mw*W X#//667GH$($4$4$;$;<Q$R!#78V8V8X#Y !NN+CO$zz|3'<iik))rp   c                 &   U(       a  g[        U S5      (       d  [        R                  R                  SS5      S:X  a  [        R
                  O[        R                  U l        [        R                  R                  SS5      S:X  a  [        R
                  U l        U	R                  SS 5      U	R                  S	S 5      pU	R                  S
S 5      U	R                  SS 5      pS[        R                  S'   U R                  R                  USS9n[        R                  R                  SU R                  S9   [        R                  " 5          U
cI  X R                  R                  :g  nUR                  UR                   5      nU" UUU
UUUS9R"                  nOU" UUU
UUUUS-   S9R"                  nS S S 5        S nU(       a  SSKJn  U" W5      nS S S 5        S[        R                  S'   WW4$ ! , (       d  f       ND= f! , (       d  f       N6= f)NNNrY  rZ  r[  r\  r]  r^  rU  rV  rW  rX  r_  Fr`  r  rd  rf  r   rh  r   )r,   )rg   rC   rm  r   rU   rn  ro  rY  rt  ru  rw  rx  r   rr  r   r   r  r   trl.trainer.utilsr,   )rh   rd   r   rg  r   r   compute_entropycompute_efficientri   rj   rU  rV  rW  rX  r  r   r  r,   s                     rm   "_get_per_token_logps_and_entropies6_UnslothGRPOTrainer._get_per_token_logps_and_entropies  s     4!2338:

Gcek8lpv8vu}}  }B  }K  }K$::>>"93?3F_d_l_lH\+1::nd+KVZZXhjnMo.06

;QRV0WY_YcYcdqrvYw+9<BJJ56"..;;EUZ;[O##&$BVBV#W))+#+*37L7L7Y7Y*Y)7):):>;O;O)P!0(1-;+7-;3G*5" !&  "1(1-;+7-;3G*5-;a-?" !&  ,0 !	"E 3F ;I9 X: :=BJJ569$$= ,+ XWs%   7HA&G13H1
G?	;H
Hextra_prefixesc                 ^    U=(       d    / nS/U-   nU H  nUR                  US5      nM     U$ )Nz_checkpoint_wrapped_module.r  )replace)rh   namer4  prefixesprefixs        rm   _fix_param_name_to_vllm+_UnslothGRPOTrainer._fix_param_name_to_vllm  s8    '-212^CF<<+D rp   r  r9  c                    Uc
  [        5       nUR                  5        H%  u  pEU(       a  U SU 3OUnU R                  XVUS9  M'     [        U[        5      (       a  [        R
                  " USSS9   UR                  5        H  u  pxU(       a  U SU 3OUn	U R                  U	S/S9n	X;   a  M-  UR                  U	5        U R                  S:X  aB  U R                  R                  (       a'  U R                  R                  XR                  5        M  U R                  S	:X  d  M   M     SSS5        gg! , (       d  f       g= f)
zdMemory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM.Nrr  )r9  visitedF)recurse	writebackz_fsdp_wrapped_module.)r4  r  r  )setnamed_children_sync_fsdp1_params_to_vllmr|  r   summon_full_paramsnamed_parametersr:  addr?  rt  r  r  update_named_paramr  )
rh   r  r9  r=  
child_namechild_modulechild_prefix
param_nameparam	full_names
             rm   rB  ._UnslothGRPOTrainer._sync_fsdp1_params_to_vllm  s&    ?eG(.(=(=(?$J7=fXQzl3:L++7 ,  )@ fd##((%P)/)@)@)B%J<B6(!J< 8
I $ < <YXoWp < qI + KK	*~~1d6F6F6V6V((;;IzzR:5 *C QP $PPs   2B/D4%D44
Ec                 x   UR                  5        H  u  p#UR                  (       a%  UR                  [        R                  " S5      5      nUR                  5       nU R                  S:X  a8  U R                  R                  (       a  U R                  R                  X#5        M  U R                  S:X  d  M   M     g )Nr  r  r  )itemsis_cpur   rU   r   full_tensorr?  rt  r  r  rF  )rh   r  r7  rK  s       rm   _sync_fsdp2_params_to_vllm._UnslothGRPOTrainer._sync_fsdp2_params_to_vllm3  s    !<<>KD||f!56%%'E~~)d.>.>.N.N  33D@:- *rp   c                     g rE  rM  )rh   ri   rj   s      rm   _move_model_to_vllm'_UnslothGRPOTrainer._move_model_to_vllmB  s    4rp   generation_batchc                 l   U R                   R                  (       a  SOSnUS:X  a  U R                  R                  U R                  -  nU R
                  U-  S:X  d  U R                  cg  U R                  U5      n[        U5      n[        U5      n[        XR                  R                  5      nU Vs/ s H  n[        U5      PM     snU l        U R                  U R
                  U R                  R                  -     nU =R
                  S-  sl        U$ U R                  U5      nU$ !    N= fs  snf )Nr  r  r   r   )rd   r  ri   r6  rI  r  r  _generate_and_score_completionsrR   rQ   rS   rX   )rh   rW  modegenerate_everygeneration_batchesbatchinputss          rm   _prepare_inputs#_UnslothGRPOTrainer._prepare_inputsD  s   " **--w67?!YY;;d>Q>QQNzzN*a/43H3H3P#'#G#GHX#Y #=>N#O (=>N(O% &77GIgIg%h"Zl(mZlQV)Ee)LZl(m%**4::		8V8V+VWFJJ!OJ
  99:JKF (ms   D* 3D1*D.c           
        > U R                   R                  n[        R                  " [	        U5      [	        U R
                  5      US9nUS    Vs/ s H  owS;  d  M
  UPM     nnU VV	s0 s H  owU V	s/ s H  oU   PM	     sn	_M     n
nn	U R                  U
S'   [        [        U R
                  U R                  U R                  5      5       GHk  u  nu  pn[        X5         [        U[        R                  5      (       a  [        US   5      (       aE  [        X#5       VVs/ s H  u  nnSUU-   0PM     nnnU Vs/ s H  n[!        UU5      S   PM     nnO#[        X#5       VVs/ s H  u  nnUU-   PM     nnnU" USSS	S
S9n["        TU ]I  U5      n[        R&                  " 5          U" S0 UD6R(                  S S 2S4   US S 2U4'   S S S 5        O[U" SX#US.U
D6nU Vs/ s H  nUb  UO[        R*                  PM     nn[        R,                  " U[        R.                  US9US S 2U4'   S S S 5        GMn     [        R0                  " U5      R3                  SS9R5                  5       (       a  [        R0                  " U5      R3                  SS9R7                  SS9S   S   nU
R9                  5        VVs0 s H  u  nnUS:w  d  M  UUU   _M     nnnUU   US'   UU   US'   [:        R<                  " SU S35        [?        U5      nU$ s  snf s  sn	f s  sn	nf s  snnf s  snf s  snnf ! , (       d  f       GN= fs  snf ! , (       d  f       GM  = fs  snnf )Nr   r   )r  r  completion_idstrainer_statemessagestextptTrightFre  return_tensorsr"  padding_sideadd_special_tokens)promptscompletionsrb  r  r   r   r   )as_tupler  r  z=All reward functions returned None for the following kwargs:
zH
Please ensure that at least one reward function returns a valid reward.rM  ) rt  r   rU   r1  rl  rh  stater  r   rk  r  rK   r|  rA   r  r3   r%   r\  r_  r   r   nanr   r   isnanallr  nonzerorO  r;   r  r-   )rh   r^  rl  rm  completion_ids_listr   rewards_per_funckeyr  examplereward_kwargsr  r  r  reward_func_namepcrd  r   textsreward_inputsoutput_reward_funcrewardnan_row_idxvaluerow_reward_kwargsr^  s                             rm   _calculate_rewards&_UnslothGRPOTrainer._calculate_rewardsj  so   !!(( ;;s7|S9J9J5KTZ[  &ayby7a,aybNRSds6B6s|6BBdS *.o&KT!!4#A#A4CYCYZL
GAG6F #4:k29955(33DGD]#^D]DAqZQ$7D]#^bj kbj]^!4Q8O!PQW!Xbj k36w3L M3L41aQ3L M$;"4T[pu%M %*G$;M$JM--/1<1M}1M1T1TUVXYUY1Z(A. 0/ *5 * 'Qd*hu*& ew)wdvZ`F4F&EII*Udv&)w-2\\:LTYTaTajp-q$QT*) ;:L
4 ;;'(,,,37799++&67;;;BJJTXJYZ[\]^_K:G:M:M:O!:OJCSVZiSi'U;'':O  ! +2+*>h'.9+.Fl+NNPQbPc dZ Z ""23_ cBS $_ k M
 0/ *x% ;:2!s   	LL+
L5LL%AM&L$
:ML*M+L/
=1M.!L5M'M-M?M
ML$M5
M?M
M	r^  c                 8>  > U R                   R                  nU R                  R                  (       a  SOSnU Vs/ s H  oDS   PM	     nn[        R
                  " U5      n0 nSUS   ;   nU(       a_  U V	s/ s H  oR                  S5      PM     n
n	SU
 Vs/ s H  o/PM     sn0nU H$  n[        U[        5      (       d  M  [        USS9  M&     [        U R                  S	S 5      nUc  S
n[        S5      n/ nU H  n0 n[        U5      [        La  SU0nUR                  5       U-
  nU H)  nUU;   d  M  UU   n[        U5      [         L d  M$  UUU'   M+     [#        UU R                  40 UD6S   nUR%                  U5        M     U R                  " ScUSSSSS.UD6n[&        TjU ]Q  U5      nUS   US   nnU R*                  Gb   U R,                  U R.                  U R0                  /nU Vs/ s H
  nUc  M  UPM     nn[3        UUU R*                  U5      u  nnU Vs/ s H=  n[4        R6                  " S[4        R8                  " U R:                  5       S3S
U5      PM?     nnU R<                  bT  U Vs/ s HG  n[4        R6                  " S[4        R8                  " U R<                  5       S3U R<                  U5      PMI     nnU R>                  (       Ga  U R@                  S:X  aS  U RB                  RD                  (       a8  [F        RH                  RK                  5         U RL                  RO                  5         U RP                  RR                  U RT                  :w  a+  U RW                  5         U RP                  RR                  U l*        U R@                  S:X  Ga  [Y        U5      nU(       a  [Y        W
5      nU R                   RZ                  (       a  US S U R\                  2   nU(       a  WS S U R\                  2   nOS n[_        U S5         U R`                  Rc                  UUU R\                  U Rd                  U Rf                  U Rh                  U Rj                  c  SOU Rj                  U Rl                  c  SOU Rl                  U Rn                  U Rp                  U RB                  Rr                  S9n U S   U S   4n!S S S 5        OS n!W!/n"[u        U"SS9  U"S   u  n#n$[w        U R                   Rx                  [{        U5      -  U R                   Rx                  S-   [{        U5      -  5      n%U#U%   n#U$U%   n$GOU R@                  S:X  Ga  U Rp                  (       a  [}        U Rp                  S9n&OS n&SU Rd                  U Rf                  U Rh                  U Rj                  c  SOU Rj                  U Rl                  c  SOU Rl                  U Rn                  U&SS .	n'U RB                  Rr                  b%  U'R                  U RB                  Rr                  5        [        Sc0 U'D6n(U R                  S:  a  [{        U5      n)[        U R                  5       V*s/ s H  n*S PM     n+n*[F        R                  R                  U+XR                  S!9  U+ V,V-s/ s H  n,U,  H  n-U-PM     M     nn,n-U(       al  [        U R                  5       V*s/ s H  n*S PM     n.n*[F        R                  R                  U.W
U R                  S!9  U. V,Vs/ s H  n,U,  H  oPM     M     nn,nOS nOUnU(       a  W
OS nU(       aL  U(       aE  / n/[        UU5       H2  u  nn0U0b  U/R%                  USU00S".5        M!  U/R%                  U5        M4     OUn/[_        U S5         U RL                  Rc                  U/U(SU R                  R                  S#SS$9S%9n1S S S 5        W1 V2V s/ s H#  n2U2R                    H  n U R                  PM     M%     n#n2n U1 V2V V3s/ s H^  n2U2R                    HJ  n U R                   V3s/ s H/  n3[        [        U3R                  5       5      5      R                  PM1     sn3PML     M`     n$n n2n3U R                  S:  aF  [F        R                  R                  U R                  S!9n4[w        U4W)-  U4S-   U)-  5      n5U#U5   n#U$U5   n$U RB                  RD                  (       a  U RL                  R                  SS&9  W# V6s/ s H  n6[F        R                  " U6US'9PM     n#n6[        U#U R                  S(9n#[F        R                  " UU#/SS)9n7W$ V8s/ s H'  n8[F        R                  " U8U[F        R                  S*9PM)     n9n8[        U9SS(9n9GOU R                  (       Ga  U R                  " ScS+U0UD6n:U R                  R                  R                  n;[        5       (       a  S,U R                  R                  lY        OS-U R                  R                  lY        [_        U S.5         [        U R                  U R                   U RB                  R                  S/9 n<[F        R                  " 5          U R                  (       a  [        R                  " U R                  SS09O	[        5          U RB                  R                  (       a   U<R                  [F        R                  5        O:U RB                  R                  (       a  U<R                  [F        R                  5        [F        R                  " 5          U<R                  U:R                  U R                  SS19n1S S S 5        S S S 5        S S S 5        S S S 5        S S S 5        W1R                  5        V s/ s H  n U R                  PM     n#n U# V6s/ s H  n6[F        R                  " U6US'9PM     n#n6[        U#U R                  S2S39n#U:R                   V6s/ s H  n6[F        R                  " U6US'9PM     nn6[        UU R                  SS39n[F        R                  " UU#/SS)9n7U;U R                  R                  lY        GO[_        U S45         [        U R                  U R                   U RB                  R                  S/9 n<[F        R                  " 5          U R                  (       a  [        R                  " U R                  SS09O	[        5          UUsUS'   US'   U<Rb                  " Sc0 UDU R                  SS5.D6n7S S S 5        S S S 5        S S S 5        S S S 5        UR                  S5      n=W7S S 2S U=24   nU7S S 2U=S 24   n#U#U R                  :H  n>[F        R                  " U>R                  S5      4U>R                  S5      [F        R                  US69n?U>R                  5       R                  SS)9U>R                  SS)9   U?U>R                  SS)9'   [F        R                  " U>R                  S5      US'9R                  U>R                  S5      S5      n@U@U?R                  S5      :*  R                  5       nA[        U#UAR                  5       5       VBVCs/ s H  u  nBnCUBUC   R                  5       PM     nDnBnCWAR                  S5      nEU R                   R                  UE5      nFUFR                  5       nGU R                  (       a3  U>R                  SS)9) nHWAUH) R                  S5      R                  5       -  nA[F        R                  " UWA/SS)9nIU#R                  S5      nJUS:X  a  U RB                  R                  OU RB                  R                  nKU(       d   [        U7U R                  R                  5      n7[F        R                  " 5          U RB                  R                  U R                  -  nLU RB                  GR                   UL-  S:w  d#  U R>                  (       at  U GR                  (       ab  U GR                  U R                  U7WIWJWKUR                  S75      UR                  S85      UR                  S95      UR                  S:5      S;9	u  nMn*OS nMU R>                  (       aM  U GR                  (       a;  [F        GR                  " WMW9-
  5      nN[F        GR                  " UNU GR
                  S<9nNU GR                  S:w  Ga  U GR                  bc  U GR                  U GR                  U7WIWJWKUR                  S75      UR                  S85      UR                  S95      UR                  S:5      S=9	u  nOn*OU R                   GR                  U R                  5      GR                  5          U GR                  U R                  U7WIWJWKUR                  S75      UR                  S85      UR                  S95      UR                  S:5      S=9	u  nOn*S S S 5        OS nOS S S 5        U R                  GR                  U#SS>9nPG[        US   5      (       aS  / nQ[        UWP5       H@  u  nnRUS   S?   S@:X  a  UGR                  5       SA   OS
nSWQR%                  S@USWR-   SB./5        MB     OWPnQU GR                  XWQWD5      nTUTU GR                  R                  U5      R                  S5      -  GR                  SS)9nUUUGR!                  SU R\                  5      GR#                  SS)9nVUVGR%                  U R\                  SS)9nVUUUV-
  nWU GR&                  SC;   aG  WUGR!                  SU R\                  5      GR)                  SS)9nXUXGR%                  U R\                  SS)9nXONU GR&                  SD:X  a"  WUGR)                  5       GR+                  UU5      nXOG[-        SEU GR&                   SF35      e[F        GR.                  " WX[F        GR0                  " UX5      5      nYU GR&                  SG:w  a  WWWXSH-   -  nW[w        U R                   Rx                  [{        U5      -  U R                   Rx                  S-   [{        U5      -  5      n%WWGR3                  5       nZUWU%   nWUS:X  ad  U RP                  =GR4                  U R                   R                  WIR                  5       5      R                  5       GR7                  5       -  sl        U RP                  GR4                  /U GR8                  U   SI'   U GR8                  U   SJ   R%                  WFGR;                  5       GR#                  5       GR7                  5       5        U GR8                  U   SK   R%                  UFGR;                  5       GR=                  5       GR7                  5       5        U GR8                  U   SL   R%                  UFGR;                  5       GR?                  5       GR7                  5       5        U R                   R                  U>R                  SS)95      n[UFU[   n\S[{        U\5      [{        UF5      -  -
  n]U GR8                  U   SM   R%                  U]5        [{        U\5      S:X  a  [F        GR@                  " SUS'9n\U GR8                  U   SN   R%                  W\GR;                  5       GR#                  5       GR7                  5       5        U GR8                  U   SO   R%                  U\GR;                  5       GR=                  5       GR7                  5       5        U GR8                  U   SP   R%                  U\GR;                  5       GR?                  5       GR7                  5       5        G[C        U GRD                  5       H  u  n^n_[F        GRF                  " WTS S 2U^4   5      GR7                  5       n`U GR8                  U   SQU_ SR3   R%                  U`5        G[I        UTS S 2U^4   5      GR7                  5       naU GR8                  U   SQU_ SS3   R%                  Ua5        M     U GR8                  U   ST   R%                  WVGR#                  5       GR7                  5       5        U GR8                  U   SU   R%                  WXGR#                  5       GR7                  5       5        U GR8                  U   SV   R%                  WYGR;                  5       GR#                  5       GR7                  5       5        U GRJ                  S   GRM                  [Y        U5      5        U GRJ                  SW   GRM                  [Y        WP5      5        G[C        U GRD                  5       H>  u  n^nbU GRJ                  SX   Ub   GRM                  WTS S 2U^4   R                  5       5        M@     U GRJ                  SY   GRM                  WZR                  5       5        U(       a)  U GRJ                  S   GRM                  [Y        W
5      5        U R>                  (       GaK  U GR                  (       Ga8  [F        GRN                  " WMW9-
  5      ncUcWAR                  5          ncUcGRQ                  5       S:  a  [F        GR"                  " Wc5      O[F        R                  " SUS'9ndWcGRQ                  5       S:  a  [F        GR>                  " Wc5      O[F        R                  " SUS'9neU GR8                  U   SZ   R%                  U R                   R                  Wd5      GR#                  5       GR7                  5       5        U GR8                  U   S[   R%                  U R                   R                  Ue5      GR?                  5       GR7                  5       5        WNWAR                  5          nfUfGRQ                  5       S:  a  [F        GR<                  " Wf5      O[F        R                  " SUS'9ngWfGRQ                  5       S:  a  [F        GR"                  " Wf5      O[F        R                  " SUS'9nhWfGRQ                  5       S:  a  [F        GR>                  " Wf5      O[F        R                  " SUS'9niU GR8                  U   S\   R%                  G[S        U R                   R                  Wg5      5      GR7                  5       5        U GR8                  U   S]   R%                  U R                   R                  Wh5      GRG                  5       GR7                  5       5        U GR8                  U   S^   R%                  G[U        U R                   R                  Ui5      5      GR7                  5       5        UUU#WAWWWGS_.n WMb  WMU S`'   U R>                  (       a  U GR                  (       a  WNU Sa'   WOb  WOU Sb'   S7U;   a  US7   U S7'   S8U;   a  US8   U S8'   S9U;   a  US9   U S9'   S:U;   a  US:   U S:'   U $ s  snf s  sn	f s  snf s  snf s  snf s  snf ! , (       d  f       GN1= fs  sn*f s  sn-n,f s  sn*f s  snn,f ! , (       d  f       GN]= fs  sn n2f s  sn3f s  sn3n n2f s  sn6f s  sn8f ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN"= fs  sn f s  sn6f s  sn6f ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= fs  snCnBf ! , (       d  f       GN= f! , (       d  f       GN#= f)dNr  r  r  r  r   imagesr   )
num_imageschat_templater  )r  chosenrejectedr  rd  labelrf  TleftFrh  r   rg  z^(z)+(r  r  zvLLM.generater{   r   )rl  r  nr;  r   r7  r8  r9  
max_tokensr  r:  rb  logprobs)from_process)regex)	r  r;  r   r7  r8  r9  r  guided_decodingr  )r  )r  multi_modal_datagrpo_trainer_lora_model)load_tensors)r  use_tqdmlora_requestr  r   )padding_valuer   )r   r  re  paged_attention
sdpa_pagedztransformers.generate_batch)gather_deepspeed3_params)r>  )r  progress_barrg  )r  rj  ztransformers.generate)r  disable_compilern  rU  rV  rW  rX  )rU  rV  rW  rX  r   )r   rU  rV  rW  rX  )skip_special_tokensrole	assistantcontent)r  r  )r  noner]  z!Invalid value for scale_rewards: z-. Must be one of 'batch', 'group', or 'none'.r  g-C6?
num_tokenszcompletions/mean_lengthzcompletions/min_lengthzcompletions/max_lengthzcompletions/clipped_ratioz"completions/mean_terminated_lengthz!completions/min_terminated_lengthz!completions/max_terminated_lengthrewards//mean/stdr  
reward_stdfrac_reward_zero_stdr  r  r   z&sampling/sampling_logp_difference/meanz%sampling/sampling_logp_difference/maxz&sampling/importance_sampling_ratio/minz'sampling/importance_sampling_ratio/meanz&sampling/importance_sampling_ratio/max)
prompt_idsprompt_maskrb  r|  r   num_items_in_batchold_per_token_logpsimportance_sampling_ratioref_per_token_logpsrM  )rt  r   rd   r  r'   deepcopyr   r|  r  rH   r  rr  r@  ry  r  r  rZ  r=   r   r\  r_  r1  ru  rv  rw  rW   rM   subescaper  rt  r>  r?  ri   rA  rU   r  empty_cacher  wake_uprp  global_stepr  rU  r.   r  r2  rK   r  generater;  r   r7  r8  r9  r   r  r:  r&   slicer  rl  GuidedDecodingParamsr  r  rH  ri  r  all_gather_objectr  r   	load_loraoutputs	token_idsr  nextiterr  logprobget_rankr  r   rD   r   r  r   r<  model_wrappedr  _attn_implementationr5   rY   r3  r   r  r   rC  rB   r  r   ro  r[  rn  r   generate_batchr   r  generated_tokensr   r  fulllongrb  argmaxr  r   expandr   r  tolistr   r-   rM  r  r  r   r6  rI  r  rS  r2  r   r   rT  r   r  ru  disable_adapterbatch_decoder3   popr  rK  nansumviewr   repeat_interleaverL  std	expand_asr   iscloser   clonenum_input_tokens_seenrs  r  r  r   r   r1  r  r  nanmeanr@   r  extendabsr  r?   r>   )krh   r^  r   rZ  r   rl  original_promptsrj   
has_imagesrx  r  imgr  _chat_template__supported_keys_prompts_text	_example__tokenizer_kwargs__left_keys_kv_x_prompt_inputsr  r  	protectedr   re  all_prompts_text
all_imagesordered_set_of_promptsordered_set_of_imagesrk   payloadobj_listrb  all_logprobsprocess_slicer  r:  r  	orig_sizer  gathered_promptssublistr{  gathered_imagesvllm_inputsr  all_outputsr  lplocal_rank_in_grouptp_sliceidsprompt_completion_idsr  sampling_per_token_logpspaged_prompt_inputsprevious_attnr  prompt_lengthis_eoseos_idxsequence_indicesr|  rowmask_rowru  completion_lengthsagg_completion_lengthsr  truncated_completionsrg  r   r   r[  r  r  r  completions_textrm  r  	bootstraprv  r  mean_grouped_rewardsr   std_rewardsis_std_zeroall_process_advantagesagg_terminated_with_eosterm_completion_lengthsclipped_completions_ratior  rz  mean_rewardsstd_func_rewardsr7  r   
mean_delta	max_deltaflat_is_ratiomin_importance_sampling_ratiomean_importance_sampling_ratiomax_importance_sampling_ratior^  sk                                                                                                             rm   rY  3_UnslothGRPOTrainer._generate_and_score_completions  s    !!((**--w6(./1X;/
  ==1
 q	)
:@A&wkk'*&FA& 9&3& 9:F!fd++/1E "
 "$"7"7$O"bObcI!#Id*%y1	#..*-==K '!!AAw#~01*1-	 !
 ,It7L7LcPbcdlmC$   -- 
$
 
 />"/"<mL\>]K
!!- ,,d.H.H$JbJbcI,5KI5IIK&DK)?)?'#J _kk^jVZBFFb4>>)B(C2#FDQ^jLk +eq eq]aBFFa		$*:*: ;<B?AQAQSWXeq    ===~~+		0P0P

&&(  " zz%%)?)??((*)-)?)?& ~~)#0#> !.v!6J##33 .>>UAUAU>U-V*!0:;Rd>R>R;R0S-04-*4A!%!1!1!:!:$:#8"22/3/F/F(,(8(8"&**(,

(:"

)-);#'+'A'A262L2L.2ii.I.I "; " $**:#;VJ=O"P BA  #G $9%hQ?/7{, %$$22S\A%%33a73w<G! "0!>+M: :---&:A[A[&\O&*O *.*A*A#'#3#3!ZZ#'::#5R4::$(JJ$6SDJJ"&"<"<'6 !
%! 99..:%,,TYY-H-HI"0"E3D"E11A5 !$L 1I6;D<Z<Z6['\6[6[$'\%%778H,^k^k7l9I'[9IgSZaSZ9I$'[!9>t?]?]9^*_9^A49^*_));;OV[_[h[h;i9H%\gT[ScT[c
%\
%)
'3$+54J*"$K),-=z)J ,'..&W^`eVf/gh'..v6	 *K #3K&t_="&(("3"3KQ`kp  BF  BL  BL  BV  BV  Wp  AE  BV  BF"3  #GK > CN!l+w\c\k\kRX&"2"2\k"2+!l $/ #.")// @FOT$ryy{+,44O"1 P#.    11A5 +0*;*;*D*D4==*D*Y'$%89%DGZ]^G^bkFklH%3H%=N#/#9L9933HHNNN+ KYY.3ell3v>.NY t?P?PQN$)IIz>.JPQ$R![g([gxXfEMMJ[g % ( (++CSV'W$((( #'"7"7"T\"TV"T ..55JJM(**AR""))>AM""))>!$(EF+&&(8(8SWS\S\SvSv$NRNbNb''(:(:EJhshuu 99>>#&&u~~6YY^^#&&u}}5))+"1"@"@+55I_I_ns #A #K , v   G  EPDVDVDXYDX&f55DXNYJXY.3ell3v>.NY t?P?P_fgNFYFcFcdFcs%,,s6:FcJdZt7H7HW]^J$)IIz>.JPQ$R!=JD%%: "$(?@+&&(8(8SWS\S\SvSv$NRNbNb''(:(:EJhshuuNXZeKk*M:J,K(7(@(@ )#)7;7M7M_c)% v   A 'OOA.M.q.=./@AJ21mn3DEN  4#4#44**fkk!n.AejjY_`%+ZZ\%8%8Q%8%?

q
@Q%R

q
!" <<AvFMMfkkZ[n^`a+w/@/@/CCHHJ LO~_n_s_s_uKvwKv-#xs8}335Kvw -003!%!1!1!8!89K!L3779 **%+ZZAZ%6$6!-2G1G0R0RST0U0Y0Y0[[O K#AqI',,Q/>BgoTYY::SWS\S\SwSw
$56KTMbMbMoMo$p!]]_ "YY;;d>Q>QQNyy44~EJ$"J"J"J)-)P)PJJ)""!.!2!2>!B#0#4#45E#F)6):):;Q)R - 1 1- @ *Q 
*&#Q '+# }}!I!I!I,1II6ILd6d,e),1KK-43T3T-)
 yyC>>--1-T-T-&&#-%2%6%6~%F'4'8'89I'J-:->->?U-V$1$5$5m$D .U 
.*' ))66tzzBRRT151X1X JJ1**'1)6):):>)J+8+<+<=M+N1>1B1BCY1Z(5(9(9-(H 2Y 
2.+Q UT '+#{ @  00==nbf=gVAY''K&)'3C&D"
7=bz&7I[7XFJJL3^`	""[YQ[E[$\#]^ 'E +K
  226[Zmn $d&9&9&<&<V&D&N&Nq&QQYY^_Y`  '||B0D0DEJJqJQ  4EEdFZFZ`aEb33
!22!,,r4+?+?@DDDKK%778L8LRS7TK7*!++-11':K3D4F4F3GGtu  mmK1A1A+1NO'#{T'9:J **S\9++a/3w<?
 ",!1!1!3.
 7?JJ,,0@0@0G0GHZHZH\0]0a0a0c0h0h0jj,-1ZZ-M-M,NdL) 	d56==>T>Z>Z>\>a>a>c>h>h>jkd45<<=S=Y=Y=[=_=_=a=f=f=hid45<<=S=Y=Y=[=_=_=a=f=f=hi #'"2"2"9"9&***:K"L"89P"Q$%,C(DsKaGb(b$b!d78??@YZ&'1,&+kk!F&C#d@AHHI`IfIfIhImImIoItItIvwd?@GGH_HeHeHgHkHkHmHrHrHtud?@GGH_HeHeHgHkHkHmHrHrHtu $-T-C-C#DA ==)9!Q$)?@EEGLMM$(+;*<E BCJJ<X%&6q!t&<=BBDMM$(+;*<D ABIIJZ[	 $E
 	dH%,,-A-F-F-H-M-M-OPdL)001A1A1C1H1H1JKd23::;;L;L;N;S;S;U;Z;Z;\] 	

8##M,$?@

< ''6F(GH !7!78GAtJJy!$'../?1/E/L/L/NO 9

< ''(>(E(E(GHJJw&&}V'<====TEEEII14LLME/..01E.3kkma.?E*U\\RU^dEeJ,1KKMA,=		%(5<<PS\bCcIMM$ HIPP  ''
388:??A MM$ GHOO  ''	2668==? 6o6J6J6LMM,9,?,?,AA,E		-(5<<X[djKk * .;-@-@-BQ-F

=)ELLY\ekLl + -:,?,?,AA,E		-(5<<X[djKk * MM$ HIPPt''../LMNSSU MM$ IJQQ  ''(FGOOQVVX MM$ HIPPt''../LMNSSU
 %&,.$"4
 *,?F()==TEEE2KF./*,?F()]*%2>%BF>"},'45E'FF#$!]2-:;Q-RF)*M)$1-$@F=!e 0 B 9L L
 l @ BAp (]'[ +`%\" >= "mO $ Z(8 ,+ vu    GF  ZYd vu    A@2 xL UTa _s  Aw6AwAw.Aw9Aw AAw!7AAw&B0Aw+Aw=AxAx>Ax5Ax
*Ax%=$Ax0!6Ax+Ax04Ax7.Ax<4Az	Ay7";Ay%BAy	(&AyAy	Ay%Ay7&Az	Az$Az (Az%4A{ A{(;Az<#-Az*	Az<A{ A{ ) A{2"GA|
FA"A{8G#A|
w+
Aw:x
Ax"x+Ax0y
AyyAy	y
Ay"yAy%y%
Ay4y/Ay7y7
Az	zAz	z	
Azz*
Az9z4Az<z<
A{{A{{
A{	{A{ { 
A{/{8
A|	|A|
|

A|c                    US   US   pCUS   US   pe[         R                  " X5/SS9n[         R                  " XF/SS9nUR                  S5      n	U R                  UUUU	UR	                  S5      UR	                  S5      UR	                  S	5      UR	                  S
5      5      n
U R                  U
UR                  R                  UUUS   UR                  R                  UR	                  S5      UR	                  S5      S9u  pU R                  S:w  a  US   OS nUS   nU R                  R                  (       a  SOSnU R                  S:w  aV  U R                  U   S   R                  U R                  R                  U5      R!                  5       R#                  5       5        U R                  U   S   R                  U R                  R                  U5      R!                  5       R#                  5       5        XR$                  -  $ )Nr  r  rb  r|  r   r   rU  rV  rW  rX  r   r  r  )_input
lin_weightselected_token_idsrg  r   biasr  r  r   r   r{   r  r  kl
clip_ratio)rU   r  r   r  r   r  r  rq  r  r   rd   r  r  r   rt  r-   r   rs  #current_gradient_accumulation_steps)rh   r  r^  r  r  rb  r|  r   rg  r   r  r  metricsr   r  rZ  s                   rm   compute_liger_loss&_UnslothGRPOTrainer.compute_liger_loss
  s   "("6}8MK*01A*BFK\D]IIz:B	K#AqI',,Q/ !77JJ~&JJ'(JJ-.JJ}%	
 ,,$&..55-*l+ ((-- &

+@ A &

+@ A - 	
 !%		S 0'!*dR[
**--w699MM$%,,T-=-=-D-DW-M-R-R-T-Y-Y-[\dL)001A1A1H1H1T1Y1Y1[1`1`1bc>>>>rp   c                 r  ^  U(       a  [        S5      eUS   US   peUS   US   pUR                  SS 5      UR                  SS 5      pUR                  SS 5      UR                  S	S 5      p[        R                  " XW/S
S9nUR                  u  p[        R                  " Xh/S
S9nUR                  S
5      nUnUnS.U 4S jjnU" XUUSS9nUR                  SS 5      nUS   nUR                  SS 5      nUS S 2U* S 24   n[        UR                  SS5      nUc  Sn[        UR                  SS5      nUc  Sn[        UR                  SS5      nUc  SnUb  Ub  US S 2S S2S S 24   nUb  US S 2S S2S S 24   nUS S 2S S2S S 24   n[        UUUUUT R                  U4U	U
T R                  R                  T R                  T R                  T R                  T R                  R                  T R                  R                   T R                  R"                  UUUS.6u  nnnGO,[%        T R                  S5      (       a  ['        S/0 ST _SU_SU	_SU
_SU_SU_SU_SU_SU_ST R                  R(                  _ST R                  R                  _ST R                  _S T R                  _S!T R                  _S"T R                  R                  _S#T R                  R                   _S$T R                  R"                  _S%U_S&U_S'U_S(U_6u  nnnOA['        T UUUUUUT R                  R(                  T R                  R"                  UUUUS)9u  nnn  S*T R*                  ;   a  T R,                  R.                  (       a  S+OS*nT R*                  U   S,   R1                  UR3                  5       5        T R*                  U   S-   R1                  UR3                  5       5        U$ T R*                  S,   R1                  UR3                  5       5        T R*                  S-   R1                  UR3                  5       5        U$ )0Nz2The GRPOTrainer does not support returning outputsr  r  rb  r|  rU  rV  rW  rX  r   r   c           	      z   > [        TS5      (       a  TR                  XX#U5      $ TR                  XX#XEU5      S   $ )N_get_per_token_logpsr   )rg   r%  r2  )rd   r   rg  r   r   r0  r1  rh   s          rm   r  2_UnslothGRPOTrainer.compute_loss.<locals>.<lambda>
  s\    t344 %%eXij Y33Enfp  DU  V  WX  YYrp   T)r1  r  r   r  final_logit_softcappingr   logit_scalelogits_scalingr{   )rU  rV  r   r   r   r   r   r   r   r   r   r   r   r{  r   r   r  r  r?  r   r   r   r   r   r   r   r   r   rg  )r{  r   r   r|  r   r  r  r?  r   r   r   r   rg  r  r  r   r  NFFrM  )r   r   rU   r  r   r   r  r  r  r   ri   r   r   r   r   r   r   r   rg   r  r  r  controlshould_evaluater   rs  ) rh   rd   r^  return_outputsr  r  r  rb  r|  rU  rV  rW  rX  r   r}  r~  rg  r   r;  _logits_to_keepget_logps_funcr   r  r   r  r   r   r   r  r   r   rZ  s    `                               rm   r   _UnslothGRPOTrainer.compute_loss
  s   QRR"("6}8MK*01A*BFK\D]'-zz.$'GTdfjIkn,2JJ7Md,SU[U_U_`mnrUskIIz:B	OO	K#AqI',,Q/
(Y 	 )>>ost #JJ'<dC L)
 #JJ'<dCa.!112	 $ELL2KQO$!&7&u||]AF')=$U\\3CQG%A'9& ,$5a"ai$@! ,$5a"ai$@!-a"ai8O/E!!		0  ,!/ II//,0,J,J"..#00(,		(G(G		"ii33$5';%7'0,D#W, tyy+..3H 4"4 *4 $04 &4	4
 &44 '64 ",4 ):4 ):4  $yy;;4 !%		 3 34 150N0N4 #'"2"24 $(#4#44 -1II,K,K4  !IIOO!4" #'))"7"7#4$ ):%4& ,@'4( *<)4* &4+40'2 4I" *%3&5!+(9(9#yy;;"&))"7"7(9+?);%340' 
 dmm#!\\996wDMM$ 34;;<M<R<R<TUMM$%,,W\\^<  MM-.556G6L6L6NOMM$&&w||~6rp   c                   ^&^' US   US   pCUS   US   snm&[         R                  " X5/SS9n[         R                  " UT&/SS9nUR                  S5      nU R                  UUUUSUR	                  S5      UR	                  S	5      UR	                  S
5      UR	                  S5      S9	u  pU R
                  S:  a!  U R                  U
T&SU R
                  -
  5      nOS nU R                  S:w  a%  US   n[         R                  " X-
  5      X-
  -
  S-
  nUS   nUR	                  S5      nUc  U	R                  5       OUnX-
  nU R                  S:X  a  UnOnU R                  S:X  aE  UT&-  R                  S5      T&R                  S5      R                  SS9-  nUR                  S5      nO[        SU R                   S35      e[         R                  " U5      n[         R                  " USU R                  -
  SU R                   -   5      nU R"                  R$                  b)  [         R                  " UU R"                  R$                  S9nUUR                  S5      -  nUUR                  S5      -  n[         R&                  " UU5      * nUb  UU-  nU R(                  (       a  U R*                  (       a  UUS   -  nU R                  S:w  a  UU R                  W-  -   nU R,                  S:X  aQ  UT&-  R                  S5      T&R                  S5      R                  SS9-  R/                  5       nUU R0                  -  nOU R,                  S:X  aA  UT&-  R                  5       T&R                  5       R                  SS9-  nUU R0                  -  nOU R,                  S:X  aB  UT&-  R                  5       UR                  S5      U R2                  -  -  nUU R0                  -  nO[U R,                  S:X  a3  US   U R4                  R6                  -  nUT&-  R                  5       U-  nO[        S U R,                   35      eU R8                  R:                  (       a  S!OS"nT&R                  5       R                  SS9m'U&U'4S# jnU R                  S:w  a^  U" W5      nU R<                  U   S$   R?                  U R4                  RA                  U5      RC                  5       RE                  5       5        U" U
5      nU R<                  U   S%   R?                  U R4                  RA                  U5      RC                  5       RE                  5       5        USU R                  -
  :  UR                  S5      S:  -  nUSU R                   -   :  UR                  S5      S:  -  nUU-  nU" URG                  5       5      n U" URG                  5       5      n!U" URG                  5       5      n"U R4                  RA                  U 5      n#U R<                  U   S&   R?                  U#RC                  5       RE                  5       5        U R<                  U   S'   R?                  [I        U#5      RE                  5       5        U R4                  RA                  U!5      n$U R<                  U   S(   R?                  U$RC                  5       RE                  5       5        U R<                  U   S)   R?                  [K        U$5      RE                  5       5        U R4                  RA                  U"5      n%U R<                  U   S*   R?                  U%RC                  5       RE                  5       5        U$ )+Nr  r  rb  r|  r   r   TrU  rV  rW  rX  )r0  rU  rV  rW  rX  r   r   r  r   r  r   r   r{   r   r   r   r   r  r   r   r   r   r  r  r   r  r  c                 v   > U R                   S   S:X  a  U R                  5       $ U T-  R                  5       T-  $ r   )r   r   r   )r   r|  completion_token_counts    rm   r   <_UnslothGRPOTrainer._compute_loss.<locals>.masked_batch_mean  s7    wwqzQvvxO+0025KKKrp   r  entropyzclip_ratio/low_meanzclip_ratio/low_minzclip_ratio/high_meanzclip_ratio/high_maxzclip_ratio/region_mean)&rU   r  r   r2  r   rQ  r+  r   r   r   r   r   r   r   r   r   r   ri   r   r   r>  rS  r   r   r  r   rt  r  rd   r  r  r   r-   r  rs  r  r?   r>   )(rh   rd   r^  r  r  rb  r   rg  r   r   r  r*  r  per_token_klr   r  r   r   r  r  per_token_loss1per_token_loss2per_token_lossr  
normalizerrZ  r   r   mean_entropyis_low_clippedis_high_clippedis_region_clippedlow_clip	high_clipr  gathered_low_clipgathered_high_clipgathered_clip_ratior|  r3  s(                                         @@rm   _compute_loss!_UnslothGRPOTrainer._compute_lossN  s   "("6}8MK*01A*BFK\D]'IIz:B	K#AqI',,Q/ &*%L%L N3!::&67!',B!C

=1 &M 
&
" $$s*55iRSVZVoVoRopLL 99"()>"?		-?@DWDijmnn 
 L)
 %jj)>?:M:Uo446[n#9	))W4%."++z9&//&A%F%Fr%J_M`M`acMdMjMjorMjMs%s"%;%E%Eb%I"5d6T6T5U V" "  12VQ)9)9%91t?P?P;PQ 99??&[[TYY__=F :#7#7#:: :#7#7#::))O_EE#+l:N==TEE+f5P.QQN99+dii,.FFN>>V##o5::2>ATATUWAXA^A^cfA^AggmmoD$BBBD^^v%"_499;o>Q>Q>S>Y>Y^a>Y>bbD$BBBD^^y("_499;~?R?RST?UX\XrXr?rsD$BBBD^^v% 458H8H8V8VVJ"_499;jHD24>>2BCDD **--w6!0!4!4!6!<!<!<!E	L 99'5GMM$%,,T-=-=-D-DW-M-U-U-W-\-\-^_(3dI&--d.>.>.E.El.S.[.[.].b.b.de !1t'7'7#77J<P<PQR<SVW<WX!A(9(9$99j>R>RST>UXY>YZ*_<$^%9%9%;<%o&;&;&=>	&'8'>'>'@A
 ,,33H=d1299:K:S:S:U:Z:Z:\]d0188@Q9R9W9W9YZ!--44Y?d23::;M;U;U;W;\;\;^_d1299&AS:T:Y:Y:[\"..55jAd45<<=P=X=X=Z=_=_=abrp   ignore_keysc                 >   U R                  U5      n[        R                  " 5          U R                  5          U R	                  X5      nS S S 5        WR                  5       R                  5       nS S S 5        WS S 4$ ! , (       d  f       N9= f! , (       d  f       N$= frE  )r_  rU   r   compute_loss_context_managerr  r   r   )rh   rd   r^  r  rF  r  s         rm   prediction_step#_UnslothGRPOTrainer.prediction_step  su    %%f-]]_224((7 599;%%'D  T4 54 _s"   BA=
&B=
B	B
Blogs
start_timec           	        > U R                   R                  (       a  SOSnU R                  U   R                  5        VVs0 s H  u  pEU[	        U5      [        U5      -  _M     nnnUS:X  a(  UR                  5        VVs0 s H  u  pESU 3U_M     nnn0 UEUEn[        TU ]  X5        U R                  U   R                  5         U R                  R                  (       Ga  U R                  (       Ga  [        5       (       ab  [        U R                  S   U R                  S   U R                  S   U R                  S   U R                  R                   U R"                  5        U R$                  R&                  (       Gat  SU R$                  R&                  ;   GaX  [(        R*                  GbE  S	S Kn[/        U R                  R                   5      /[        U R                  S   5      -  U R                  S   U R                  S   S
.U R                  S   ESU R                  S   0EnU R                  S   (       a[  / US'   U R                  S    HC  n	U	b)  US   R1                  [(        R3                  U	5      5        M/  US   R1                  S 5        ME     UR5                  U5      n
U R6                  (       a  U
R9                  S/S9n
[(        R                  S[(        R;                  U
S905        g g g g g g s  snnf s  snnf )Nr  r  eval_r  r  r  r   wandbr   )stepr  r  	advantager  )subsetrm  )	dataframe)rd   r  r  rO  r   rl  r\  logclearrt  r  rU  r8   rJ   r  rp  r  rV  ri   r  rO  runpandasrZ  r   Image	DataFramerW  drop_duplicatesTable)rh   rK  rL  rZ  rw  valr   pdtabler  dfr^  s              rm   rT  _UnslothGRPOTrainer.log  s   **--w6<@MM$<O<U<U<WX<W3C3s8++<WX 6>:A--/J/hcse}c)/GJ"$"'"D%d!!#+++0D0D0D ""/JJx(JJ|,JJy)JJ|,JJ**11 yy"""w$))2E2E'E%))J_# !!7!789C

8@T<UU"jj2"&**\": jj+	
  L!9 ::g&%'E'N#zz'2?!'N11%++c2BC!'N11$7  3 \\%(00++H:+>B		=%+++*CDE/ K`'E" 1E+ Y
 Ks   $K:L c                   > U R                   R                  c*  [        U R                   R                  5      R                  nO(U R                   R                  R                  S5      S   nU R                  US9  [        TU ]!  X5        g )Nrq  r{   )r  )	ri   r  r   r  r7  r  create_model_cardr\  _save_checkpoint)rh   rd   trialr  r^  s       rm   rc  $_UnslothGRPOTrainer._save_checkpoint  sj    99!!)dii22388J//55c:2>J*5 .rp   r  dataset_nametagsc                    U R                  5       (       d  g[        U R                  R                  S5      (       ac  [        R
                  R                  U R                  R                  R                  5      (       d!  U R                  R                  R                  nOSnUc  [        5       nO$[        U[        5      (       a  U1nO[        U5      n[        U R                  R                  S5      (       a  UR                  S5        S[        R                  ;   a  UR                  S5        UR                  U R                  5        [        R                   " S5      n[#        UUU R$                  UU['        5       (       a+  [(        R*                  b  [(        R*                  R,                  OS[/        5       SUS	S
S9nUR1                  [        R
                  R3                  U R4                  R6                  S5      5        g)a  
Creates a draft of a model card using the information available to the `Trainer`.

Args:
    model_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the model.
    dataset_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the dataset used for training.
    tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
        Tags to be associated with the model card.
Nr  unsloth_versionunslothJOB_IDhf_jobsa              @article{shao2024deepseekmath,
                title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
                author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
                year         = 2024,
                eprint       = {arXiv:2402.03300},
            }
            GRPOzRDeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Modelsz
2402.03300)r  r  r  rf  rg  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zerorg   rd   r  rC   pathisdirr  r@  r|  rZ  rE  rm  r  r  rT   dedentr/   r  r:   rO  rV  urlr0   savejoinri   r  )rh   r  rf  rg  r  citation
model_cards          rm   rb  %_UnslothGRPOTrainer.create_model_card  sn   " ))++4::$$o66rww}}TZZM^M^MlMl?m?m**88JJ <5Dc""6Dt9D4::$$&788HHYrzz!HHYDOO$ ??	
 )!!**%'9';';		@Ueiimm[_.0%l!

 	TYY%9%9;GHrp   ):rY  r  r  r  r  r  r  r  r  r   r  r   r   r  r  rt  ru  r   r  r  rU  r   rM  r   r1  r9  r  r  rV  r2  rI  r  r   r  r;  r  rh  rk  rK  rL  r4  r   rQ  r8  r7  r  rR  r<  r>  rw  rv  r  rG  rT  rS  r?  rH  rW  )NNNNNNr.  NrE  )NNNNr*  )r  N)FN)NNN)8rN  rO  rP  rQ  r`  r  r	   rZ  r   r   r  r   r   r   r   r  r   r   r"   tuplerU   r  	Optimizerlr_schedulerLambdaLRr]  r  r  r   r   r  rL   r  r   r  r+  r2  r:  rA   r  rB  rR  rU  r   r_  r  rY  r!  r  rD  rI  rT  rc  rb  rS  rc  rd  s   @rm   rf  rf  ;  s   _B J &*CGnrUYmq59jv.2xS/)*x JZ(889x z"	x
  g&> ?@x uWotCwXgOgIhDh?i%ijkx #5)@.)P#QRx $,E2I4PgKh2h,i#jx D12x (5;;#8#898EKKD\D\DeDe;ffgx l+x xt:$X<"
(7*; "
w "
H
 
  !*! *!X&*u|| &*5<< &*\a &*fkfrfr &*P quX]CJHT#Y<O  C :  @# $S%c0A*B%B C#	c5s*++	,# #J 4  4 lx4U5<<+<%= =>?x	c5s*++	,xt(?TGR~@ PXY]^aYbPc  /FS%Z( /Fhuo /FQU /F /Fd/ %)&*,0	CISMCI smCI CcD()	CI CIrp   rf  c                   >   ^  \ rS rSrSr       SU 4S jjrSrU =r$ )UnslothGRPOTraineriT  a  
    
Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the
paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language
Models](https://huggingface.co/papers/2402.03300).

Example:

```python
from datasets import load_dataset
from trl import GRPOTrainer

dataset = load_dataset("trl-lib/tldr", split="train")
def reward_func(completions, **kwargs):
    # Dummy reward function that rewards completions with more unique letters.
    return [float(len(set(completion))) for completion in completions]
trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",
    reward_funcs=reward_func,
    train_dataset=dataset,
)

trainer.train()
```

Args:
    model (`Union[str, PreTrainedModel]`):
        Model to be trained. Can be either:

        - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
          path to a *directory* containing model weights saved using
          [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
          using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
          `args.model_init_kwargs`.
        - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
    reward_funcs (`Union[RewardFunc, list[RewardFunc]]`):
        Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
        functions with the prompts and completions and sum the rewards. Can be either:

        - A single reward function, such as:
            - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
            path to a *directory* containing model weights saved using
            [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
            using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the
            keyword arguments in `args.model_init_kwargs`.
            - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
            - A custom reward function: The function is provided with the prompts and the generated completions,
              plus any additional columns in the dataset. It should return a list of rewards. Custom reward
              functions can also return `None` when the reward is not applicable to those samples. This is useful
              for multi-task training where different reward functions apply to different types of samples. When a
              reward function returns `None` for a sample, that reward function is excluded from the reward
              calculation for that sample. For more details, see [Using a custom reward
              function](#using-a-custom-reward-function).

              The trainer's state is also passed to the reward function. The trainer's state is an instance of
              [`~transformers.TrainerState`] and can be accessed by accessing the `trainer_state` argument to the
              reward function's signature.
        - A list of reward functions, where each item can independently be any of the above types. Mixing different
        types within the list (e.g., a string model ID and a custom reward function) is allowed.
    args ([`GRPOConfig`], *optional*, defaults to `None`):
        Configuration for this trainer. If `None`, a default configuration is used.
    train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
        Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset is
        ignored. The format of the samples can be either:

        - [Standard](dataset_formats#standard): Each sample contains plain text.
        - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
          and content).
    eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
        Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`] or `None`, *optional*, defaults to `None`):
        Processing class used to process the data. The padding side must be set to "left". If `None`, the
        processing class is loaded from the model's name with [`~transformers.AutoProcessor.from_pretrained`]. A
        padding token, `tokenizer.pad_token`, must be set. If the processing class has not set a padding token,
        `tokenizer.eos_token` will be used as the default.
    reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
        Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:

        - A single processing class: Used when `reward_funcs` contains only one reward function.
        - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
        If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is
        `None`, the tokenizer for the model is automatically loaded using
        [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward
        functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes`
        are ignored.
    callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`):
        List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
        in [here](https://huggingface.co/docs/transformers/main_classes/callback).

        If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
        method.
    optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
        A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
        model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
    peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
        PEFT configuration used to wrap the model. If `None`, the model is not wrapped.

    c
                   > Uc
  [        5       n[        USS5      n[        U5      [        La  Sn[        USS5      n[        U5      [        La  SnSn[        R
                  R                  SS5      S:H  nU(       d1  [        R
                  R                  SS5      S:X  a  [        S5        S	n[        R
                  R                  S
S5      n[        UR                  SS 5      =(       d    [        UR                  SS 5      nUc  UR                  5       R                  nSSKJn  U" U5      nU[        R                  :H  nU(       d  U(       a  U(       a  [        S5      eU(       d  U(       d  U(       a  [        S5      eU(       a"  SUl        SUl        S[        R
                  S'   OCU(       d<  U(       d5  US:X  a/  UUl        U(       + Ul        U(       a  SOS[        R
                  S'   [        USS 5      b-  [        USS5      S:X  a  SUl        [        USS 5      c  SUl        [        USS 5      nUb/  US:  a)  SSKJn  [-        U5      [-        S5      ::  a  [        S5        [        USS5      S:w  aL  [        USS5      nUS:X  a!  UR.                  U:  a  UR.                  Ul        [        US S 5      c
  Ub  UUl        [        US!S5      n[        U5      [        La  Sn[        US"S5      n[        U5      [        La  SnUR                   (       a  U(       a  SUl        S	Ul        UR"                  (       a  U(       a  S	Ul        SUl        U(       a  SUl        SUl        Oc[        R
                  R                  S
S5      S#:X  a  S	Ul        SUl        O0U(       d)  U(       d"  UR"                  Ul        UR                   Ul        Sn[9        5       R                  S$S 5      b  S	n[9        5       R                  S%S 5      b  S	nU(       a  S[        R
                  S&'   S'[9        5       ;  a  [;        US'5      (       d  OD[        US'S 5      n[        US'S 5      nUc'  Ub$  UR<                  n[;        US'5      (       a  UUl        Ub!  [;        US(5      (       a  UR?                  5         S)[9        5       ;   a   [;        [@        S*5      (       a  S+[@        l!        S,[9        5       ;   aU  [;        US*5      (       a  S+Ul!        [;        US)5      (       a,  [;        UR@                  S*5      (       a  S+UR@                  l!        / n[E        U[F        5      (       d  U/nOUnU H<  n URH                  n URK                  S-U S.35         URK                  S-U S/35        M>     SS0K&J'n   U " S1U5        [        US2S 5      [P        RR                  :X  a(  URT                  S:  a  [        US3S5      S:w  a  SUl+        S4[9        5       ;   a!  [;        US(5      (       a  UR?                  5         [X        T#U ]  " S;UUUUUUUUU	S5.	U
D6  S4[9        5       ;   a!  [;        US65      (       a  UR]                  5         [;        U S75      (       a-  U R^                  Ra                  5         [;        U S75      (       a  U ?/[        US8S 5      b  U Rb                  UR                  5       l1         [;        U S95      (       aV  U Rd                  Rf                  n!Un"[;        U"S45      (       a&  U!U"l4        U"Rj                  n"[;        U"S45      (       a  M&  U!U"l4         [;        U S:5      (       a.  [m        [o        U Rp                  Rr                  5      U 5      U l9        g !    GM(  = f)<Nr  Fr[  UNSLOTH_ENABLE_FULL_FINETUNINGr]  r^  r\  zKUnsloth: Switching to float32 training since model cannot work with float16TUNSLOTH_MIXED_PRECISIONr   r  torch_dtyper   )
_get_dtypezuUnsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`zuUnsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`r  rZ  rj  r  r  r  r  r  r   )__version__z4.45.2z**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!
`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`r  r  r  r  r  ro  compute_metricspreprocess_logits_for_metricsUNSLOTH_RETURN_LOGITSmax_seq_lengthre   r  rj  rg  rr  r  r  r  )PatchRLStatisticsgrpo_trainerparallel_mode_n_gpurd   )	rd   rh  ri   ri  rj  rr  rk  rl  rn  rf   neftune_hook_handler'  rt  r  rM  ):r  r  ry  r  rC   rm  r   rX  r  get_input_embeddingsr  unsloth_zoo.utilsr  rU   rn  r  r[  r  r  r  rV   r  r]   r  r  r  r  r  localsrg   r  re   r  rj  r|  r  rN  r   unsloth_zoo.logging_utilsr  r`   NOT_DISTRIBUTEDn_gpur  r\  r]  rf   r  remover'  rt  r>  accelerator_scalerrd   ra   rs   r^  r  )$rh   rd   rh  ri   ri  rj  rr  rk  rl  rn  rj   use_bf16use_fp16force_float32full_finetuningmixed_precision_dtyper  r  rn  ga_stepstransformers_versioneval_bszr  r  _output_logitsmodel_max_seq_lengthargs_max_seq_lengthr  other_metrics_reward_funcsr  rz  r  r>  current_modelr^  s$                                      rm   r]  UnslothGRPOTrainer.__init__  s    < 1 34/>%%x4/>%%x**..)I3OSVVBJJNN3JC$PTW$W_` M "

/H) Tgt4bm]a8b=%"<"<">"D"D%05!5==('hy  JA  @B  :Bg(9  NE  DF  >FDIDI7;BJJ3481F)1SDI#DIAHvfBJJ344.:wt_^b?cgk?k!(Dt\408C$/4!>EHqLH+,0AA @ A4$/47t%A1EH1}!A!AH!Lpt  qQ  qQdNmt6=E(J^  @H`d`| '7?t+e^ '7?t+e^99u)<\`dFY99t)<[`TEX"'D"'DZZ^^5yAZO"&D"'D"&))D"&))D8<<)40<tn8<<7>J]aN25BJJ./68+GDBR4S4S#*52BD#I #*42BD#I"*/C/O!&!5!54!122.D4G!?!? &("wy.'I'Idk9Ka)'88Za:J:W'55'BRB\B\^l:m:m  Zao  pJ  pJ  pW,--~}*m(K#.#7#7 !((84D3EU)KL!((84D3ET)JK ) 	@.-8 4$/<3O3OOTXT^T^abTbtXq)Q.fh75.#A#A  		0')'/(A!%		0 )/		0 fh75/#B#B!4.//$$++-t2339Q4.5A?C?W?WE&&(<4''%%,,F!M-11390 - 3 3 -11 06M,4!!#$=dnn>R>R$SUYZDJQ Ds   8];;^)r  )NNNNNNN)rN  rO  rP  rQ  r`  r]  rS  rc  rd  s   @rm   r  r  T  s-    aL $(L Lrp   r  	addFilterc                        \ rS rSrS rS rSrg)HideLoggingMessageiJ  c                     Xl         g rE  re  )rh   re  s     rm   r]  HideLoggingMessage.__init__K  s    d)rp   c                 <    U R                   UR                  5       ;  $ rE  )re  
getMessage)rh   r   s     rm   filterHideLoggingMessage.filterL  s    alln)DErp   r  N)rN  rO  rP  rQ  r]  r  rS  rM  rp   rm   r  r  J  s    2Erp   r  z`use_cache=True`)r{   )r`  rU   r   torch.nnrA   r   Ftypingr   r   r   r   r	   r
   r   r   trl.trainer.grpo_trainerr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rV   rW   rX   rY   dataclassesr[   r\   packaging.versionr]   numpyrj  
contextlibr^   r_   +TransformersDataCollatorForLanguageModelingtransformers.training_argsr`   rq   typesra   rs   r3  r2  r   rb  r   r   r   r  autogradFunctionr
  r  r  r  r  rf  r  rg   Filterr  r  rM  rp   rm   <module>r     s  0    $ I I I i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i  i 
  ( %   " $  3      4;PR S"||  \\	&,, %  	
 \\6ell C ELL O,bJ	5>>22 J	j y	v 4;PQO, RO,b
 y	5
 y	5 y	5v XI' XIr8o, ob  6;FW^^ F 	
'(:;<  rp   