
    h                       S SK r S SKrS SKrS SKrS SKrS SKJrJr  S SKJ	r	  S SK
Jr  S SKJr  S SKJrJrJrJr  S SKrS SKrS SKrS SKrS SKJr  S SKJrJrJrJrJr  S S	KJrJ r   S S
KJ!r!  S SK"J#r$  S SKJ%r%J&r&  S SKJ'r'J(r(J)r)J*r*J+r+J,r,J-r-J.r.J/r/J0r0J1r1  S SK2J3r3  S SK4J5r5J6r6J7r7J8r8  SSK9J:r:J;r;J<r<J=r=  SSK>J?r?J@r@  SSKAJBrB  SSKCJDrDJErE  SSKFJGrGJHrHJIrIJJrJ  SSKKJLrL  SSKMJNrN  SSKOJPrP  SSKQJRrRJSrSJTrTJUrUJVrVJWrWJXrXJYrYJZrZJ[r[J\r\J]r]J^r^J_r_J`r`JaraJbrb  \7" 5       (       a  S SKcJdrdJere  \D" 5       (       a  S SKfJgrg  \E" 5       (       a  S SKhJiriJjrj  S SKkJlrl  \1" 5       (       a  S SKmrm\R                  " \o5      rp\\q\,\\r\r/\r\s   4   4   rt " S S \/5      rug)!    N)defaultdictdeque)nullcontext)partial)Path)AnyCallableOptionalUnion)logging)broadcast_object_listgathergather_objectis_peft_modelset_seed)DatasetIterableDataset)nn)FullyShardedDataParallel)
DataLoaderSampler)
AutoConfig"AutoModelForSequenceClassificationAutoProcessorAutoTokenizerGenerationConfigPreTrainedModelPreTrainedTokenizerBaseProcessorMixinTrainerTrainerCallbackis_wandb_available)seed_worker)is_datasets_availableis_flash_attn_2_availableis_peft_availableis_rich_available   )apply_chat_templateis_conversationalmaybe_apply_chat_templateprepare_multimodal_messages)profiling_contextprofiling_decorator)
VLLMClient)is_liger_kernel_availableis_vllm_available)prepare_deepspeedprepare_fsdpprepare_peft_modelunwrap_model_for_generation)_ForwardRedirection   )SyncRefModelCallback)
GRPOConfig)RepeatSamplerdisable_dropout_in_modelentropy_from_logitsgenerate_model_cardget_comet_experiment_urlidentitynanmaxnanminnanstdpadprint_prompt_completions_sampleselective_log_softmaxshuffle_sequence_dictsplit_pixel_values_by_gridsplit_tensor_dicttruncate_with_protected_tokensunsplit_pixel_values_by_grid)
PeftConfig	PeftModel)LigerFusedLinearGRPOLoss)LLMSamplingParams)GuidedDecodingParamsc                     ^  \ rS rSrSrSS/r        S8S\\\4   S\\	\
\	   4   S\\   S	\\\\4      S
\\\\\\\\\4   4   4      S\\\\4      S\\\\
\   4      S\\
\      S\\\R*                  R,                     \\R*                  R.                  R0                     4   S\S   4U 4S jjjrS rS rS9S\\   S\4S jjrS\4S jr\    S:S j5       r S\RB                  S\RB                  S\"S\RB                  4S jr#\      S;S\\\\RB                     4   4S jj5       r$S9S\\
\      4S jjr%S<S \&RN                  S!\4S" jjr(S \&RN                  4S# jr)\S$ 5       r*\S%\\\\RB                  \+4   4   S\\\\RB                  \+4   4   4S& j5       r,\U 4S' j5       r-S(\
\\\\RB                  \+4   4      S\\\\RB                  \+4   4   4U 4S) jjr.S* r/\S=S+ j5       r0S, r1S9S-\\
\      4S. jjr2S9S/\\\"4   S0\\"   SS4U 4S1 jjjr3U 4S2 jr4   S>S3\\   S4\\   S5\\\
\   S4   4S6 jjr5S7r6U =r7$ )?GRPOTrainerf   a  
Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the
paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language
Models](https://huggingface.co/papers/2402.03300).

Example:

```python
from datasets import load_dataset
from trl import GRPOTrainer

dataset = load_dataset("trl-lib/tldr", split="train")


def reward_func(completions, **kwargs):
    # Dummy reward function that rewards completions with more unique letters.
    return [float(len(set(completion))) for completion in completions]


trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",
    reward_funcs=reward_func,
    train_dataset=dataset,
)

trainer.train()
```

Args:
    model (`Union[str, PreTrainedModel]`):
        Model to be trained. Can be either:

        - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
          path to a *directory* containing model weights saved using
          [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
          using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
          `args.model_init_kwargs`.
        - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
    reward_funcs (`Union[RewardFunc, list[RewardFunc]]`):
        Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
        functions with the prompts and completions and sum the rewards. Can be either:

        - A single reward function, such as:
            - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
            path to a *directory* containing model weights saved using
            [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
            using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the
            keyword arguments in `args.model_init_kwargs`.
            - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
            - A custom reward function: The function is provided with the prompts and the generated completions,
              plus any additional columns in the dataset. It should return a list of rewards. Custom reward
              functions can also return `None` when the reward is not applicable to those samples. This is useful
              for multi-task training where different reward functions apply to different types of samples. When a
              reward function returns `None` for a sample, that reward function is excluded from the reward
              calculation for that sample. For more details, see [Using a custom reward
              function](#using-a-custom-reward-function).

              The trainer's state is also passed to the reward function. The trainer's state is an instance of
              [`~transformers.TrainerState`] and can be accessed by accessing the `trainer_state` argument to the
              reward function's signature.
        - A list of reward functions, where each item can independently be any of the above types. Mixing different
        types within the list (e.g., a string model ID and a custom reward function) is allowed.
    args ([`GRPOConfig`], *optional*, defaults to `None`):
        Configuration for this trainer. If `None`, a default configuration is used.
    train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
        Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset is
        ignored. The format of the samples can be either:

        - [Standard](dataset_formats#standard): Each sample contains plain text.
        - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
          and content).
    eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
        Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`] or `None`, *optional*, defaults to `None`):
        Processing class used to process the data. The padding side must be set to "left". If `None`, the
        processing class is loaded from the model's name with [`~transformers.AutoProcessor.from_pretrained`]. A
        padding token, `tokenizer.pad_token`, must be set. If the processing class has not set a padding token,
        `tokenizer.eos_token` will be used as the default.
    reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
        Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:

        - A single processing class: Used when `reward_funcs` contains only one reward function.
        - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
        If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is
        `None`, the tokenizer for the model is automatically loaded using
        [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward
        functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes`
        are ignored.
    callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`):
        List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
        in [here](https://huggingface.co/docs/transformers/main_classes/callback).

        If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
        method.
    optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
        A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
        model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
    peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
        PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
trlgrpoNmodelreward_funcsargstrain_dataseteval_datasetprocessing_classreward_processing_classes	callbacks
optimizerspeft_configrK   c                 $  >^ TcO  [        U[        5      (       a  UOUR                  R                  nUR	                  S5      S   n[        U S35      mTR                  =(       d    0 n[        U[        5      (       a  UnUR                  S5      n[        U[        R                  5      (       d	  US:X  d  Uc  O9[        U[        5      (       a  [        [        U5      nXS'   O[        SU S35      e[        R                  " U5      n[        [        UR                  S   5      nUR                  " U40 UD6nO8UR                  R                  nTR                  b  [         R#                  S	5        [%        US
5      (       d8  [&        R(                  " UR*                  5      R,                  R/                  5       OE[&        R(                  " UR1                  5       R*                  5      R,                  R/                  5       U l        U
c$  [5        5       (       a!  [        U[6        5      (       a  [9        XT5      nUc*  [:        R                  " UR                  R                  5      n[        U[<        5      (       a  UR>                  nO#[        U[@        5      (       a  UnO[C        S5      eURD                  c  URF                  Ul"        URD                  U l"        URH                  U l$        URJ                  U l%        [        USS 5      U l&        [        USS 5      U l'        [        UR                  SS 5      U l(        [        UR                  SS 5      U l)        [        U[T        5      (       d  U/n/ U l+        [Y        U5       H  u  nn[        U[        5      (       a  [Z        R                  " U4SS0UD6UU'   [        UU   [\        R^                  5      (       aF  U RV                  Ra                  UU   R                  R                  R	                  S5      S   5        M  U RV                  Ra                  UU   Rb                  5        M     X l2        TRf                  b  [i        TRf                  5      [i        U5      :w  a.  [        S[i        TRf                  5       S[i        U5       S35      e[        Rj                  " TRf                  [        Rl                  S9U l3        O1[        Rn                  " [i        U5      [        Rl                  S9U l3        Uc  S /[i        U5      -  nO[        U[T        5      (       d  U/n[i        U5      [i        U5      :w  a$  [        S[i        U5       S[i        U5       S35      e[Y        [q        Xr5      5       H  u  nu  nn[        U[r        5      (       d  M   Uc*  [t        R                  " UR                  R                  5      nURH                  c  URF                  Ul"        URH                  UR                  l$        UUU'   M     Xpl;        TRx                  U l<        TRz                  U l=        TR|                  U l>        TR~                  U l?        TR                  U l@        TR                  U lA        TR                  U lB        TR                  U lC        TR                  U lD        TR                  U lE        TR                  U lF        TR                  U lG        TR                  U lH        TR                  U lI        TR                  U lJ        TR                  U lK        TR                  U lL        TR                  U lM        TR                  U lN        TR                  U lO        TR                  U lP        U R                  (       a  U R                  S:  a  [        S5      eU R                  (       a  U R                  S:X  d  [        S5      eTR                  U lR        [        U[        5      (       dO  [        U[        5      (       d:  [        U[        5      (       a0  [        S UR                  5        5       5      (       a  [        S5      eTR                  U lW        TR                  U lY        TR                  b  TR                  OTR                  U lZ        SU l[        S U l\        SUR                  S '   [        TU G]}  UT[        UUUUU	S!S"9	  TR                  U la        U R                  S#:X  a  S U lb        Oc[        U5      (       a  S U lb        OK[        R                  " U5      n[        [        UR                  S   5      nUR                  " U40 UD6U lb        TR                  (       a-  [        U5        U R                  b  [        U R                  5        U R                  (       a  [        5       (       d  [        S$5      e[        5       U li        [        U R                  U R                  U R                  U R~                  U R                  S#:g  U R                  U Rz                  S%9U lk        [        [T        5      [        [T        5      S&.U lm        SU ln        TR                  U lo        TR                  U lp        TR                  U lq        [        TR                  S'9[        TR                  S'9[        TR                  S'9[        U4S( j5      [        TR                  S'9S).U lt        [        TR                  SS*9  U R                  (       Ga`  [        5       (       d  [        S+5      eU R                  S,:X  a  U R                  R                  (       a  TR                  b  TR                  nOS-TR                   S.TR                   3n[        UTR                  S/9U l        U R                  GR                  [        GR                  GR                  5       S09  GO]U R                  S1:X  Ga3  U R                  GR                  U R                  -  S:X  d1  [        S2U R                   S3U R                  GR                   S435      eU R                  S:  a  [        GR                  GR                  G[        U R                  GR                  U R                  -  5       Vs/ s H6  n[U        G[        UU R                  -  US-   U R                  -  5      5      PM8     sn5      u  U l        n[        U R                  GR                  5      G[        GR                  S5'   [        U R                  GR                  5      G[        GR                  S6'   [        U R                  GR                  5      G[        GR                  S7'   G[        GR                  R                  S8S95      G[        GR                  S8'   G[        GR                  R                  S:S;5      G[        GR                  S:'   U Rx                  b'  U Rz                  b  U Rx                  U Rz                  -   nOS nG[        UGR                  TR                  U R                  U GR                  GR                  U R                  -  U GR                  GR                   -  US<U R                  GR                  U R                  -  S=U GR                  GR"                  U GR                  GR$                  S>9
U l        U GR                  GR$                  (       a  U GR&                  GR)                  SS?9  O[        S@U R                   SA35      eTGR*                  U l        SU l        U R                  GR1                  5         OU Rz                  SURH                  UGR2                  URJ                  U R~                  U R                  U R                  U R                  U R                  TGR4                  SB.nTR                  (       a  SCUSD'   SEUSF'   SGUSH'   TGR6                  b  UGR9                  TGR6                  5        G[;        SM0 UD6U l        SIU l        U GR@                  GRC                  U GRD                  5        U R                  b  U GRF                  (       a'  G[I        U R                  U R                  5      U lb        OcU GRJ                  (       a'  G[M        U R                  U R                  5      U lb        O*U R                  GRO                  U R                  SSJ9U lb        TGRP                  (       a/  U GRS                  G[U        U R                  U R                  SK95        [Y        U Rd                  5       H~  u  nn[        U[r        5      (       d  M  U GRF                  (       a&  G[I        UU R                  5      U Rd                  U'   MU  U R                  GRO                  USSSL9U Rd                  U'   M     g s  snf )NN/z-GRPOdtypeautozInvalid `dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing a `torch.dtype` (e.g., 'float32'), but got .r   zYou passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. The `model_init_kwargs` will be ignored.get_base_modelzWThe `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`image_tokenimage_token_idvision_start_token_idvision_end_token_id
num_labelsr7   zNumber of reward weights (z)) must match number of reward functions ()rc   z)The number of reward processing classes (z-) must match the number of reward functions (z).      ?zOLiger Kernels don't currently support masking token positions based on entropy.tokenzwLiger Kernels currently only support token-level importance sampling. Please set`importance_sampling_level` to 'token'.c              3   B   #    U  H  n[        U[        5      v   M     g 7fN)
isinstancer   ).0dss     R/home/james-whalen/.local/lib/python3.13/site-packages/trl/trainer/grpo_trainer.py	<genexpr>'GRPOTrainer.__init__.<locals>.<genexpr>x  s     6wav[]z"o7V7Vavs   z^Iterable datasets are not yet supported in GRPOTrainer. Please use a standard dataset instead.Testimate_tokensz!non-None value to disable scaling)	rV   rX   data_collatorrY   rZ   r[   r]   r^   compute_loss_func        zWLiger is required to use `liger_loss` as the GRPO loss. Run `pip install liger-kernel`.)betaepsilon_lowepsilon_hightemperatureuse_ref_model	loss_typemax_completion_length)trainevalmaxlenc                  *   > [        T R                  S9$ )Nr   )r   generation_batch_size)rX   s   ru   <lambda>&GRPOTrainer.__init__.<locals>.<lambda>  s    58R8R+S    )imageprompt
completionrewards
advantages)device_specificzkvLLM is not available and `use_vllm` is set to True. Please install vLLM with `pip install vllm` to use it.serverzhttp://:)base_urlconnection_timeoutdevicecolocatezvllm_tensor_parallel_size (z) must divide world size (z	) evenly.RANK
LOCAL_RANK
WORLD_SIZEMASTER_ADDR	localhostMASTER_PORT12345external_launcheri   )
rV   tensor_parallel_sizegpu_memory_utilizationmax_num_seqsmax_model_lendistributed_executor_backendseedmax_num_batched_tokens
model_implenable_sleep_modelevelz6vllm_mode must be either 'server' or 'colocate', got 'z'.)max_new_tokens	do_samplepad_token_idbos_token_ideos_token_idr   top_ptop_kmin_prepetition_penaltycache_implementationi   max_batch_tokensi   
num_blocks   
block_sizeF)evaluation_mode)	ref_modelaccelerator)r   device_placement )rr   strconfig_name_or_pathsplitr9   model_init_kwargsgettorchrc   getattr
ValueErrorr   from_pretrainedtransformersarchitecturesloggerwarninghasattrinspect	signatureforward
parameterskeysrf   model_kwarg_keysr&   rL   r4   r   r   	tokenizerr   	TypeError	pad_token	eos_tokenr   r   rg   rh   ri   rj   listreward_func_names	enumerater   r   Moduleappend__name__rW   reward_weightslentensorfloat32oneszipr   r   r\   max_prompt_lengthr   num_generationsr   r   r   r   r   use_transformers_pageduse_vllm	vllm_modevllm_gpu_memory_utilizationvllm_tensor_parallel_size#vllm_importance_sampling_correctionvllm_importance_sampling_capuse_liger_lossr   scale_rewardsimportance_sampling_levelmask_truncated_completionstop_entropy_quantileNotImplementedErrorshuffle_datasetr   dictanyvaluesnum_iterationsepsilonr}   r~   _step_buffered_inputswarnings_issuedsuper__init__r?   r|   r   r   disable_dropoutr;   r0   ImportErrorr6   _forward_redirectionrM   liger_grpo_lossr   _metrics_total_train_tokenslog_completionswandb_log_unique_promptsnum_completions_to_printr   r   _logsr   r   r1   r   is_main_processvllm_server_base_urlvllm_server_hostvllm_server_portr/   vllm_server_timeoutvllm_clientinit_communicatorcudacurrent_devicenum_processesdistributednew_subgroups_by_enumerationrangetp_groupprocess_indexosenvironlocal_process_indexrN   name_or_pathrX   per_device_train_batch_sizesteps_per_generationvllm_model_implvllm_enable_sleep_modellmsleepvllm_guided_decoding_regexguided_decoding_regex_last_loaded_stepwait_for_everyoner   r   generation_kwargsupdater   generation_configmodel_accepts_loss_kwargsrV   add_model_tags
_tag_namesis_deepspeed_enabledr2   is_fsdp_enabledr3   prepare_modelsync_ref_modeladd_callbackr8   )selfrV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   
model_namer   model_idrc   r   architecturer   ireward_funcreward_processing_classr   _r   r%  	__class__s      `                     ru   r   GRPOTrainer.__init__   s    <",UC"8"8ell>X>XJ#))#.r2JE23D !228beS!!H%))'2E%--&EME3''u--2'* BBGK 
  //9F"<1E1Ea1HIL 00O=NOE||11H%%1? 5"233 emm,77<<>""5#7#7#9#A#ABMMRRT 	 "'8':':z%QZ?[?[&u4@E #,<<U\\=W=WX &77(22I(*ABB(Iuvv&"+"5"5I",,%22%22"#3]DI%&68H$O%,U\\;RTX%Y"#*5<<9NPT#U  ,--(>L!#'5NA{+s++"D"T"T#,-#1B#Q ,q/29955&&--l1o.D.D.R.R.X.XY\.]^`.ab&&--l1o.F.FG 6 ) *4&&'3|+<< 0T5H5H1I0J K""%l"3!4A7  #(,,t/B/B%--"XD"'**S->emm"TD %,)-\1B(B%5t<<)B(C%()S->>;C@Y<Z;[ \%%(%6$7r; 
 :C3G`Co9p5A5'+77*2.;.K.KKL^L^LlLl.m+*77?8O8Y8Y+5 3J2V2V""//F)!, :q *C& "&!7!7%)%?%?"#33++ZZ
ZZ
ZZ
"&"9"9&*&A&A#+/+K+K()-)G)G&373[3[0,0,M,M)"11!//)-)G)G&*.*I*I'$($=$=!4#<#<s#B%a  t'E'E'P%:   $33 }o66,88<..36wamatatav6w3w3w &p 
 #11<<151B1B1ND--TXT`T`
 !% 48/0"'%-! B 	 	
$ II	99!DN5!! "DN  //9F"<1E1Ea1HIL)99(XFWXDN $U+~~)(8 ,..!m  )<(=D%#;YY ,,!.. ,,"ii3...&*&@&@$D  #.d"3[=NO#$ #33(,(E(E%(,(E(E% $"<"<=4#=#=>t'A'AB"#STt'A'AB

 	D1===$&&!4 
 ~~)##3300<#'#<#<%,T-B-B,C1TEZEZD[#\'18X\XpXp'qD$$$66ejj>W>W>Y6Z:- ''558V8VVZ[[$5d6T6T5U V ,,::;9F 
 11A5 (-'8'8'U'U &+4+;+;+I+ITMkMk+k%l%l !q4+I+I'IAPQEUYUsUsKs!tu%l($DM1 &))9)9)G)G%H

6"+.t/?/?/S/S+T

<(+.t/?/?/M/M+N

<(,.JJNN=+,V

=),.JJNN=',R

=)))5$:T:T:`$($:$:T=W=W$WM$(M,,)-)G)G+/+K+K!%!F!F44"5ii44"5 #01D))774;Y;YY+/#yy88&*ii&F&F  99333HHNNN+ #YZ^ZhZhYiik!lmm *.)H)HD&%'D"
 ..0 #'"<"<! ) 6 6 ) 6 6 ) 6 6#//&*&=&=(,(A(A! **8;!"4526!,/25!,/%%1!(()?)?@%5%J8I%JD"
 */& 	

!!$//2>>%(((!24>>4CSCS!T%%%!-dnnd>N>N!O!%!1!1!?!?`d!?!e2T^^Y]YiYijk'(9(9:NA{+77,,,+<[$JZJZ+[D%%a( ,0+;+;+I+I#TD ,J ,D%%a( ;qs   ;=AH	c                 2    U R                   c
  SS/U l         g g )Nr   r   )_signature_columns)r0  s    ru    _set_signature_columns_if_needed,GRPOTrainer._set_signature_columns_if_needed^  s"    
 ""*'/&9D# +r   c                 x   U R                   c  [        S5      eU R                   nU R                  n[        5       (       a0  [	        U[
        R                  5      (       a  U R                  USS9nOU R                  USS9nU R                  U R                  R                  -  UU R                  R                  U R                  R                  U R                  R                  S.n[	        U[        R                   R"                  R$                  5      (       d  U R'                  5       US'   U R                  R(                  US'   [+        [,        U R                  R                  U R                  R.                  S9US'   U R                  R0                  US	'   U R2                  R5                  [7        U40 UD65      $ )
Nz+Trainer: training requires a train_dataset.training)description)
batch_size
collate_fnnum_workers
pin_memorypersistent_workerssampler	drop_last)rC  rankworker_init_fnprefetch_factor)rY   r   ry   r$   rr   datasetsr   _remove_unused_columns"_get_collator_with_removed_columns_train_batch_sizerX   r  dataloader_num_workersdataloader_pin_memorydataloader_persistent_workersr   utilsdatar   _get_train_samplerdataloader_drop_lastr   r#   r  dataloader_prefetch_factorr   preparer   )r0  rY   ry   dataloader_paramss       ru   get_train_dataloader GRPOTrainer.get_train_dataloaderp  st   %JKK**** ""z-AQAQ'R'R 77S]7^M CCM_iCjM 004993Q3QQ'99;;))99"&))"I"I
 -)9)9)I)IJJ+/+B+B+Di(-1YY-K-Kk*29)I)IPTPYPYPgPg3./ 48993W3W/0''
=(VDU(VWWr   datasetreturnc           	         Uc  U R                   n[        UU R                  U R                  R                  U R                  -  U R
                  U R                  R                  -  U R                  U R                  R                  S9$ )N)data_sourcemini_repeat_countrA  repeat_countshuffler   )	rY   r:   r   rX   r   r   r  r   r   )r0  r[  s     ru   rT  GRPOTrainer._get_train_sampler  sq    2 ?((G"22yy66$:N:NN,,tyy/M/MM((
 	
r   c                 T    [        UU R                  U R                  R                  S9$ )N)r^  r_  r   )r:   r   rX   r   )r0  rZ   s     ru   _get_eval_samplerGRPOTrainer._get_eval_sampler  s&    $"22
 	
r   c	                 D   [        U5      (       a  UR                  R                  nX#S.n	Ub  Ub  XiS'   Ub  XYS'   Ub  XyS'   Ub  XS'   SU R                  ;   a  US-   U	S'   SU	S	'   UR                  " S0 U	D6R                  n
U
S S 2S S
2S S 24   n
U
S S 2U* S 2S S 24   n
U
$ )N	input_idsattention_maskimage_grid_thwpixel_valuespixel_attention_maskimage_sizeslogits_to_keepr7   F	use_cacherb   r   )r   
base_modelrV   r   last_hidden_state)r0  unwrapped_modelrh  ri  rn  rk  rj  rl  rm  model_inputsrq  s              ru   _get_last_hidden_state"GRPOTrainer._get_last_hidden_state  s     ))-88>>O &/Q %,*B-;)*#+7(+3G/0"*5' t444-;a-?L)*$)[!+11ALASS-a"ai8-a.1A1.DE  r   	entropiesmask	thresholdc                 j   XR                  5          R                  5       nUR                  5       S:X  a#  [        R                  " U[        R                   S9$ [        R
                  " UR                  5       /UR                  S9nU R                  R                  U5      R                  5       R                  5       n[        R                  " XdR                  5       -
  UR                  S9n[        R                  " XG/5      n[        R                  " [        R                  " U5      U/5      n	U R                  R                  U5      n
U R                  R                  U	5      nXR                  5          n[        R                  " X5      nXR                  5       -  nX:  nXR                  5       -  $ )ah  
Returns a binary mask identifying tokens whose entropy exceeds a given quantile threshold.

Args:
    entropies (`torch.Tensor`):
        Tensor of shape (batch_size, seq_len) with per-token entropy values.
    mask (`torch.Tensor`):
        Binary mask of the same shape as `entropies`, where `1` indicates valid tokens and `0` padding.
    threshold (`float`):
        Quantile threshold between `0.0` and `1.0` to select high-entropy tokens.

Returns:
    `torch.Tensor`:
        Boolean mask of shape (batch_size, seq_len), where `True` indicates tokens with entropy >= threshold
        and `False` otherwise.
r   rm   r   )boolfloatnumelr   
zeros_liker   r   r   r   maxitemzeroscat	ones_likequantile)r0  rv  rw  rx  non_pad_entropiesnon_pad_entropies_seq_length max_non_pad_entropies_seq_lengthpaddingpadded_entropiespadded_entropies_maskall_padded_entropiesall_padded_entropies_maskall_non_padded_entropiesentropy_thresholdmasked_entropiesentropy_masks                   ru   get_high_entropy_mask!GRPOTrainer.get_high_entropy_mask  sf   " &iik288:""$)##IUZZ@@
 (-||5F5L5L5N4OXaXhXh'i$+/+;+;+B+BC_+`+d+d+f+k+k+m(++,/F/F/HHQbQiQi
 !99&7%AB %		5??;L+Mw*W X#//667GH$($4$4$;$;<Q$R!#78V8V8X#Y !NN+CO$zz|3'<iik))r   c                    U=(       d    UR                  S5      n/ n/ n[        SUR                  S5      U5       GHz  nX-X-    nX=X-    nXS.nUbw  Ubt  XX-    US'   USU R                  S5      R                  5       R	                  5       nUSX-    R                  S5      R                  5       R	                  5       nUUU US'   OUb	  X}X-    US'   U	b	  XX-    US'   U
b	  XX-    US'   S	U R
                  ;   a  US
-   US	'   SUS'   U" S0 UD6R                  nUSS2SS2SS24   nUSS2U* S2SS24   nUU R                  -  nUSS2U* S24   n[        UU5      nUR                  U5        U(       d  GM@  [        R                  " 5          [        U5      nSSS5        UR                  W5        GM}     [        R                  " USS9nU(       a  [        R                  " USS9OSnUU4$ ! , (       d  f       N[= f)z<Compute log-probs and (optionally) entropies for each token.r   rg  Nrj  rb   rk  rl  rm  rn  r7   Fro  dimr   )sizer  prodsumr  r   logitsr   rE   r   r   no_gradr<   r  )r0  rV   rh  ri  rn  rA  compute_entropyrk  rj  rl  rm  	all_logpsall_entropiesstartinput_ids_batchattention_mask_batchrs  start_pixel_idxend_pixel_idxr  completion_idslogpsrv  s                          ru   "_get_per_token_logps_and_entropies.GRPOTrainer._get_per_token_logps_and_entropies  s>     49>>!#4
	1innQ/<E'0BCO#1%:L#M  *9aL)l.F1?HZ1[-."0%"8"="=b"A"E"E"G"L"L"N ./C1C D I I" M Q Q S X X Z/;OM/Z^,)/;EDV/W^,#/7KTYTf7g34&.9%BT.U]+  4#8#881?!1C-.(-L%*\*11FAssAI&FA/23F d...F,Q0@-@AN)&.AEU#]]_ 3F ;I %$$Y/U =X 		)+7FEIIm3D	i %_s   G11
G?	extra_prefixesc                 ^    U=(       d    / nS/U-   nU H  nUR                  US5      nM     U$ )Nz_checkpoint_wrapped_module. )replace)r0  namer  prefixesprefixs        ru   _fix_param_name_to_vllm#GRPOTrainer._fix_param_name_to_vllmQ  s8    '-212^CF<<+D r   moduler  c                 @   Uc
  [        5       nUR                  5        H%  u  pEU(       a  U SU 3OUnU R                  XVUS9  M'     [        U[        5      (       Ga1  [        R
                  " USSS9   UR                  5        H  u  pxU(       a  U SU 3OUn	U R                  U	S/S9n	X;   a  M-  UR                  U	5        U R                  S:X  aB  U R                  R                  (       a'  U R                  R                  XR                  5        M  U R                  S	:X  d  M  U R                  R                   R"                  R$                  R&                  R(                  n
U
R+                  XR                  4/5        M     SSS5        gg! , (       d  f       g= f)
zdMemory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM.Nre   )r  visitedF)recurse	writebackz_fsdp_wrapped_module.r  r   r   )setnamed_children_sync_fsdp1_params_to_vllmrr   FSDPsummon_full_paramsnamed_parametersr  addr   r   r  r  update_named_paramrS  r  
llm_enginemodel_executordriver_workermodel_runnerrV   load_weights)r0  r  r  r  
child_namechild_modulechild_prefix
param_nameparam	full_name	llm_models              ru   r  &GRPOTrainer._sync_fsdp1_params_to_vllmX  s_    ?eG(.(=(=(?$J7=fXQzl3:L++7 ,  )@ fd##((%P)/)@)@)B%J<B6(!J< 8
I $ < <YXoWp < qI + KK	*~~1d6F6F6V6V((;;IzzR:5$(HH$7$7$F$F$T$T$a$a$g$g	!..JJ0G/HI *C QP $PPs   3B/F&AF
Fc                 4   UR                  5       R                  5        H  u  p#UR                  (       a%  UR                  [        R
                  " S5      5      nUR                  5       nU R                  S:X  a8  U R                  R                  (       a  U R                  R                  X#5        M  U R                  S:X  d  M  U R                  R                  R                  R                  R                   R"                  nUR%                  X#4/5        M     g )Nr  r   r   )
state_dictitemsis_cputor   r   full_tensorr   r   r  r  r  r  r  r  r  r  rV   r  )r0  r  r  r  r  s        ru   _sync_fsdp2_params_to_vllm&GRPOTrainer._sync_fsdp2_params_to_vllms  s    !,,.446KD||f!56%%'E~~)d.>.>.N.N  33D@:- HH//>>LLYY__	&&7 7r   c                 	   U R                   R                  R                  nUS L=(       a    UR                  S:H  nU(       a  SS KnUR
                  R                  nO[        n[        U R                  5      (       Ga0  U" [        U R                  R                  5       5      5         U R                  R                  5         U R                  (       a}  [        U R                   R                  SS 5      nU(       a  [        USS5      OSnUS:X  a  U R                  U R                  5        GO]US:X  a  U R!                  U R                  5        GO:U R                  R#                  5        GH  u  pxUR%                  S5      R'                  SS	5      nU R                  R(                  U;   a  MC  S
U;   a  MK  U R+                  US/S9nU R,                  S:X  aB  U R                   R.                  (       a'  U R0                  R3                  XxR4                  5        M  U R,                  S:X  d  M  U R6                  R8                  R:                  R<                  R>                  R                  n	U	RA                  XxR4                  4/5        GM     U R                  RC                  5         S S S 5        GOU R                  (       a}  [        U R                   R                  SS 5      nU(       a  [        USS5      OSnUS:X  a  U R                  U R                  5        GO%US:X  a  U R!                  U R                  5        GOU R                  R#                  5        H  u  pxU R+                  U5      nU" U/5         U R,                  S:X  aA  U R                   R.                  (       a&  U R0                  R3                  XxR4                  5        OkU R,                  S:X  a[  U R6                  R8                  R:                  R<                  R>                  R                  n	U	RA                  XxR4                  4/5        S S S 5        M     U R,                  S:X  a6  U R                   R.                  (       a  U R0                  RE                  5         g U R,                  S:X  a  U R6                  RE                  5         g g ! , (       d  f       N= f! , (       d  f       GM{  = f)N   r   fsdp_pluginfsdp_versionr7   r(   zbase_model.model.z.base_layerr  original_modulezmodules_to_save.default.r  r   r   )#r   statedeepspeed_plugin
zero_stage	deepspeedzeroGatheredParametersr   r   rV   r   r   merge_adapterr,  r   r  r  r  removeprefixr  r  r  r   r  r  r  rS  r  r  r  r  r  r  unmerge_adapterreset_prefix_cache)
r0  r  zero_stage_3r  gather_if_zero3r  r  r  r  r  s
             ru   _move_model_to_vllmGRPOTrainer._move_model_to_vllm  s     ++11BB't3X8H8S8SWX8X'nn??O)O$$ !djj&;&;&=!>?

((* '' #*$*:*:*@*@-QU"VKNY7;#J_`L#q(77 JJ &*77

C (,zz'B'B'D#001DEMMm]_`::,,4$,4$#;;DRlQm;n>>X5$:J:J:Z:Z ,,??jjQ!^^z9(,(;(;(J(J(X(X(e(e(k(kI%22T::4F3GH (E  

**,C @?J ##%d&6&6&<&<mTRJUw{NAF[\1$33DJJ?!Q&33DJJ?#'::#>#>#@KD77=D(%1>>X5$:J:J:Z:Z ,,??jjQ!^^z9(,(;(;(J(J(X(X(e(e(k(kI%22T::4F3GH 21 $A >>X%$*:*:*J*J//1^^z)HH'') *s @?^ 21s!   'E?R0*A:R05B=S0
R>
S	generation_batchc                 ^   U R                   R                  (       a  SOSnUS:X  a  U R                  R                  U R                  -  nU R
                  U-  S:X  d  U R                  cg  U R                  U5      n[        U5      n[        U5      n[        XR                  R                  5      nU Vs/ s H  n[        U5      PM     snU l        U R                  U R
                  U R                  R                  -     nU =R
                  S-  sl        U$ U R                  U5      nU$ s  snf )Nr   r   r   r7   )rV   r?  rX   r  r   r   r   _generate_and_score_completionsrG   rF   rH   rJ   )r0  r  modegenerate_everygeneration_batchesbatchinputss          ru   _prepare_inputsGRPOTrainer._prepare_inputs  s	   " **--w67?!YY;;d>Q>QQNzzN*a/43H3H3P#'#G#GHX#Y #=>N#O #89I#J %67GIgIg%h"Zl(mZlQV)Ee)LZl(m%**4::		8V8V+VWFJJ!OJ
  99:JKF )ns   3D*c           
        > U R                   R                  n[        R                  " [	        U5      [	        U R
                  5      US9nUS    Vs/ s H  owS;  d  M
  UPM     nnU VV	s0 s H  owU V	s/ s H  oU   PM	     sn	_M     n
nn	U R                  U
S'   [        [        U R
                  U R                  U R                  5      5       GHk  u  nu  pn[        X5         [        U[        R                  5      (       a  [        US   5      (       aE  [        X#5       VVs/ s H  u  nnSUU-   0PM     nnnU Vs/ s H  n[!        UU5      S   PM     nnO#[        X#5       VVs/ s H  u  nnUU-   PM     nnnU" USSS	S
S9n["        TU ]I  U5      n[        R&                  " 5          U" S0 UD6R(                  S S 2S4   US S 2U4'   S S S 5        O[U" SX#US.U
D6nU Vs/ s H  nUb  UO[        R*                  PM     nn[        R,                  " U[        R.                  US9US S 2U4'   S S S 5        GMn     [        R0                  " U5      R3                  SS9R5                  5       (       a  [        R0                  " U5      R3                  SS9R7                  SS9S   S   nU
R9                  5        VVs0 s H  u  nnUS:w  d  M  UUU   _M     nnnUU   US'   UU   US'   [:        R=                  SU S35        [?        U5      nU$ s  snf s  sn	f s  sn	nf s  snnf s  snf s  snnf ! , (       d  f       GN= fs  snf ! , (       d  f       GM  = fs  snnf )Nr   r   )r   r   r  trainer_statemessagestextptTrightFr  return_tensorsr  padding_sideadd_special_tokens)promptscompletionsr  rc   r   r7   r  )as_tupler   r   z=All reward functions returned None for the following kwargs:
zH
Please ensure that at least one reward function returns a valid reward.r   ) r   r   r   r  r   rW   r  r   r   r\   r   r-   rr   r   r   r*   r)   r   r  inference_moder  nanr   r   isnanallr   nonzeror  r   r   r   )r0  r  r  r  completion_ids_listr   rewards_per_funckeyr   examplereward_kwargsr4  r5  r6  reward_func_namepcr  xtextsreward_inputsoutput_reward_funcrewardnan_row_idxvaluerow_reward_kwargsr8  s                             ru   _calculate_rewardsGRPOTrainer._calculate_rewards  so   !!(( ;;s7|S9J9J5KTZ[  &ayby7a,aybNRSds6B6s|6BBdS *.o&KT!!4#A#A4CYCYZL
GAG6F #4:k29955(33DGD]#^D]DAqZQ$7D]#^bj kbj]^!4Q8O!PQW!Xbj k36w3L M3L41aQ3L M$;"4T[pu%M %*G$;M$JM--/1<1M}1M1T1TUVXYUY1Z(A. 0/ *5 * 'Qd*hu*& ew)wdvZ`F4F&EII*Udv&)w-2\\:LTYTaTajp-q$QT*) ;:L
4 ;;'(,,,37799++&67;;;BJJTXJYZ[\]^_K:G:M:M:O!:OJCSVZiSi'U;'':O  ! +2+*>h'.9+.Fl+NNPQbPc dZ Z ""23_ cBS $_ k M
 0/ *x% ;:2!s   	LL+
L5LL%AM&L#
:ML)M+L.
=1M.!L4M'M-M?M
ML#M4
M>M
M	r  c                 >  > U R                   R                  nU R                  R                  (       a  SOSnU Vs/ s H  oDS   PM	     nn[        R
                  " U5      n0 nSUS   ;   nU(       a_  U V	s/ s H  oR                  S5      PM     n
n	SU
 Vs/ s H  o/PM     sn0nU H$  n[        U[        5      (       d  M  [        USS9  M&     U V	s/ s H  n	[        XR                  5      S   PM     nn	U R                  " S`US	S
SSS.UD6n[        TdU ]5  U5      nUS   US   nnU R                  Gb  U R                  U R                   U R"                  /nU Vs/ s H
  nUc  M  UPM     nn[%        UUU R                  U5      u  nnU R                  R'                  USSS9nU Vs/ s H=  n[(        R*                  " S[(        R,                  " U R.                  5       S3SU5      PM?     nnU R0                  Gb3  [(        R,                  " U R0                  5      n[(        R2                  " UU R                  R4                  5      (       a7  U Vs/ s H)  n[(        R*                  " SU S3U R0                  U5      PM+     nnOU R"                  bs  [(        R,                  " U R                  R6                  R9                  U R"                  /5      5      nU Vs/ s H!  n[(        R*                  " SU SU 3SU5      PM#     nnO,U Vs/ s H  n[(        R*                  " SU S3SU5      PM!     nnU R:                  (       Ga  U R<                  S:X  aS  U R>                  R@                  (       a8  [B        RD                  RG                  5         U RH                  RK                  5         U RL                  RN                  U RP                  :w  a+  U RS                  5         U RL                  RN                  U l(        U R<                  S:X  Ga  [U        U5      nU(       a  [U        W
5      nU R                   RV                  (       a  US S U RX                  2   nU(       a  WS S U RX                  2   nOS n[[        U S5         U R\                  R_                  UUU RX                  U R`                  U Rb                  U Rd                  U Rf                  c  SOU Rf                  U Rh                  c  SOU Rh                  U Rj                  U Rl                  U R>                  Rn                  S9nUS   US   4nS S S 5        OS nW/n[q        USS9  US   u  nn[s        U R                   Rt                  [w        U5      -  U R                   Rt                  S-   [w        U5      -  5      nUU   nUU   nGOU R<                  S:X  Ga  U Rl                  (       a  [y        U Rl                  S9n OS n SU R`                  U Rb                  U Rd                  U Rf                  c  SOU Rf                  U Rh                  c  SOU Rh                  U Rj                  U SS.	n!U R>                  Rn                  b%  U!R{                  U R>                  Rn                  5        [}        S`0 U!D6n"U R~                  S:  a  [w        U5      n#[        U R~                  5       V$s/ s H  n$S PM     n%n$[B        R                  R                  U%XR                  S 9  U% V&V's/ s H  n&U&  H  n'U'PM     M     nn&n'U(       al  [        U R~                  5       V$s/ s H  n$S PM     n(n$[B        R                  R                  U(W
U R                  S 9  U( V&Vs/ s H  n&U&  H  oPM     M     nn&nOS nOUnU(       a  W
OS nU(       aL  U(       aE  / n)[        UU5       H2  u  nn*U*b  U)R                  USU*0S!.5        M!  U)R                  U5        M4     OUn)[[        U S5         U RH                  R_                  U)U"SS"9n+S S S 5        W+ V,Vs/ s H#  n,U,R                    H  nUR                  PM     M%     nn,nU+ V,VV-s/ s H^  n,U,R                    HJ  nUR                   V-s/ s H/  n-[        [        U-R                  5       5      5      R                  PM1     sn-PML     M`     nnn,n-U R~                  S:  aF  [B        R                  R                  U R                  S 9n.[s        U.W#-  U.S-   U#-  5      n/UU/   nUU/   nU R>                  R@                  (       a  U RH                  R                  SS#9  W V0s/ s H  n0[B        R                  " U0US$9PM     nn0[        UU R                  S%9n[B        R                  " UU/SS&9n1W V2s/ s H'  n2[B        R                  " U2U[B        R                  S'9PM)     n3n2[        U3SS%9n3GOU R                  (       Ga  U R                  " S`S(U0UD6n4U R                  R                  R                  n5[        5       (       a  S)U R                  R                  lW        OS*U R                  R                  lW        [[        U S+5         [        U R                  U R                   U R>                  R                  S,9 n6[B        R                  " 5          U R                  (       a  [        R                  " U R                  SS-9O	[        5          U R>                  R                  (       a   U6R                  [B        R                  5        O:U R>                  R                  (       a  U6R                  [B        R                  5        [B        R                  " 5          U6R                  U4R                  U R                  SS.9n+S S S 5        S S S 5        S S S 5        S S S 5        S S S 5        W+R                  5        Vs/ s H  nUR                  PM     nnU V0s/ s H  n0[B        R                  " U0US$9PM     nn0[        UU R                  S/S09nU4R                   V0s/ s H  n0[B        R                  " U0US$9PM     nn0[        XR                  SS09n[B        R                  " UU/SS&9n1U5U R                  R                  lW        GO[[        U S15         [        U R                  U R                   U R>                  R                  S,9 n6[B        R                  " 5          U R                  (       a  [        R                  " U R                  SS-9O	[        5          UUsUS'   US'   U6R^                  " S`0 UDU R                  S
S2.D6n1S S S 5        S S S 5        S S S 5        S S S 5        UR                  S5      n7W1S S 2S U724   nU1S S 2U7S 24   nUU R                  :H  n8[B        R                  " U8R                  S5      4U8R                  S5      [B        R                  US39n9U8R                  5       R                  SS&9U8R                  SS&9   U9U8R                  SS&9'   [B        R                  " U8R                  S5      US$9R                  U8R                  S5      S5      n:U:U9R                  S5      :*  R                  5       n;[        UU;R                  5       5       V<V=s/ s H  u  n<n=U<U=   R                  5       PM     n>n<n=U;R                  S5      n?U R                   R                  U?5      n@U@R                  5       nAU R                  (       a3  U8R                  SS&9) nBU;UB) R                  S5      R                  5       -  n;[B        R                  " UU;/SS&9nCUR                  S5      nDUS:X  a  U R>                  R                  OU R>                  R                  nE[B        R                  " 5          U R>                  R                  U R                  -  nFU R>                  R                  UF-  S:w  d"  U R:                  (       ar  U R                  (       aa  U R                  U R                  U1WCWDWEUR                  S45      UR                  S55      UR                  S65      UR                  S75      S89	u  nGn$OS nGU R:                  (       aL  U R                  (       a;  [B        GR                   " WGW3-
  5      nH[B        GR                  " UHU GR                  S99nHU GR                  S:w  Ga  U GR                  bb  U R                  U GR                  U1WCWDWEUR                  S45      UR                  S55      UR                  S65      UR                  S75      S:9	u  nIn$OU R                   GR                  U R                  5      GR                  5          U R                  U R                  U1WCWDWEUR                  S45      UR                  S55      UR                  S65      UR                  S75      S:9	u  nIn$S S S 5        OS nIS S S 5        U R                  R'                  US
S;9nJG[        US   5      (       aS  / nK[        UWJ5       H@  u  nnLUS   S<   S=:X  a  UGR                  5       S>   OSnMWKR                  S=UMWL-   S?./5        MB     OWJnKU GR                  XWKU>5      nNUNU GR                  R                  U5      R                  S5      -  GR                  SS&9nOUOGR                  SU RX                  5      GR                  SS&9nPUPGR                  U RX                  SS&9nPUOUP-
  nQU GR                  S@;   aG  WOGR                  SU RX                  5      GR!                  SS&9nRURGR                  U RX                  SS&9nRONU GR                  SA:X  a"  WOGR!                  5       GR#                  UO5      nROG[%        SBU GR                   SC35      e[B        GR&                  " WR[B        GR(                  " UR5      5      nSU GR                  SD:w  a  WQWRSE-   -  nQ[s        U R                   Rt                  [w        U5      -  U R                   Rt                  S-   [w        U5      -  5      nWQGR+                  5       nTUQU   nQUS:X  ad  U RL                  =GR,                  U R                   R                  WCR                  5       5      R                  5       GR/                  5       -  sl        U RL                  GR,                  /U GR0                  U   SF'   U GR0                  U   SG   R                  W@GR3                  5       GR                  5       GR/                  5       5        U GR0                  U   SH   R                  U@GR3                  5       GR5                  5       GR/                  5       5        U GR0                  U   SI   R                  U@GR3                  5       GR7                  5       GR/                  5       5        U R                   R                  U8R                  SS&95      nUU@UU   nVS[w        UV5      [w        U@5      -  -
  nWU GR0                  U   SJ   R                  UW5        [w        UV5      S:X  a  [B        GR8                  " SUS$9nVU GR0                  U   SK   R                  WVGR3                  5       GR                  5       GR/                  5       5        U GR0                  U   SL   R                  UVGR3                  5       GR5                  5       GR/                  5       5        U GR0                  U   SM   R                  UVGR3                  5       GR7                  5       GR/                  5       5        G[;        U GR<                  5       H  u  nXnY[B        GR>                  " WNS S 2UX4   5      GR/                  5       nZU GR0                  U   SNUY SO3   R                  UZ5        G[A        UNS S 2UX4   5      GR/                  5       n[U GR0                  U   SNUY SP3   R                  U[5        M     U GR0                  U   SQ   R                  WPGR                  5       GR/                  5       5        U GR0                  U   SR   R                  WRGR                  5       GR/                  5       5        U GR0                  U   SS   R                  WSGR3                  5       GR                  5       GR/                  5       5        U GRB                  S   GRE                  [U        U5      5        U GRB                  ST   GRE                  [U        WJ5      5        G[;        U GR<                  5       H>  u  nXn\U GRB                  SU   U\   GRE                  WNS S 2UX4   R                  5       5        M@     U GRB                  SV   GRE                  WTR                  5       5        U(       a)  U GRB                  S   GRE                  [U        W
5      5        U R:                  (       GaJ  U R                  (       Ga8  [B        GRF                  " WGW3-
  5      n]U]U;R                  5          n]U]GRI                  5       S:  a  [B        GR                  " W]5      O[B        R                  " SUS$9n^W]GRI                  5       S:  a  [B        GR6                  " W]5      O[B        R                  " SUS$9n_U GR0                  U   SW   R                  U R                   R                  W^5      GR                  5       GR/                  5       5        U GR0                  U   SX   R                  U R                   R                  U_5      GR7                  5       GR/                  5       5        WHU;R                  5          n`U`GRI                  5       S:  a  [B        GR4                  " W`5      O[B        R                  " SUS$9naW`GRI                  5       S:  a  [B        GR                  " W`5      O[B        R                  " SUS$9nbW`GRI                  5       S:  a  [B        GR6                  " W`5      O[B        R                  " SUS$9ncU GR0                  U   SY   R                  G[K        U R                   R                  Wa5      5      GR/                  5       5        U GR0                  U   SZ   R                  U R                   R                  Wb5      GR?                  5       GR/                  5       5        U GR0                  U   S[   R                  G[M        U R                   R                  Uc5      5      GR/                  5       5        UUUU;WQWAS\.nWGb  WGUS]'   U R:                  (       a  U R                  (       a  WHUS^'   WIb  WIUS_'   S4U;   a  US4   US4'   S5U;   a  US5   US5'   S6U;   a  US6   US6'   S7U;   a  US7   US7'   U$ s  snf s  sn	f s  snf s  sn	f s  snf s  snf s  snf s  snf s  snf ! , (       d  f       GN= fs  sn$f s  sn'n&f s  sn$f s  snn&f ! , (       d  f       GN;= fs  snn,f s  sn-f s  sn-nn,f s  sn0f s  sn2f ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN = fs  snf s  sn0f s  sn0f ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= fs  sn=n<f ! , (       d  f       GN%= f! , (       d  f       GN/= f)aNr   r   r   r   r   imagesr7   )
num_imagesr  TleftFr  rh  ri  )skip_special_tokensclean_up_tokenization_spacesz^(z)+r  (r   r   zvLLM.generaterb   r{   )r  r  nr   r   r   r   r   
max_tokensr"  r%  r  logprobs)from_process)regex)	r  r   r   r   r   r   r  guided_decodingr  )group)r   multi_modal_data)sampling_paramsuse_tqdmr   r   )padding_valuer  )r   rc   r  paged_attention
sdpa_pagedztransformers.generate_batch)gather_deepspeed3_params)r  )r'  progress_barr  )r%  r  ztransformers.generate)r'  disable_compiler  rk  rj  rl  rm  )rk  rj  rl  rm  r~  )rA  rk  rj  rl  rm  )r  role	assistantcontent)r,  r.  )r!  noner  z!Invalid value for scale_rewards: z-. Must be one of 'batch', 'group', or 'none'.r/  g-C6?
num_tokenszcompletions/mean_lengthzcompletions/min_lengthzcompletions/max_lengthzcompletions/clipped_ratioz"completions/mean_terminated_lengthz!completions/min_terminated_lengthz!completions/max_terminated_lengthzrewards/z/meanz/stdr  
reward_stdfrac_reward_zero_stdr   r   r   z&sampling/sampling_logp_difference/meanz%sampling/sampling_logp_difference/maxz&sampling/importance_sampling_ratio/minz'sampling/importance_sampling_ratio/meanz&sampling/importance_sampling_ratio/max)
prompt_idsprompt_maskr  completion_maskr   num_items_in_batchold_per_token_logpsimportance_sampling_ratioref_per_token_logpsr   )r   r   rV   r?  copydeepcopyr   rr   r   r,   r+   r[   r   r  r   rh   ri   rj   rI   batch_decoderesubescaper   rg   searchchat_templater   decoder   r   rX   r  r   r  empty_cacher  wake_upr  global_stepr#  r  r   r  r   r-   r  generater   r   r   r   r   r   r"  r%  r   slicer  r   rP   r&  rO   r   r  r  all_gather_objectr  r   r   outputs	token_idsr  nextiterr   logprobget_rankr   r   rC   r   r  r   r   model_wrappedr   _attn_implementationr%   r5   ds3_gather_for_generationr  r,  r  r  r   bf16r  bfloat16fp16float16r  generate_batchrh  r'  generated_tokensr  r   fulllongintargmaxr   arangeexpand	unsqueezerz  tolistr  r   r   r  per_device_eval_batch_sizer  r   gradient_accumulation_stepsr   r  expclampr   r|   r   unwrap_modeldisable_adapterr*   popr  r   nansumviewmeanrepeat_interleaver   std	expand_asr   iscloser}  clonenum_input_tokens_seenr  r  r{  minr~  r  r   r   nanmeanrB   r  extendabsr|  rA   r@   )er0  r  r   r  r
  r  original_promptskwargs
has_imagesr  r  imgr   prompts_textprompt_inputsr3  r4  	protectedro   r  escaped_img_tokenescaped_eoi_tokenall_prompts_text
all_imagesordered_set_of_promptsordered_set_of_imagesoutputpayloadobj_listr  all_logprobsprocess_slicer   r%  r#  	orig_sizer7  gathered_promptssublistr  gathered_imagesvllm_inputsr   all_outputsrI  lplocal_rank_in_grouptp_sliceidsprompt_completion_idsr  sampling_per_token_logpspaged_prompt_inputsprevious_attnrr  prompt_lengthis_eoseos_idxsequence_indicesr5  rowmask_rowr  completion_lengthsagg_completion_lengthsr6  truncated_completionsri  rn  rA  r  r7  r8  r9  completions_textr  r   	bootstrapr  r   mean_grouped_rewardsr   std_rewardsis_std_zeroall_process_advantagesagg_terminated_with_eosterm_completion_lengthsclipped_completions_ratior4  r  mean_rewardsstd_func_rewardsr  delta
mean_delta	max_deltaflat_is_ratiomin_importance_sampling_ratiomean_importance_sampling_ratiomax_importance_sampling_ratior8  se                                                                                                       ru   r  +GRPOTrainer._generate_and_score_completions&  s     !!((**--w6(./1X;/
  ==1
 q	)
:@A&wkk'*&FA& 9&3& 9:F!fd++/1E " lrrkq`g1';P;PQRZ[kqr-- 
$
 
 />"/"<mL\>]K
!!- ,,d.H.H$JbJbcI,5KI5IIK&DK)?)?'#J  00==TY > L _kk^jVZBFFb4>>)B(C2#FDQ^jLk +$&IId.>.>$?!99.0E0E0S0STT_k$_kW[!$5#6b94;K;KTR_k ! $L
 //;,.II 11;;BBDD\D\C]^-) iu(ht`dBFFa(9':"=N<O#PRTVZ[ht % (
 bn'namY]!4E3Fb/I2t(Tam'n ===~~+		0P0P

&&(  " zz%%)?)??((*)-)?)?& ~~)#0#> !.v!6J##33 .>>UAUAU>U-V*!0:;Rd>R>R;R0S-04-*4A!%!1!1!:!:$:#8"22/3/F/F(,(8(8"&**(,

(:"

)-);#'+'A'A262L2L.2ii.I.I "; " $**:#;VJ=O"P BA  #G $9%hQ?/7{, %$$22S\A%%33a73w<G! "0!>+M: :---&:A[A[&\O&*O *.*A*A#'#3#3!ZZ#'::#5R4::$(JJ$6SDJJ"&"<"<'6 !
%! 99..:%,,TYY-H-HI"0"E3D"E11A5 !$L 1I6;D<Z<Z6['\6[6[$'\%%778H,^k^k7l9I'[9IgSZaSZ9I$'[!9>t?]?]9^*_9^A49^*_));;OV[_[h[h;i9H%\gT[ScT[c
%\
%)
'3$+54J*"$K),-=z)J ,'..&W^`eVf/gh'..v6	 *K #3K&t_="&(("3"3KQ`kp"3"qK > CN!l+w\c\k\kRX&"2"2\k"2+!l $/ #.")// @FOT$ryy{+,44O"1 P#.    11A5 +0*;*;*D*D4==*D*Y'$%89%DGZ]^G^bkFklH%3H%=N#/#9L9933HHNNN+ KYY.3ell3v>.NY t?P?PQN$)IIz>.JPQ$R![g([gxXfEMMJ[g % ( (++CSV'W$((( #'"7"7"T\"TV"T ..55JJM(**AR""))>AM""))>!$(EF+&&(8(8SWS\S\SvSv$NRNbNb''(:(:EJhshuu 99>>#&&u~~6YY^^#&&u}}5))+"1"@"@+55I_I_ns #A #K , v   G  EPDVDVDXYDX&f55DXNYJXY.3ell3v>.NY t?P?P_fgNFYFcFcdFcs%,,s6:FcJdZ7H7HW]^J$)IIz>.JPQ$R!=JD%%: "$(?@+&&(8(8SWS\S\SvSv$NRNbNb''(:(:EJhshuuNXZeKk*M:J,K(7(@(@ )#)7;7M7M_c)% v   A 'OOA.M.q.=./@AJ21mn3DEN  4#4#44**fkk!n.AejjY_`%+ZZ\%8%8Q%8%?

q
@Q%R

q
!" <<AvFMMfkkZ[n^`a+w/@/@/CCHHJ LO~_n_s_s_uKvwKv-#xs8}335Kvw -003!%!1!1!8!89K!L3779 **%+ZZAZ%6$6!-2G1G0R0RST0U0Y0Y0[[O K#AqI',,Q/>BgoTYY::SWS\S\SwSw
]]_ "YY;;d>Q>QQNyy44~EJ$"J"J)-)P)PJJ)""!.!2!2>!B#0#4#45E#F)6):):;Q)R - 1 1- @ *Q 
*&#Q '+# }}!I!I,1II6ILd6d,e),1KK-43T3T-)
 yyC>>--1-T-T-&&#-%2%6%6~%F'4'8'89I'J-:->->?U-V$1$5$5m$D .U 
.*' ))66tzzBRRT151X1X JJ1**'1)6):):>)J+8+<+<=M+N1>1B1BCY1Z(5(9(9-(H 2Y 
2.+Q UT '+#{ @  00==nbf=gVAY''K&)'3C&D"
7=bz&7I[7XFJJL3^`	""[YQ[E[$\#]^ 'E +K
  226[Zmn $d&9&9&<&<V&D&N&Nq&QQYY^_Y`  '||B0D0DEJJqJQ  4EEdFZFZ`aEb33
!22!,,r4+?+?@DDDKK%778L8LRS7TK7*!++-11':K3D4F4F3GGtu  mmK1A1A+1NO'#{T'9:J **S\9++a/3w<?
 ",!1!1!3.
 7?JJ,,0@0@0G0GHZHZH\0]0a0a0c0h0h0jj,-1ZZ-M-M,NdL) 	d56==>T>Z>Z>\>a>a>c>h>h>jkd45<<=S=Y=Y=[=_=_=a=f=f=hid45<<=S=Y=Y=[=_=_=a=f=f=hi #'"2"2"9"9&***:K"L"89P"Q$%,C(DsKaGb(b$b!d78??@YZ&'1,&+kk!F&C#d@AHHI`IfIfIhImImIoItItIvwd?@GGH_HeHeHgHkHkHmHrHrHtud?@GGH_HeHeHgHkHkHmHrHrHtu $-T-C-C#DA ==)9!Q$)?@EEGLMM$(+;*<E BCJJ<X%&6q!t&<=BBDMM$(+;*<D ABIIJZ[	 $E
 	dH%,,-A-F-F-H-M-M-OPdL)001A1A1C1H1H1JKd23::;;L;L;N;S;S;U;Z;Z;\] 	

8##M,$?@

< ''6F(GH !7!78GAtJJy!$'../?1/E/L/L/NO 9

< ''(>(E(E(GHJJw&&}V'<====TEEEII14LLME/..01E.3kkma.?E*U\\RU^dEeJ,1KKMA,=		%(5<<PS\bCcIMM$ HIPP  ''
388:??A MM$ GHOO  ''	2668==? 6o6J6J6LMM,9,?,?,AA,E		-(5<<X[djKk * .;-@-@-BQ-F

=)ELLY\ekLl + -:,?,?,AA,E		-(5<<X[djKk * MM$ HIPPt''../LMNSSU MM$ IJQQ  ''(FGOOQVVX MM$ HIPPt''../LMNSSU
 %&,.$"4
 *,?F()==TEE2KF./*,?F()]*%2>%BF>"},'45E'FF#$!]2-:;Q-RF)*M)$1-$@F=!k 0 B 9
 s$ L l$(
 (o> BAp (]'[ +`%\" >= "mO $ Z(8 ,+ vu    GF  ZYd vu    A@2 xD UTa _s  Aw/6Aw4Aw9"Aw>Ax*Ax,AAx0Ax(Ax&Ax:B0Ax<Ax.7Ax3.Ax9*Ax>4Ay*Ay$Ay!46Ay*Ay!Ay(.Ay-+4Az:Az(5;Az0BAz	;&Ay2!Az	)Az1Az(9Az:A{7A{;A{04A|$A{?:;A{-5-A{	"A{-*A{?2A|; A|#GA|;F'A!A|)HA|;x
Ax+y
AyyAy!y2
Azy<Az	z
AzzAzz
Az%z Az(z(
Az7	z2Az:z:
A{	{
A{*{%A{-{-
A{<{7A{?{?
A|	|	A||
A| |)
A|8	|3A|;|;
A}
c                    US   US   pCUS   US   pe[         R                  " X5/SS9n[         R                  " XF/SS9nUR                  S5      n	U R                  UUUU	UR	                  S5      UR	                  S5      UR	                  S	5      UR	                  S
5      5      n
U R                  U
UR                  R                  UUUS   UR                  R                  UR	                  S5      UR	                  S5      S9u  pU R                  S:w  a  US   OS nUS   nU R                  R                  (       a  SOSnU R                  S:w  aV  U R                  U   S   R                  U R                  R                  U5      R!                  5       R#                  5       5        U R                  U   S   R                  U R                  R                  U5      R!                  5       R#                  5       5        XR$                  -  $ )Nr3  r4  r  r5  r7   r  rk  rj  rl  rm  r   r7  r9  )_input
lin_weightselected_token_idsri  r   biasr7  r9  r{   r   rb   r   r   kl
clip_ratio)r   r  r  rt  r   r  lm_headweightr  r|   rV   r?  r  r   r   r   ri  r  #current_gradient_accumulation_steps)r0  rr  r  r3  r4  r  r5  rh  ri  rn  rq  lossmetricsmean_klr  r  s                   ru   compute_liger_lossGRPOTrainer.compute_liger_loss#  s   "("6}8MK*01A*BFK\D]IIz:B	K#AqI',,Q/ !77JJ~&JJ'(JJ-.JJ}%	
 ,,$&..55-*l+ ((-- &

+@ A &

+@ A - 	
 !%		S 0'!*dR[
**--w699MM$%,,T-=-=-D-DW-M-R-R-T-Y-Y-[\dL)001A1A1H1H1T1Y1Y1[1`1`1bc>>>>r   c                     U(       a  [        S5      eU R                  (       a8  U R                  R                  U5      nU R	                  XU R
                  XR5      $ U R                  X5      $ )Nz2The GRPOTrainer does not support returning outputs)r   r   r   rd  r   r  _compute_loss)r0  rV   r  return_outputsr6  rr  s         ru   compute_lossGRPOTrainer.compute_lossM  s]    QRR"..;;EBO,,UTE\E\^mvv%%e44r   c                   ^&^' US   US   pCUS   US   snm&[         R                  " X5/SS9n[         R                  " UT&/SS9nUR                  S5      nU R                  UUUUSUR	                  S5      UR	                  S	5      UR	                  S
5      UR	                  S5      S9	u  pU R
                  S:  a!  U R                  U
T&SU R
                  -
  5      nOS nU R                  S:w  a%  US   n[         R                  " X-
  5      X-
  -
  S-
  nUS   nUR	                  S5      nUc  U	R                  5       OUnX-
  nU R                  S:X  a  UnOnU R                  S:X  aE  UT&-  R                  S5      T&R                  S5      R                  SS9-  nUR                  S5      nO[        SU R                   S35      e[         R                  " U5      n[         R                  " USU R                  -
  SU R                   -   5      nU R"                  R$                  b)  [         R                  " UU R"                  R$                  S9nUUR                  S5      -  nUUR                  S5      -  n[         R&                  " UU5      * nUb  UU-  nU R(                  (       a  U R*                  (       a  UUS   -  nU R                  S:w  a  UU R                  W-  -   nU R,                  S:X  aQ  UT&-  R                  S5      T&R                  S5      R                  SS9-  R/                  5       nUU R0                  -  nOU R,                  S:X  aA  UT&-  R                  5       T&R                  5       R                  SS9-  nUU R0                  -  nOU R,                  S:X  aB  UT&-  R                  5       UR                  S5      U R2                  -  -  nUU R0                  -  nO[U R,                  S:X  a3  US   U R4                  R6                  -  nUT&-  R                  5       U-  nO[        S U R,                   35      eU R8                  R:                  (       a  S!OS"nT&R                  5       R                  SS9m'U&U'4S# jnU R                  S:w  a^  U" W5      nU R<                  U   S$   R?                  U R4                  RA                  U5      RC                  5       RE                  5       5        U" U
5      nU R<                  U   S%   R?                  U R4                  RA                  U5      RC                  5       RE                  5       5        USU R                  -
  :  UR                  S5      S:  -  nUSU R                   -   :  UR                  S5      S:  -  nUU-  nU" URG                  5       5      n U" URG                  5       5      n!U" URG                  5       5      n"U R4                  RA                  U 5      n#U R<                  U   S&   R?                  U#RC                  5       RE                  5       5        U R<                  U   S'   R?                  [I        U#5      RE                  5       5        U R4                  RA                  U!5      n$U R<                  U   S(   R?                  U$RC                  5       RE                  5       5        U R<                  U   S)   R?                  [K        U$5      RE                  5       5        U R4                  RA                  U"5      n%U R<                  U   S*   R?                  U%RC                  5       RE                  5       5        U$ )+Nr3  r4  r  r5  r7   r  Trk  rj  rl  rm  )r  rk  rj  rl  rm  rn   r{   r9  r   r7  ro   sequencerb   )rp  z#Unknown importance sampling level: z-. Possible values are 'token' and 'sequence'.r+  r8  rU   bnpodr_grpor   dapor6  zUnknown loss type: r   r   c                 v   > U R                   S   S:X  a  U R                  5       $ U T-  R                  5       T-  $ )Nr7   )shaperi  r  )r
  r5  completion_token_counts    ru   masked_batch_mean4GRPOTrainer._compute_loss.<locals>.masked_batch_mean  s7    wwqzQvvxO+0025KKKr   r  entropyzclip_ratio/low_meanzclip_ratio/low_minzclip_ratio/high_meanzclip_ratio/high_maxzclip_ratio/region_mean)&r   r  r  r  r   r   r  r|   rb  detachr   r  rc  r^  r   r}   r~   rX   r  rp  r   r   r   ri  r  r   r   r  rV   r?  r  r   r   rq  r  r{  rA   r@   )(r0  rV   r  r3  r4  r  rh  ri  rn  per_token_logpsrv  r  r9  per_token_klr   r7  	log_ratiolog_importance_weightscoef_1coef_2per_token_loss1per_token_loss2per_token_lossr  
normalizerr  r  r  mean_entropyis_low_clippedis_high_clippedis_region_clippedlow_clip	high_clipr  gathered_low_clipgathered_high_clipgathered_clip_ratior5  r  s(                                         @@ru   r  GRPOTrainer._compute_lossX  s   "("6}8MK*01A*BFK\D]'IIz:B	K#AqI',,Q/ &*%L%L N3!::&67!',B!C

=1 &M 
&
" $$s*55iRSVZVoVoRopLL 99"()>"?		-?@DWDijmnn 
 L)
 %jj)>?:M:Uo446[n#9	))W4%."++z9&//&A%F%Fr%J_M`M`acMdMjMjorMjMs%s"%;%E%Eb%I"5d6T6T5U V" "  12VQ)9)9%91t?P?P;PQ 99??&[[TYY__=F :#7#7#:: :#7#7#::))O_EE#+l:N==TEE+f5P.QQN99+dii,.FFN>>V##o5::2>ATATUWAXA^A^cfA^AggmmoD$BBBD^^v%"_499;o>Q>Q>S>Y>Y^a>Y>bbD$BBBD^^y("_499;~?R?RST?UX\XrXr?rsD$BBBD^^v% 458H8H8V8VVJ"_499;jHD24>>2BCDD **--w6!0!4!4!6!<!<!<!E	L 99'5GMM$%,,T-=-=-D-DW-M-U-U-W-\-\-^_(3dI&--d.>.>.E.El.S.[.[.].b.b.de !1t'7'7#77J<P<PQR<SVW<WX!A(9(9$99j>R>RST>UXY>YZ*_<$^%9%9%;<%o&;&;&=>	&'8'>'>'@A
 ,,33H=d1299:K:S:S:U:Z:Z:\]d0188@Q9R9W9W9YZ!--44Y?d23::;M;U;U;W;\;\;^_d1299&AS:T:Y:Y:[\"..55jAd45<<=P=X=X=Z=_=_=abr   ignore_keysc                 >   U R                  U5      n[        R                  " 5          U R                  5          U R	                  X5      nS S S 5        WR                  5       R                  5       nS S S 5        WS S 4$ ! , (       d  f       N9= f! , (       d  f       N$= frq   )r  r   r  compute_loss_context_managerr  ri  r  )r0  rV   r  prediction_loss_onlyr  r  s         ru   prediction_stepGRPOTrainer.prediction_step  su    %%f-]]_224((7 599;%%'D  T4 54 _s"   BA=
&B=
B	B
Blogs
start_timec           	        > U R                   R                  (       a  SOSnU R                  U   R                  5        VVs0 s H  u  pEU[	        U5      [        U5      -  _M     nnnUS:X  a(  UR                  5        VVs0 s H  u  pESU 3U_M     nnn0 UEUEn[        TU ]  X5        U R                  U   R                  5         U R                  R                  (       Ga  U R                  (       Ga  [        5       (       ab  [        U R                  S   U R                  S   U R                  S   U R                  S   U R                  R                   U R"                  5        U R$                  R&                  (       Gaw  SU R$                  R&                  ;   Ga[  [(        R*                  GbH  S	S Kn[/        U R                  R                   5      /[        U R                  S   5      -  U R                  S   U R                  S   S
.U R                  S   ESU R                  S   0EnU R                  S   (       a\  / US'   U R                  S    HD  n	U	b*  US   R1                  [(        R2                  " U	5      5        M0  US   R1                  S 5        MF     UR5                  U5      n
U R6                  (       a  U
R9                  S/S9n
[(        R                  " S[(        R:                  " U
S905        g g g g g g s  snnf s  snnf )Nr   r   eval_r   r   r   r   wandbr   )stepr   r   	advantager   )subsetr  )	dataframe)rV   r?  r  r  r  r   r   logclearr   r  r  r'   rD   r  r  rE  r  rX   	report_tor  runpandasr   r   Image	DataFramer  drop_duplicatesTable)r0  r  r  r  r  valr  pdtablerw  dfr8  s              ru   r  GRPOTrainer.log  s   **--w6<@MM$<O<U<U<WX<W3C3s8++<WX 6>:A--/J/hcse}c)/GJ"$"'"D%d!!#+++0D0D0D ""/JJx(JJ|,JJy)JJ|,JJ**11 yy"""w$))2E2E'E%))J_# !!7!789C

8@T<UU"jj2"&**\": jj+	
  L!9 ::g&%'E'N#zz'2?!'N11%++c2BC!'N11$7  3 \\%(00++H:+>B		=%++*CDE/ K`'E" 1E+ Y
 Ks   $K=Lc                   > U R                   R                  c*  [        U R                   R                  5      R                  nO(U R                   R                  R                  S5      S   nU R                  US9  [        TU ]!  X5        g )Nra   rb   )r1  )	rX   hub_model_idr   
output_dirr  r   create_model_cardr   _save_checkpoint)r0  rV   trialr1  r8  s       ru   r  GRPOTrainer._save_checkpoint  sj    99!!)dii22388J//55c:2>J*5 .r   r1  dataset_nametagsc                    U R                  5       (       d  g[        U R                  R                  S5      (       ac  [        R
                  R                  U R                  R                  R                  5      (       d!  U R                  R                  R                  nOSnUc  [        5       nO$[        U[        5      (       a  U1nO[        U5      n[        U R                  R                  S5      (       a  UR                  S5        S[        R                  ;   a  UR                  S5        UR                  U R                  5        [        R                   " S5      n[#        UUU R$                  UU['        5       (       a+  [(        R*                  b  [(        R*                  R,                  OS[/        5       SUS	S
S9nUR1                  [        R
                  R3                  U R4                  R6                  S5      5        g)a  
Creates a draft of a model card using the information available to the `Trainer`.

Args:
    model_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the model.
    dataset_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the dataset used for training.
    tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
        Tags to be associated with the model card.
Nr   unsloth_versionunslothJOB_IDhf_jobsa              @article{shao2024deepseekmath,
                title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
                author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
                year         = 2024,
                eprint       = {arXiv:2402.03300},
            }
            GRPOzRDeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Modelsz
2402.03300)rp  r1  r  r  r	  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zeror   rV   r   r  pathisdirr   r  rr   r   r  r  r&  r*  textwrapdedentr=   r  r"   r  r  urlr>   savejoinrX   r  )r0  r1  r  r	  rp  citation
model_cards          ru   r  GRPOTrainer.create_model_card  sn   " ))++4::$$o66rww}}TZZM^M^MlMl?m?m**88JJ <5Dc""6Dt9D4::$$&788HHYrzz!HHYDOO$ ??	
 )!!**%'9';';		@Ueiimm[_.0%l!

 	TYY%9%9;GHr   )9r   r   r#  r  r  r;  r   r  r|   r   r~   r}   r'  r"  rg   rh   r   r  r  r  r   r   r   r   r   r(  r   r  r   r   r   r   r   r   r   rW   r\   r   r   r   r   r   r   r   r  r   r   r   rj   ri   r  r   r   r   r   r   r  )NNNNNN)NNNrq   )NNNN)NFNNNN)r  N)FN)NNN)8r   
__module____qualname____firstlineno____doc__r*  r   r   r   
RewardFuncr   r
   r9   r   r   r   r   r   r!   tupler   optim	Optimizerlr_schedulerLambdaLRr   r<  rY  r   rT  rd  r.   rt  Tensorr{  r  r  r  r   r   r  r  r  r   r  r  r  r  r  r  r  r  r  r  __static_attributes____classcell__)r8  s   @ru   rR   rR   f   s   cJ J &*CGnrUYmq59jv.2NS/)*N JZ(889N z"	N
  g&> ?@N uWotCwXgOgIhDh?i%ijkN #5)@.)P#QRN $,E2I4PgKh2h,i#jN D12N (5;;#8#898EKKD\D\DeDe;ffgN l+N N`:$X<"
(7*; "
w "
H
 
  !*! *!X&*u|| &*5<< &*\a &*fkfrfr &*P  !?  
c8ELL))	*?  ? BHT#Y<O J JC J68 8 I* I*V   $S%c0A*B%B C 	c5s*++	,   D 4  4 l{4U5<<+<%= =>?{	c5s*++	,{z(?T 5 5~@ PXY]^aYbPc  /FS%Z( /Fhuo /FQU /F /Fd/ %)&*,0	CISMCI smCI CcD()	CI CIr   rR   )vr:  r   r  r=  r  collectionsr   r   
contextlibr   	functoolsr   pathlibr   typingr   r	   r
   r   rK  r   torch.utils.datar   
accelerater   accelerate.utilsr   r   r   r   r   r   r   r   torch.distributed.fsdpr   r  r   r   r   r   r   r   r   r   r   r   r    r!   r"   transformers.trainer_utilsr#   transformers.utilsr$   r%   r&   r'   
data_utilsr)   r*   r+   r,   extras.profilingr-   r.   extras.vllm_clientr/   import_utilsr0   r1   modelsr2   r3   r4   r5   models.utilsr6   r]   r8   grpo_configr9   rR  r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   peftrK   rL   liger_kernel.chunked_lossrM   vllmrN   rO   vllm.sampling_paramsrP   r  
get_loggerr   r   r   r   r{  r%  rR   r   r   ru   <module>rE     s     	 	  * "   1 1      b b -  C 0    3 u u w w E + G e e . + #    * *B(9 
		H	% 34,U2K)LLM
wI' wIr   