
    hA                       S SK r S SKrS SKrS SKrS SKrS SKrS SKJrJr  S SK	J
r
  S SKJr  S SKJr  S SKJrJrJrJr  S SKrS SKrS SKrS SKrS SKJr  S SKJrJrJrJrJr  S S	KJ r J!r!  S S
KJ"r"  S SK#J$r%  S SKJ&r&J'r'  S SKJ(r(J)r)J*r*J+r+J,r,J-r-J.r.J/r/J0r0J1r1J2r2  S SK3J4r4  S SK5J6r6J7r7J8r8J9r9  SSK:J;r;J<r<J=r=  SSK>J?r?J@r@  SSKAJBrB  SSKCJDrD  SSKEJFrFJGrGJHrHJIrI  SSKJJKrK  SSKLJMrM  SSKNJOrOJPrPJQrQJRrRJSrSJTrTJUrUJVrVJWrWJXrXJYrYJZrZJ[r[J\r\J]r]J^r^J_r_  \8" 5       (       a  S SK`JaraJbrb  \D" 5       (       a  S SKcJdrdJere  S SKfJgrg  \2" 5       (       a  S SKhrh\R                  " \j5      rk\\l\-\\m\m/\m\n   4   4   ro " S S\05      rpg)    N)defaultdictdeque)nullcontext)partial)Path)AnyCallableOptionalUnion)logging)broadcast_object_listgathergather_objectis_peft_modelset_seed)DatasetIterableDataset)nn)FullyShardedDataParallel)
DataLoaderSampler)
AutoConfig"AutoModelForSequenceClassificationAutoProcessorAutoTokenizerGenerationConfigPreTrainedModelPreTrainedTokenizerBaseProcessorMixinTrainerTrainerCallbackis_wandb_available)seed_worker)is_datasets_availableis_flash_attn_2_availableis_peft_availableis_rich_available   )apply_chat_templateis_conversationalmaybe_apply_chat_template)profiling_contextprofiling_decorator)
VLLMClient)is_vllm_available)prepare_deepspeedprepare_fsdpprepare_peft_modelunwrap_model_for_generation   )SyncRefModelCallback)
RLOOConfig)RepeatSamplerdisable_dropout_in_modelentropy_from_logitsgenerate_model_cardget_comet_experiment_urlidentitynanmaxnanminnanstdpadprint_prompt_completions_sampleselective_log_softmaxshuffle_sequence_dictsplit_pixel_values_by_gridsplit_tensor_dicttruncate_with_protected_tokensunsplit_pixel_values_by_grid)
PeftConfig	PeftModel)LLMSamplingParams)GuidedDecodingParamsc                      ^  \ rS rSrSrSS/r               S2S\\\4   S\\	\
\	   4   S\\   S	\\\\4      S
\\\\\\\\\4   4   4      S\\\\4      S\\\\
\   4      S\\
\      S\\\R*                  R,                     \\R*                  R.                  R0                     4   S\S   4U 4S jjjrS rS rS3S\\   S\4S jjrS\4S jr\  S4S\\\\R@                     4   4S jj5       r!S3S\\
\      4S jjr"S5S\#RH                  S\4S jjr%S\#RH                  4S jr&\S 5       r'\S \\\\R@                  \(4   4   S\\\\R@                  \(4   4   4S! j5       r)\U 4S" j5       r*S#\
\\\\R@                  \(4   4      S\\\\R@                  \(4   4   4U 4S$ jjr+\S6S% j5       r,S& r-S3S'\\
\      4S( jjr.S3S)\\\/4   S*\\/   SS4U 4S+ jjjr0U 4S, jr1   S7S-\\   S.\\   S/\\\
\   S4   4S0 jjr2S1r3U =r4$ )8RLOOTrainerd   a  
Trainer for the Reinforce Leave One Out (RLOO) method. This algorithm was initially proposed in the paper [Back to
Basics: Revisiting REINFORCE Style Optimization for Learning from Human Feedback in LLMs]
(https://huggingface.co/papers/2402.14740).

Example:

```python
from datasets import load_dataset
from trl import RLOOTrainer

dataset = load_dataset("trl-lib/tldr", split="train")


def reward_func(completions, **kwargs):
    # Dummy reward function that rewards completions with more unique letters.
    return [float(len(set(completion))) for completion in completions]


trainer = RLOOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",
    reward_funcs=reward_func,
    train_dataset=dataset,
)

trainer.train()
```

Args:
    model (`Union[str, PreTrainedModel]`):
        Model to be trained. Can be either:

        - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
          path to a *directory* containing model weights saved using
          [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
          using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
          `args.model_init_kwargs`.
        - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
    reward_funcs (`Union[RewardFunc, list[RewardFunc]]`):
        Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
        functions with the prompts and completions and sum the rewards. Can be either:

        - A single reward function, such as:
            - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
            path to a *directory* containing model weights saved using
            [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
            using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the
            keyword arguments in `args.model_init_kwargs`.
            - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
            - A custom reward function: The function is provided with the prompts and the generated completions,
              plus any additional columns in the dataset. It should return a list of rewards. Custom reward
              functions can also return `None` when the reward is not applicable to those samples. This is useful
              for multi-task training where different reward functions apply to different types of samples. When a
              reward function returns `None` for a sample, that reward function is excluded from the reward
              calculation for that sample. For more details, see [Using a custom reward
              function](#using-a-custom-reward-function).

              The trainer's state is also passed to the reward function. The trainer's state is an instance of
              [`~transformers.TrainerState`] and can be accessed by accessing the `trainer_state` argument to the
              reward function's signature.
        - A list of reward functions, where each item can independently be any of the above types. Mixing different
        types within the list (e.g., a string model ID and a custom reward function) is allowed.
    args ([`RLOOConfig`], *optional*, defaults to `None`):
        Configuration for this trainer. If `None`, a default configuration is used.
    train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
        Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset is
        ignored. The format of the samples can be either:

        - [Standard](dataset_formats#standard): Each sample contains plain text.
        - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
          and content).
    eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
        Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`] or `None`, *optional*, defaults to `None`):
        Processing class used to process the data. The padding side must be set to "left". If `None`, the
        processing class is loaded from the model's name with [`~transformers.AutoProcessor.from_pretrained`]. A
        padding token, `tokenizer.pad_token`, must be set. If the processing class has not set a padding token,
        `tokenizer.eos_token` will be used as the default.
    reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
        Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:

        - A single processing class: Used when `reward_funcs` contains only one reward function.
        - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
        If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is
        `None`, the tokenizer for the model is automatically loaded using
        [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward
        functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes`
        are ignored.
    callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`):
        List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
        in [here](https://huggingface.co/docs/transformers/main_classes/callback).

        If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
        method.
    optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
        A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
        model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
    peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
        PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
trlrlooNmodelreward_funcsargstrain_dataseteval_datasetprocessing_classreward_processing_classes	callbacks
optimizerspeft_configrH   c                 D"  >^ Ub'  [         R                  " S5        Tc  UmO[        S5      eUb'  [         R                  " S5        Uc  UnO[        S5      eUb'  [         R                  " S5        Uc  UnO[        S5      eUb  [         R                  " S5        Ub  [         R                  " S5        S	UR                  ;   a+  [         R                  " S
5        S nUR	                  USU0S9nUb;  S	UR                  ;   a+  [         R                  " S5        S nUR	                  USU0S9nTcO  [        U[        5      (       a  UOUR                  R                  nUR                  S5      S   n[        U S35      mTR                  =(       d    0 n[        U[        5      (       a  UnUR                  S5      n[        U[        R                  5      (       d	  US:X  d  Uc  O:[        U[        5      (       a  [        [        U5      nUUS'   O[        SU S35      e[         R"                  " U5      n[        [$        UR&                  S   5      nUR"                  " U40 UD6nO8UR                  R                  nTR                  b  [(        R+                  S5        [-        US5      (       d8  [.        R0                  " UR2                  5      R4                  R7                  5       OE[.        R0                  " UR9                  5       R2                  5      R4                  R7                  5       U l        U
c$  [=        5       (       a!  [        U[>        5      (       a  [A        XT5      nUc*  [B        R"                  " UR                  R                  5      n[        U[D        5      (       a  URF                  nO#[        U[H        5      (       a  UnO[K        S5      eURL                  c  URN                  Ul&        URL                  U l&        URP                  U l(        URR                  U l)        [        U[T        5      (       d  U/n/ U l+        [Y        U5       H  u  nn[        U[        5      (       a  [Z        R"                  " U4SS0UD6UU'   [        UU   [\        R^                  5      (       aF  U RV                  Ra                  UU   R                  R                  R                  S5      S   5        M  U RV                  Ra                  UU   Rb                  5        M     X l2        TRf                  b  [i        TRf                  5      [i        U5      :w  a.  [        S[i        TRf                  5       S[i        U5       S35      e[        Rj                  " TRf                  [        Rl                  S 9U l3        O1[        Rn                  " [i        U5      [        Rl                  S 9U l3        Uc  S /[i        U5      -  nO[        U[T        5      (       d  U/n[i        U5      [i        U5      :w  a$  [        S![i        U5       S"[i        U5       S#35      e[Y        [q        Xr5      5       H  u  nu  nn[        U[r        5      (       d  M   Uc*  [t        R"                  " UR                  R                  5      nURP                  c  URN                  Ul&        URP                  UR                  l(        UUU'   M     Xpl;        TRx                  U l<        TRz                  U l=        TR|                  U l>        TR~                  U l?        TR                  U l@        TR                  U lA        TR                  U lB        TR                  U lC        TR                  U lD        TR                  U lE        TR                  U lF        TR                  U lG        TR                  U lH        TR                  U lI        TR                  U lJ        TR                  U lK        TR                  U lL        [        U[        5      (       dO  [        U[        5      (       d:  [        U[        5      (       a0  [        S$ UR                  5        5       5      (       a  [        S%5      eTR                  U lR        TR                  U lT        TR                  b  TR                  OTR                  U lU        SU lV        S U lW        S&UR                  S''   [        TU G]i  UT[        UUUUU	S(9  TR                  U l\        U R                  S):X  a  S U l]        Oc[        U5      (       a  S U l]        OK[         R"                  " U5      n[        [$        UR&                  S   5      nUR"                  " U40 UD6U l]        TR                  (       a-  [        U5        U R                  b  [        U R                  5        [        [T        5      [        [T        5      S*.U lb        SU lc        TR                  U ld        TR                  U le        TR                  U lf        [        TR                  S+9[        TR                  S+9[        U4S, j5      [        TR                  S+9S-.U li        [        TR                  S&S.9  U R                  (       Ga  [        5       (       d  [        S/5      eU R                  S0:X  a  U R                  R                  (       a  TR                  b  TR                  nOS1TR                   S2TR                   3n[        UTR                  S39U lu        U R                  R                  [        R                  R                  5       S49  GOU R                  S5:X  Ga  U R                  R                  U R                  -  S:X  d0  [        S6U R                   S7U R                  R                   S835      eU R                  S:  a  [        R                  R                  [        U R                  R                  U R                  -  5       Vs/ s H5  n[U        [        UU R                  -  US-   U R                  -  5      5      PM7     sn5      u  U l}        n[        U R                  R                  5      [        GR                   S9'   [        U R                  GR                  5      [        GR                   S:'   [        U R                  R                  5      [        GR                   S;'   [        GR                   R                  S<S=5      [        GR                   S<'   [        GR                   R                  S>S?5      [        GR                   S>'   U Rx                  b'  U Rz                  b  U Rx                  U Rz                  -   nOS nG[        UGR                  TR                  U R                  U GR                  GR
                  U R                  -  U GR                  GR                  -  US@U R                  R                  U R                  -  SAU GR                  GR                  SB9	U l        O[        SCU R                   SD35      eTGR                  U l        SU l        U R                  GR                  5         OU Rz                  S&URP                  UGR                  URR                  U R~                  U R                  U R                  U R                  U R                  TGR                  SE.nTR                  (       a  SFUSG'   SHUSI'   SJUSK'   TGR                  b  UGR!                  TGR                  5        G[#        SP0 UD6U l        SLU l        U GR(                  GR+                  U GR,                  5        U R                  b  U GR.                  (       a'  G[1        U R                  U R                  5      U l]        OcU GR2                  (       a'  G[5        U R                  U R                  5      U l]        O*U R                  GR7                  U R                  S&SM9U l]        TGR8                  (       a/  U GR;                  G[=        U R                  U R                  SN95        [Y        U Rd                  5       H~  u  nn[        U[r        5      (       d  M  U GR.                  (       a&  G[1        UU R                  5      U Rd                  U'   MU  U R                  GR7                  US&S&SO9U Rd                  U'   M     g s  snf )QNz}Parameter 'config' is deprecated and will be removed in version 0.25.0. Please use 'args' instead. We are setting args=configzMCannot specify both 'config' (deprecated) and 'args'. Please use 'args' only.zParameter 'reward_model' is deprecated and will be removed in version 0.25.0. Please use 'reward_funcs' instead. We are setting reward_funcs=reward_modelzcCannot specify both 'reward_model' (deprecated) and 'reward_funcs'. Please use 'reward_funcs' only.zParameter 'policy' is deprecated and will be removed in version 0.25.0. Please use 'model' instead. We are setting model=policyzOCannot specify both 'policy' (deprecated) and 'model'. Please use 'model' only.zParameter 'ref_policy' is deprecated and will be removed in version 0.25.0. To use the initial model as the reference model, simply omit this parameter. The parameter is ignored.zParameter 'data_collator' is deprecated and will be removed in version 0.25.0. The RLOOTrainer does not use a data collator, so this parameter is ignored.	input_idsa  The training dataset contains a column named 'input_ids', indicating that it is pre-tokenized. Support for pre-tokenized datasets is deprecated and will be removed in version 0.25. Please provide the raw dataset (conversational or standard) with a 'prompt' column instead.c                 .    SUR                  U S   5      0$ Npromptr]   decodeexample	tokenizers     R/home/james-whalen/.local/lib/python3.13/site-packages/trl/trainer/rloo_trainer.pyrb   $RLOOTrainer.__init__.<locals>.decode       )"2"27;3G"HII    re   )	fn_kwargsa  The evaluation dataset contains a column named 'input_ids', indicating that it is pre-tokenized. Support for pre-tokenized datasets is deprecated and will be removed in version 0.25. Please provide the raw dataset (conversational or standard) with a 'prompt' column instead.c                 .    SUR                  U S   5      0$ r_   ra   rc   s     rf   rb   rg     rh   ri   /z-RLOOdtypeautozInvalid `dtype` passed to `RLOOConfig`. Expected either 'auto' or a string representing a `torch.dtype` (e.g., 'float32'), but got .r   zYou passed `model_init_kwargs` to the `RLOOConfig`, but your model is already instantiated. The `model_init_kwargs` will be ignored.get_base_modelzWThe `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`
num_labelsr4   zNumber of reward weights (z)) must match number of reward functions ())rn   z)The number of reward processing classes (z-) must match the number of reward functions (z).c              3   B   #    U  H  n[        U[        5      v   M     g 7fN)
isinstancer   ).0dss     rf   	<genexpr>'RLOOTrainer.__init__.<locals>.<genexpr>  s     6wav[]z"o7V7Vavs   z^Iterable datasets are not yet supported in RLOOTrainer. Please use a standard dataset instead.Testimate_tokens)rR   rT   data_collatorrU   rV   rW   rY   rZ           )trainevalmaxlenc                  *   > [        T R                  S9$ )Nr   )r   generation_batch_size)rT   s   rf   <lambda>&RLOOTrainer.__init__.<locals>.<lambda>  s    58R8R+Sri   )r`   
completionrewards
advantages)device_specificzkvLLM is not available and `use_vllm` is set to True. Please install vLLM with `pip install vllm` to use it.serverzhttp://:)base_urlconnection_timeoutdevicecolocatezvllm_tensor_parallel_size (z) must divide world size (z	) evenly.RANK
LOCAL_RANK
WORLD_SIZEMASTER_ADDR	localhostMASTER_PORT12345external_launcheri   )	rR   tensor_parallel_sizegpu_memory_utilizationmax_num_seqsmax_model_lendistributed_executor_backendseedmax_num_batched_tokens
model_implz6vllm_mode must be either 'server' or 'colocate', got 'z'.)max_new_tokens	do_samplepad_token_idbos_token_ideos_token_idtemperaturetop_ptop_kmin_prepetition_penaltycache_implementationi   max_batch_tokensi   
num_blocks   
block_sizeF)evaluation_mode)	ref_modelaccelerator)r   device_placement )warningswarn
ValueErrorcolumn_namesmaprv   strconfig_name_or_pathsplitr6   model_init_kwargsgettorchrn   getattrr   from_pretrainedtransformersarchitecturesloggerwarninghasattrinspect	signatureforward
parameterskeysrq   model_kwarg_keysr&   rI   r2   r   r   re   r   	TypeError	pad_token	eos_tokenr   r   listreward_func_names	enumerater   r   Moduleappend__name__rS   reward_weightslentensorfloat32oneszipr   r   rX   max_prompt_lengthmax_completion_lengthnum_generationsr   r   r   r   r   use_transformers_pageduse_vllm	vllm_modevllm_gpu_memory_utilizationvllm_tensor_parallel_sizenormalize_advantagesmask_truncated_completionsreward_clip_rangeshuffle_datasetr   dictanyvaluesNotImplementedErrornum_iterationsepsilonepsilon_lowepsilon_high_step_buffered_inputswarnings_issuedsuper__init__r<   betar   r   disable_dropoutr8   r   _metrics_total_train_tokenslog_completionswandb_log_unique_promptsnum_completions_to_printr   r   _logsr   r   r/   ImportErrorr   is_main_processvllm_server_base_urlvllm_server_hostvllm_server_portr.   vllm_server_timeoutvllm_clientinit_communicatorcudacurrent_devicenum_processesdistributednew_subgroups_by_enumerationrangetp_groupprocess_indexosenvironlocal_process_indexrJ   name_or_pathrT   per_device_train_batch_sizesteps_per_generationvllm_model_implllmvllm_guided_decoding_regexguided_decoding_regex_last_loaded_stepwait_for_everyoner   r   generation_kwargsupdater   generation_configmodel_accepts_loss_kwargsrR   add_model_tags
_tag_namesis_deepspeed_enabledr0   is_fsdp_enabledr1   prepare_modelsync_ref_modeladd_callbackr5   )selfrR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r   reward_modelpolicy
ref_policyr|   rb   
model_namer   model_idrn   architecturere   ireward_funcreward_processing_classr   _r   r  	__class__s      `                          rf   r   RLOOTrainer.__init__   s   * MM- | !pqq#MMS #+   MM. } !rss!MM` $MMI -444MM_J *--fN^@_-`M#|7P7P(PMM_J (++F{L\>]+^L <",UC"8"8ell>X>XJ#))#.r2JE23D !228beS!!H%))'2E%--&EME3''u--2!'* BBGK 
  //9F"<1E1Ea1HIL 00O=NOE||11H%%1? 5"233 emm,77<<>""5#7#7#9#A#ABMMRRT 	 "'8':':z%QZ?[?[&u4@E #,<<U\\=W=WX &77(22I(*ABB(Iuvv&"+"5"5I",,%22%22 ,--(>L!#'5NA{+s++"D"T"T#,-#1B#Q ,q/29955&&--l1o.D.D.R.R.X.XY\.]^`.ab&&--l1o.F.FG 6 ) *4&&'3|+<< 0T5H5H1I0J K""%l"3!4A7  #(,,t/B/B%--"XD"'**S->emm"TD %,)-\1B(B%5t<<)B(C%()S->>;C@Y<Z;[ \%%(%6$7r; 
 :C3G`Co9p5A5'+77*2.;.K.KKL^L^LlLl.m+*77?8O8Y8Y+5 3J2V2V""//F)!, :q *C& "&!7!7%)%?%?"#33++ZZ
ZZ
ZZ
"&"9"9&*&A&A#+/+K+K()-)G)G&$($=$=!*.*I*I'!%!7!7  $33 }o66,88<..36wamatatav6w3w3w &p 
 #11<<151B1B1ND--TXT`T`
 !% 48/0"'%-! 	 		
 II	99!DN5!! "DN  //9F"<1E1Ea1HIL)99(XFWXDN $U+~~)(8 #.d"3[=NO#$ #33(,(E(E%(,(E(E% 4#=#=>t'A'AB"#STt'A'AB	

 	D1===$&&!4 
 ~~)##3300<#'#<#<%,T-B-B,C1TEZEZD[#\'18X\XpXp'qD$$$66ejj>W>W>Y6Z:- ''558V8VVZ[[$5d6T6T5U V ,,::;9F 
 11A5 (-'8'8'U'U &+4+;+;+I+ITMkMk+k%l%l !q4+I+I'IAPQEUYUsUsKs!tu%l($DM1 &))9)9)G)G%H

6"+.t/?/?/S/S+T

<(+.t/?/?/M/M+N

<(,.JJNN=+,V

=),.JJNN=',R

=)))5$:T:T:`$($:$:T=W=W$WM$(M,,)-)G)G+/+K+K!%!F!F44"5ii44"5 #01D))774;Y;YY+/#yy88  !#YZ^ZhZhYiik!lmm *.)H)HD&%'D"
 ..0 #'"<"<! ) 6 6 ) 6 6 ) 6 6#//&*&=&=(,(A(A! **8;!"4526!,/25!,/%%1!(()?)?@%5%J8I%JD"
 */& 	

!!$//2>>%(((!24>>4CSCS!T%%%!-dnnd>N>N!O!%!1!1!?!?`d!?!e2T^^Y]YiYijk'(9(9:NA{+77,,,+<[$JZJZ+[D%%a( ,0+;+;+I+I#TD ,J ,D%%a( ;ks   )<ADc                 0    U R                   c	  S/U l         g g )Nr`   )_signature_columns)r%  s    rf    _set_signature_columns_if_needed,RLOOTrainer._set_signature_columns_if_neededu  s    
 ""*'/jD# +ri   c                 x   U R                   c  [        S5      eU R                   nU R                  n[        5       (       a0  [	        U[
        R                  5      (       a  U R                  USS9nOU R                  USS9nU R                  U R                  R                  -  UU R                  R                  U R                  R                  U R                  R                  S.n[	        U[        R                   R"                  R$                  5      (       d  U R'                  5       US'   U R                  R(                  US'   [+        [,        U R                  R                  U R                  R.                  S9US'   U R                  R0                  US	'   U R2                  R5                  [7        U40 UD65      $ )
Nz+Trainer: training requires a train_dataset.training)description)
batch_size
collate_fnnum_workers
pin_memorypersistent_workerssampler	drop_last)r;  rankworker_init_fnprefetch_factor)rU   r   r|   r$   rv   datasetsr   _remove_unused_columns"_get_collator_with_removed_columns_train_batch_sizerT   r  dataloader_num_workersdataloader_pin_memorydataloader_persistent_workersr   utilsdatar   _get_train_samplerdataloader_drop_lastr   r#   r  dataloader_prefetch_factorr   preparer   )r%  rU   r|   dataloader_paramss       rf   get_train_dataloader RLOOTrainer.get_train_dataloader  st   %JKK**** ""z-AQAQ'R'R 77S]7^M CCM_iCjM 004993Q3QQ'99;;))99"&))"I"I
 -)9)9)I)IJJ+/+B+B+Di(-1YY-K-Kk*29)I)IPTPYPYPgPg3./ 48993W3W/0''
=(VDU(VWWri   datasetreturnc           	         Uc  U R                   n[        UU R                  U R                  R                  U R                  -  U R
                  U R                  R                  -  U R                  U R                  R                  S9$ )N)data_sourcemini_repeat_countr9  repeat_countshuffler   )	rU   r7   r   rT   r   r   r  r   r   )r%  rS  s     rf   rL  RLOOTrainer._get_train_sampler  sq    2 ?((G"22yy66$:N:NN,,tyy/M/MM((
 	
ri   c                 T    [        UU R                  U R                  R                  S9$ )N)rV  rW  r   )r7   r   rT   r   )r%  rV   s     rf   _get_eval_samplerRLOOTrainer._get_eval_sampler  s&    $"22
 	
ri   c                    U=(       d    UR                  S5      n/ n/ n[        SUR                  S5      U5       H  n	X)X-    n
X9X-    nXS.nSU R                  ;   a  US-   US'   SUS'   U" S
0 UD6R                  nUSS2SS2SS24   nUSS2U* S2SS24   nXR                  -  nU
SS2U* S24   n[        X5      nUR                  U5        U(       d  M  [        R                  " 5          [        U5      nSSS5        UR                  W5        M     [        R                  " USS	9nU(       a  [        R                  " USS	9OSnUU4$ ! , (       d  f       NZ= f)z<Compute log-probs and (optionally) entropies for each token.r   )r]   attention_masklogits_to_keepr4   F	use_cacheNrm   dimr   )sizer  r   logitsr   rB   r   r   no_gradr9   cat)r%  rR   r]   r_  r`  r9  compute_entropy	all_logpsall_entropiesstartinput_ids_batchattention_mask_batchmodel_inputsre  completion_idslogps	entropiess                    rf   "_get_per_token_logps_and_entropies.RLOOTrainer._get_per_token_logps_and_entropies  sj     49>>!#4
	1innQ/<E'0BCO#1%:L#M  *9aL  4#8#881?!1C-.(-L%*\*11FAssAI&FA/23F ...F,Q0@-@AN)&AEU#]]_ 3F ;I %$$Y/= =@ 		)+7FEIIm3D	i %_s   .E
E	extra_prefixesc                 ^    U=(       d    / nS/U-   nU H  nUR                  US5      nM     U$ )Nz_checkpoint_wrapped_module. )replace)r%  namert  prefixesprefixs        rf   _fix_param_name_to_vllm#RLOOTrainer._fix_param_name_to_vllm  s8    '-212^CF<<+D ri   modulerz  c                 @   Uc
  [        5       nUR                  5        H%  u  pEU(       a  U SU 3OUnU R                  XVUS9  M'     [        U[        5      (       Ga1  [        R
                  " USSS9   UR                  5        H  u  pxU(       a  U SU 3OUn	U R                  U	S/S9n	X;   a  M-  UR                  U	5        U R                  S:X  aB  U R                  R                  (       a'  U R                  R                  XR                  5        M  U R                  S	:X  d  M  U R                  R                   R"                  R$                  R&                  R(                  n
U
R+                  XR                  4/5        M     SSS5        gg! , (       d  f       g= f)
zdMemory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM.Nrp   )rz  visitedF)recurse	writebackz_fsdp_wrapped_module.rt  r   r   )setnamed_children_sync_fsdp1_params_to_vllmrv   FSDPsummon_full_paramsnamed_parametersr{  addr   r   r   r  update_named_paramrK  r  
llm_enginemodel_executordriver_workermodel_runnerrR   load_weights)r%  r}  rz  r  
child_namechild_modulechild_prefix
param_nameparam	full_name	llm_models              rf   r  &RLOOTrainer._sync_fsdp1_params_to_vllm
  s_    ?eG(.(=(=(?$J7=fXQzl3:L++7 ,  )@ fd##((%P)/)@)@)B%J<B6(!J< 8
I $ < <YXoWp < qI + KK	*~~1d6F6F6V6V((;;IzzR:5$(HH$7$7$F$F$T$T$a$a$g$g	!..JJ0G/HI *C QP $PPs   3B/F&AF
Fc                 4   UR                  5       R                  5        H  u  p#UR                  (       a%  UR                  [        R
                  " S5      5      nUR                  5       nU R                  S:X  a8  U R                  R                  (       a  U R                  R                  X#5        M  U R                  S:X  d  M  U R                  R                  R                  R                  R                   R"                  nUR%                  X#4/5        M     g )Nr  r   r   )
state_dictitemsis_cputor   r   full_tensorr   r   r   r  r  r  r  r  r  r  rR   r  )r%  r}  rx  r  r  s        rf   _sync_fsdp2_params_to_vllm&RLOOTrainer._sync_fsdp2_params_to_vllm%  s    !,,.446KD||f!56%%'E~~)d.>.>.N.N  33D@:- HH//>>LLYY__	&&7 7ri   c                 	   U R                   R                  R                  nUS L=(       a    UR                  S:H  nU(       a  SS KnUR
                  R                  nO[        n[        U R                  5      (       Ga0  U" [        U R                  R                  5       5      5         U R                  R                  5         U R                  (       a}  [        U R                   R                  SS 5      nU(       a  [        USS5      OSnUS:X  a  U R                  U R                  5        GO]US:X  a  U R!                  U R                  5        GO:U R                  R#                  5        GH  u  pxUR%                  S5      R'                  SS	5      nU R                  R(                  U;   a  MC  S
U;   a  MK  U R+                  US/S9nU R,                  S:X  aB  U R                   R.                  (       a'  U R0                  R3                  XxR4                  5        M  U R,                  S:X  d  M  U R6                  R8                  R:                  R<                  R>                  R                  n	U	RA                  XxR4                  4/5        GM     U R                  RC                  5         S S S 5        GOU R                  (       a}  [        U R                   R                  SS 5      nU(       a  [        USS5      OSnUS:X  a  U R                  U R                  5        GO%US:X  a  U R!                  U R                  5        GOU R                  R#                  5        H  u  pxU R+                  U5      nU" U/5         U R,                  S:X  aA  U R                   R.                  (       a&  U R0                  R3                  XxR4                  5        OkU R,                  S:X  a[  U R6                  R8                  R:                  R<                  R>                  R                  n	U	RA                  XxR4                  4/5        S S S 5        M     U R,                  S:X  a6  U R                   R.                  (       a  U R0                  RE                  5         g U R,                  S:X  a  U R6                  RE                  5         g g ! , (       d  f       N= f! , (       d  f       GM{  = f)N   r   fsdp_pluginfsdp_versionr4   r(   zbase_model.model.z.base_layerrv  original_modulezmodules_to_save.default.r  r   r   )#r   statedeepspeed_plugin
zero_stage	deepspeedzeroGatheredParametersr   r   rR   r   r   merge_adapterr!  r   r  r  r  removeprefixrw  rz  r{  r   r   r  r  rK  r  r  r  r  r  r  unmerge_adapterreset_prefix_cache)
r%  r  zero_stage_3r  gather_if_zero3r  r  rx  r  r  s
             rf   _move_model_to_vllmRLOOTrainer._move_model_to_vllm2  s     ++11BB't3X8H8S8SWX8X'nn??O)O$$ !djj&;&;&=!>?

((* '' #*$*:*:*@*@-QU"VKNY7;#J_`L#q(77 JJ &*77

C (,zz'B'B'D#001DEMMm]_`::,,4$,4$#;;DRlQm;n>>X5$:J:J:Z:Z ,,??jjQ!^^z9(,(;(;(J(J(X(X(e(e(k(kI%22T::4F3GH (E  

**,C @?J ##%d&6&6&<&<mTRJUw{NAF[\1$33DJJ?!Q&33DJJ?#'::#>#>#@KD77=D(%1>>X5$:J:J:Z:Z ,,??jjQ!^^z9(,(;(;(J(J(X(X(e(e(k(kI%22T::4F3GH 21 $A >>X%$*:*:*J*J//1^^z)HH'') *s @?^ 21s!   'E?R0*A:R05B=S0
R>
S	generation_batchc                 ^   U R                   R                  (       a  SOSnUS:X  a  U R                  R                  U R                  -  nU R
                  U-  S:X  d  U R                  cg  U R                  U5      n[        U5      n[        U5      n[        XR                  R                  5      nU Vs/ s H  n[        U5      PM     snU l        U R                  U R
                  U R                  R                  -     nU =R
                  S-  sl        U$ U R                  U5      nU$ s  snf )Nr~   r   r   r4   )rR   r7  rT   r  r   r   r   _generate_and_score_completionsrD   rC   rE   rG   )r%  r  modegenerate_everygeneration_batchesbatchinputss          rf   _prepare_inputsRLOOTrainer._prepare_inputs~  s	   " **--w67?!YY;;d>Q>QQNzzN*a/43H3H3P#'#G#GHX#Y #=>N#O #89I#J %67GIgIg%h"Zl(mZlQV)Ee)LZl(m%**4::		8V8V+VWFJJ!OJ
  99:JKF )ns   3D*c           
        > U R                   R                  n[        R                  " [	        U5      [	        U R
                  5      US9nUS    Vs/ s H  owS;  d  M
  UPM     nnU VV	s0 s H  owU V	s/ s H  oU   PM	     sn	_M     n
nn	U R                  U
S'   [        [        U R
                  U R                  U R                  5      5       GHk  u  nu  pn[        X5         [        U[        R                  5      (       a  [        US   5      (       aE  [        X#5       VVs/ s H  u  nnSUU-   0PM     nnnU Vs/ s H  n[!        UU5      S   PM     nnO#[        X#5       VVs/ s H  u  nnUU-   PM     nnnU" USSS	S
S9n["        TU ]I  U5      n[        R&                  " 5          U" S0 UD6R(                  S S 2S4   US S 2U4'   S S S 5        O[U" SX#US.U
D6nU Vs/ s H  nUb  UO[        R*                  PM     nn[        R,                  " U[        R.                  US9US S 2U4'   S S S 5        GMn     [        R0                  " U5      R3                  SS9R5                  5       (       a  [        R0                  " U5      R3                  SS9R7                  SS9S   S   nU
R9                  5        VVs0 s H  u  nnUS:w  d  M  UUU   _M     nnnUU   US'   UU   US'   [:        R=                  SU S35        [?        U5      nU$ s  snf s  sn	f s  sn	nf s  snnf s  snf s  snnf ! , (       d  f       GN= fs  snf ! , (       d  f       GM  = fs  snnf )Nr   r   )r`   r   ro  trainer_statemessagestextptTrightFr  return_tensorspaddingpadding_sideadd_special_tokens)promptscompletionsro  rn   r   r4   rb  )as_tupler`   r   z=All reward functions returned None for the following kwargs:
zH
Please ensure that at least one reward function returns a valid reward.r   ) r   r   r   zerosr   rS   r  r   r   rX   r   r,   rv   r   r   r*   r)   r   r  inference_modere  nanr   r   isnanallr   nonzeror  r   r   r   )r%  r  r  r  completion_ids_listr   rewards_per_funckeyr   rd   reward_kwargsr,  r-  r.  reward_func_namepcr  xtextsreward_inputsoutput_reward_funcrewardnan_row_idxvaluerow_reward_kwargsr0  s                             rf   _calculate_rewardsRLOOTrainer._calculate_rewards  so   !!(( ;;s7|S9J9J5KTZ[  &ayby7a,aybNRSds6B6s|6BBdS *.o&KT!!4#A#A4CYCYZL
GAG6F #4:k29955(33DGD]#^D]DAqZQ$7D]#^bj kbj]^!4Q8O!PQW!Xbj k36w3L M3L41aQ3L M$;"4T[pu%M %*G$;M$JM--/1<1M}1M1T1TUVXYUY1Z(A. 0/ *5 * 'Qd*hu*& ew)wdvZ`F4F&EII*Udv&)w-2\\:LTYTaTajp-q$QT*) ;:L
4 ;;'(,,,37799++&67;;;BJJTXJYZ[\]^_K:G:M:M:O!:OJCSVZiSi'U;'':O  ! +2+*>h'.9+.Fl+NNPQbPc dZ Z ""23_ cBS $_ k M
 0/ *x% ;:2!s   	LL+
L5LL%AM&L#
:ML)M+L.
=1M.!L4M'M-M?M
ML#M4
M>M
M	r  c                 +  > U R                   R                  nU R                  R                  (       a  SOSnU Vs/ s H  oDS   PM	     nn[        R
                  " U5      nU Vs/ s H  n[        XpR                  5      S   PM     nnU R                  USSSSS9n	[        TOU ]%  U	5      n	U	S	   U	S
   pU R                  b|  [        XU R                  / S9u  pU R                  R                  U
SSS9nU Vs/ s H=  n[        R                  " S[        R                  " U R                   5       S3SU5      PM?     nnU R"                  (       Ga3  U R$                  R&                  U R(                  :w  a+  U R+                  5         U R$                  R&                  U l        U R,                  S:X  Gaa  [/        U5      nU R                   R0                  (       a  US S U R2                  2   n[5        U S5         U R6                  R9                  UU R2                  U R:                  U R<                  U R>                  U R@                  c  SOU R@                  U RB                  c  SOU RB                  U RD                  U RF                  U RH                  RJ                  S9
nS S S 5        OS /[M        U5      -  n[O        WSS9n[Q        U R                   RR                  [M        U5      -  U R                   RR                  S-   [M        U5      -  5      nUU   nGO"U R,                  S:X  Ga  U RF                  (       a  [U        U RF                  S9nOS nSU R:                  U R<                  U R>                  U R@                  c  SOU R@                  U RB                  c  SOU RB                  U RD                  US.nU RH                  RJ                  b%  URW                  U RH                  RJ                  5        [Y        SJ0 UD6nU RZ                  S:  aw  [M        U5      n[]        U RZ                  5       Vs/ s H  nS PM     nn[^        R`                  Rc                  UXRd                  S9  U VVs/ s H  nU  H  nUPM     M     nnnOUnUn[5        U S5         U Rf                  R9                  UUSS9nS S S 5        W VVs/ s H#  nURh                    H  nURj                  PM     M%     nnnU RZ                  S:  aA  [^        R`                  Rm                  U Rd                  S9n[Q        UW-  US-   U-  5      nUU   nW Vs/ s H  n[^        Rn                  " UUS9PM     nn[q        XRr                  S9n[^        Rt                  " X/SS9n GOU Rv                  (       Ga  U R                  US 9n!U Rx                  Rz                  R|                  n"[        5       (       a  S!U Rx                  Rz                  l>        OS"U Rx                  Rz                  l>        [5        U S#5         [        U Rx                  U R                   U RH                  R                  S$9 n#[^        R                  " 5          U R                  (       a  [        R                  " U Rx                  SS%9O	[        5          U RH                  R                  (       a   U#R                  [^        R                  5        O:U RH                  R                  (       a  U#R                  [^        R                  5        [^        R                  " 5          U#R                  U!R                  U R                  SS&9nS S S 5        S S S 5        S S S 5        S S S 5        S S S 5        WR                  5        Vs/ s H  nUR                  PM     nnU Vs/ s H  n[^        Rn                  " UUS9PM     nn[q        XRr                  S'S(9nU!R                   Vs/ s H  n[^        Rn                  " UUS9PM     n
n[q        XRr                  SS(9n
[^        Rt                  " X/SS9n U"U Rx                  Rz                  l>        GO[5        U S)5         [        U Rx                  U R                   U RH                  R                  S$9 n#[^        R                  " 5          U R                  (       a  [        R                  " U Rx                  SS%9O	[        5          XsU	S	'   U	S
'   U#R8                  " SJ0 U	DU R                  SS*.D6n S S S 5        S S S 5        S S S 5        S S S 5        U
R                  S5      n$W S S 2S U$24   n
U S S 2U$S 24   nXR                  :H  n%[^        R                  " U%R                  S5      4U%R                  S5      [^        R                  US+9n&U%R                  5       R                  SS9U%R                  SS9   U&U%R                  SS9'   [^        R                  " U%R                  S5      US9R                  U%R                  S5      S5      n'U'U&R                  S5      :*  R                  5       n([        UU(R                  5       5       V)V*s/ s H  u  n)n*U)U*   R                  5       PM     n+n)n*U(R                  S5      n,U R                  (       a3  U%R                  SS9) n-U(U-) R                  S5      R                  5       -  n([^        Rt                  " UU(/SS9n.UR                  S5      n/US:X  a  U RH                  R                  OU RH                  R                  n0[^        R                  " 5          U R                  U R                  U U.U/U05      u  n1nU1U(-  R                  S5      n2U R                  S:w  a  U R                  b!  U R                  U R                  U U.U/U0S,9u  n3nO_U R                   R                  U R                  5      R                  5          U R                  U R                  U U.U/U0S,9u  n3nS S S 5        OS n3S S S 5        U R                  R                  USS-9n4[        US   5      (       aR  / n5[        UU45       H?  u  n6n7U6S   S.   S/:X  a  U6R                  5       S0   OSn8U5R                  S/U8U7-   S1./5        MA     OU4n5U R                  XU5U+5      n9U9U R                  R                  U5      R                  S5      -  R                  SS9n:U R                  (       a*  U:R                  U R                  S   U R                  S   S29n:U R                  S:w  a6  W1W3-
  n;U;U(-  R                  S5      n<[        U<5      n<U:U R                  U<-  -
  n:U:R                  SU R2                  5      n=U=R                  SS9n>U=R                  SS9n?[^        R                  " U?[^        R                  " U?5      5      n@U=R                  SSS39nAUAU=-
  U R2                  S-
  -  nBUBR                  S5      nBU:UB-
  nCU R                  (       a'  WCUCR                  5       -
  UCR                  5       S4-   -  nC[Q        U R                   RR                  [M        U5      -  U R                   RR                  S-   [M        U5      -  5      nWCR                  5       nDUCU   nCUS:X  ab  U R$                  =R                  U R                   R                  U.R                  5       5      R                  5       R                  5       -  slx        U R$                  R                  /U R                  U   S5'   U R                  S:w  a  W;U(-  R                  5       U(R                  5       R                  S6S79-  nEU R                  U   S8   R                  U R                   R                  UE5      R                  5       R                  5       5        U R                   R                  U,5      nFU R                  U   S9   R                  UFR                  5       R                  5       R                  5       5        U R                  U   S:   R                  UFR                  5       R                  5       R                  5       5        U R                  U   S;   R                  UFR                  5       R                  5       R                  5       5        U R                   R                  U%R                  SS95      nGUFUG   nHS[M        UH5      [M        UF5      -  -
  nIU R                  U   S<   R                  UI5        [M        UH5      S:X  a  [^        R                  " SUS9nHU R                  U   S=   R                  WHR                  5       R                  5       R                  5       5        U R                  U   S>   R                  UHR                  5       R                  5       R                  5       5        U R                  U   S?   R                  UHR                  5       R                  5       R                  5       5        G[        U GR                  5       H  u  nJnK[^        R                  " U9S S 2UJ4   5      R                  5       nLU R                  U   S@UK SA3   R                  UL5        G[        U9S S 2UJ4   5      R                  5       nMU R                  U   S@UK SB3   R                  UM5        M     U R                  U   SC   R                  U>R                  5       R                  5       5        U R                  U   SD   R                  U?R                  5       R                  5       5        U R                  U   SE   R                  W@R                  5       R                  5       R                  5       5        U GR                  S   GR	                  [/        U5      5        U GR                  SF   GR	                  [/        U45      5        G[        U GR                  5       H>  u  nJnNU GR                  SG   UN   GR	                  U9S S 2UJ4   R                  5       5        M@     U GR                  SH   GR	                  WDR                  5       5        U
UUU(W2WCSI.nU$ s  snf s  snf s  snf ! , (       d  f       GN= fs  snf s  snnf ! , (       d  f       GN*= fs  snnf s  snf ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= fs  snf s  snf s  snf ! , (       d  f       GNN= f! , (       d  f       GNX= f! , (       d  f       GNb= f! , (       d  f       GNl= fs  sn*n)f ! , (       d  f       G	N= f! , (       d  f       G	N= f)KNr~   r   r`   r  TleftFr  r]   r_  )protected_tokens)skip_special_tokensclean_up_tokenization_spacesz^(z)+rv  r   zvLLM.generaterm   r}   )
r  nr   r   r   r   r   
max_tokensr  r  r   )from_processr4   r   )regex)r  r   r   r   r   r   r  guided_decoding)group)sampling_paramsuse_tqdmr   )padding_valuerb  )r  paged_attention
sdpa_pagedztransformers.generate_batch)gather_deepspeed3_params)r  )r  progress_barr  )r  r  ztransformers.generate)r  disable_compiler  )r9  )r  role	assistantcontent)r  r  )minmax)rc  keepdimg-C6?
num_tokens      ?r  klzcompletions/mean_lengthzcompletions/min_lengthzcompletions/max_lengthzcompletions/clipped_ratioz"completions/mean_terminated_lengthz!completions/min_terminated_lengthz!completions/max_terminated_lengthzrewards/z/meanz/stdr  
reward_stdfrac_reward_zero_stdr   r   r   )
prompt_idsprompt_maskro  completion_mask	old_logpsr   r   )r   r   rR   r7  copydeepcopyr+   rW   r   r  r   rF   batch_decoderesubescaper   r   r  global_stepr  r  r   r   r   r   r,   r  generater   r   r   r   r   r   r  rT   r  r   r   slicer  rL   r  rK   r   r  r   r	  all_gather_objectr  r  outputs	token_idsget_rankr   r@   r   rg  r   model_wrappedr   _attn_implementationr%   r3   ds3_gather_for_generationrf  r!  r  r  r   bf16r  bfloat16fp16float16r  generate_batchr]   r  r   generated_tokensrd  r   fulllongintargmaxr   arangeexpand	unsqueezer   booltolistsumr   r  per_device_eval_batch_sizerr  r   r   unwrap_modeldisable_adapterr*   popr   r  r   nansumr   clampr   viewmeanstdisclose
zeros_liker   clonenum_input_tokens_seenitemr   nanmeanfloatr  r  r  r   r   r?   r   extend)Pr%  r  r   r  r  r  original_promptsrd   prompts_textprompt_inputsr  r  r  all_prompts_textordered_set_of_promptsro  process_slicer  r  r  	orig_sizer/  gathered_promptssublistr  vllm_inputsall_outputsr  outputlocal_rank_in_grouptp_sliceidsprompt_completion_idspaged_prompt_inputsprevious_attnunwrapped_modelprompt_lengthis_eoseos_idxsequence_indicesr  rowmask_rowr  completion_lengthstruncated_completionsr_  r`  r9  old_per_token_logpsr  ref_per_token_logpscompletions_textr  r`   r   	bootstrapr  r   per_token_klr  grouped_rewardsmean_grouped_rewardsstd_rewardsis_std_zerogrouped_sum	baselinesr   all_process_advantagesmean_klagg_completion_lengthsagg_terminated_with_eosterm_completion_lengthsclipped_completions_ratior,  r  mean_rewardsstd_func_rewardsrx  r0  sP                                                                                  rf   r  +RLOOTrainer._generate_and_score_completions  s    !!((**--w6(./1X;/
  ==1kqrkq`g1';P;PQRZ[kqr--$ . 
 />"/"<mL\>]K!!- 'E)?)?RT'#J  00==TY > L _kk^jVZBFFb4>>)B(C2#FDQ^jLk ===zz%%)?)??((*)-)?)?& ~~)#0#> ##33 .>>UAUAU>U-V**4A)-)9)9)B)B$:"22/3/F/F(,(8(8"&**(,

(:"

)-);#'+'A'A262L2L.2ii.I.I *C * BA '+Vc2B.C%CN "7~TU!V %$$22S\A%%33a73w<G! "0!> :---&:A[A[&\O&*O *.*A*A#'#3#3!ZZ#'::#5R4::$(JJ$6SDJJ"&"<"<'6	%! 99..:%,,TYY-H-HI"0"E3D"E11A5 !$L 1I6;D<Z<Z6['\6[6[$'\%%778H,^k^k7l9I'[9IgSZaSZ9I$'[$ (4$.&t_="&(("3"3KQ`kp"3"qK > CN!l+w\c\k\kRX&"2"2\k"2+!l11A5 +0*;*;*D*D4==*D*Y'$%89%DGZ]^G^bkFklH%3H%=N KYY.3ell3v>.NY ?P?PQN$)IIz.JPQ$R!((("&"7"7\"7"J ..55JJM(**AR""))>AM""))>!$(EF+&&(8(8SWS\S\SvSv$NRNbNb''(:(:EJhshuu 99>>#&&u~~6YY^^#&&u}}5))+"1"@"@+55I_I_ns #A #K , v   G  EPDVDVDXYDX&f55DXNYJXY.3ell3v>.NY ?P?P_fgNFYFcFcdFcs%,,s6:FcJdZ7H7HW]^J$)IIz.JPQ$R!=JD%%: "$(?@+&&(8(8SWS\S\SvSv$NRNbNb''(:(:EJhshuuNXKk*M:J,K(7(@(@ )#)7;7M7M_c)% v   A 'OOA.M.q.=./@AJ21mn3DEN  #4#44**fkk!n.AejjY_`%+ZZ\%8%8Q%8%?

q
@Q%R

q
!" <<AvFMMfkkZ[n^`a+w/@/@/CCHHJ LO~_n_s_s_uKvwKv-#xs8}335Kvw -003 **%+ZZAZ%6$6!-2G1G0R0RST0U0Y0Y0[[O K#AqI',,Q/>BgoTYY::SWS\S\SwSw
]]_%)%L%L

%&" ->CCAFI yyC>>--1-T-T-&&#- .U .*' ))66tzzBRRT151X1X JJ1**'1 2Y 2.+Q UT '+#? D  00==nbf=gVAY''K&)'3C&D"
7=bz&7I[7XFJJL3^`	""[YQ[E[$\#]^ 'E +K
  226[Zmn $d&9&9&<&<V&D&N&Nq&QQYY^_Y` !!mm(>(>q(AtG]G]^_G`maG 99.1DDL055b9BB		B.G!,,r4+?+?@.333:%))a)0mmK1A1A+1NO &))a)> ?2t7K7Ka7OP	NN2&	y(
 $$$z'88Z^^=MPT=TUJ **S\9++a/3w<?
 ",!1!1!3.
 7?JJ,,0@0@0G0GHZHZH\0]0a0a0c0h0h0jj,-1ZZ-M-M,NdL) 99#o5::<?R?R?T?Z?Z_b?Z?ccGMM$%,,T-=-=-D-DW-M-U-U-W-\-\-^_ "&!1!1!8!89K!Ld56==>T>Z>Z>\>a>a>c>h>h>jkd45<<=S=Y=Y=[=_=_=a=f=f=hid45<<=S=Y=Y=[=_=_=a=f=f=hi #'"2"2"9"9&***:K"L"89P"Q$%,C(DsKaGb(b$b!d78??@YZ&'1,&+kk!F&C#d@AHHI`IfIfIhImImIoItItIvwd?@GGH_HeHeHgHkHkHmHrHrHtud?@GGH_HeHeHgHkHkHmHrHrHtu $-T-C-C#DA ==)9!Q$)?@EEGLMM$(+;*<E BCJJ<X%&6q!t&<=BBDMM$(+;*<D ABIIJZ[	 $E
 	dH%,,-A-F-F-H-M-M-OPdL)001A1A1C1H1H1JKd23::;;L;L;N;S;S;U;Z;Z;\] 	

8##M,$?@

< ''6F(GH !7!78GAtJJy!$'../?1/E/L/L/NO 9

< ''(>(E(E(GH %&,."$
 i
 0 s. l& BAb (]'[ >= "m Z4 ,+ vu    GF  ZYd vu    A@2 xL UT- _s  AR%"AR#7AAR(3B%AR-&AR?!ASAS
4*AS6AS"4AT/	AT;ATBAS9	%&AS'AS9	ATAT#AT/AU!AU$AU4AVAU4";AU",AU		AU"AU4AV! AVB(AV00!AVAV0R-
AR<S

ASS'
AS6S1AS9	S9
ATTATT
ATTATT
AT,	T'AT/T/
AT>U
AUUAU"U"
AU1U,AU4U4
AV	U>AVV
AVV
AV-	V(AV0V0
AV?c                 H    U(       a  [        S5      eU R                  X5      $ )Nz2The RLOOTrainer does not support returning outputs)r   _compute_loss)r%  rR   r  return_outputsnum_items_in_batchs        rf   compute_lossRLOOTrainer.compute_loss4  s"    QRR!!%00ri   c                 P   US   US   pCUS   US   pe[         R                  " X5/SS9n[         R                  " XF/SS9nUR                  S5      n	U R                  UUUU	SS9u  pX-  R	                  S5      nUS	   nX-
  nUS
   n[         R
                  " U5      n[         R                  " USU R                  -
  SU R                  -   5      nUU-  nUU-  n[         R                  " UU5      * nUR                  5       nU R                  R                  (       a  SOSnX-  R	                  5       UR	                  5       R                  SS9-  nU R                  U   S   R                  U R                  R!                  U5      R#                  5       R%                  5       5        USU R                  -
  :  US:  -  nUSU R                  -   :  US:  -  nUU-  nU R                  R!                  UR'                  5       R                  5       5      nU R                  U   S   R                  UR#                  5       R%                  5       5        U R                  U   S   R                  [)        U5      R%                  5       5        U R                  R!                  UR'                  5       R                  5       5      nU R                  U   S   R                  UR#                  5       R%                  5       5        U R                  U   S   R                  [+        U5      R%                  5       5        U R                  R!                  UR'                  5       R                  5       5      nU R                  U   S   R                  UR#                  5       R%                  5       5        U$ )Nr  r  ro  r  r4   rb  T)rh  r  r   r~   r   r  r  entropyr   zclip_ratio/low_meanzclip_ratio/low_minzclip_ratio/high_meanzclip_ratio/high_maxzclip_ratio/region_mean)r   rg  rd  rr  r%  expr+  r   r   r  r-  rR   r7  r   r   r   r   r4  r3  r5  r>   r=   )r%  rR   r  r  r  ro  r  r]   r_  r`  per_token_logpsrq  rp  r  	log_ratior   coef_1coef_2per_sequence_loss1per_sequence_loss2per_sequence_losslossr  mean_entropyis_low_clippedis_high_clippedis_region_clippedgathered_low_clipgathered_high_clipgathered_clip_ratios                                 rf   rg  RLOOTrainer._compute_loss:  s_   "("6}8MK*01A*BFK\D]IIz:B	K#AqI',,Q/ &*%L%L  &M &
" !277:;'	%	 L)
9%VQ)9)9%91t?P?P;PQ#j0#j0"YY'9;MNN %%' **--w6 "388:_=P=P=R=X=X]`=X=aadI&--d.>.>.E.El.S.[.[.].b.b.de !1t'7'7#77JNK!A(9(9$99j1nM*_< ,,33N4H4H4J4O4O4QRd1299:K:S:S:U:Z:Z:\]d0188@Q9R9W9W9YZ!--44_5J5J5L5Q5Q5STd23::;M;U;U;W;\;\;^_d1299&AS:T:Y:Y:[\"..556G6M6M6O6T6T6VWd45<<=P=X=X=Z=_=_=abri   ignore_keysc                 >   U R                  U5      n[        R                  " 5          U R                  5          U R	                  X5      nS S S 5        WR                  5       R                  5       nS S S 5        WS S 4$ ! , (       d  f       N9= f! , (       d  f       N$= fru   )r  r   rf  compute_loss_context_managerrj  r-  detach)r%  rR   r  prediction_loss_onlyr  rv  s         rf   prediction_stepRLOOTrainer.prediction_stepl  su    %%f-]]_224((7 599;%%'D  T4 54 _s"   BA=
&B=
B	B
Blogs
start_timec           	      .  > U R                   R                  (       a  SOSnU R                  U   R                  5        VVs0 s H  u  pEU[	        U5      [        U5      -  _M     nnnUS:X  a(  UR                  5        VVs0 s H  u  pESU 3U_M     nnn0 UEUEn[        T
U ]  X5        U R                  U   R                  5         U R                  R                  (       Ga  U R                  (       Ga  [        5       (       ab  [        U R                  S   U R                  S   U R                  S   U R                  S   U R                  R                   U R"                  5        U R$                  R&                  (       Ga  SU R$                  R&                  ;   a  [(        R*                  b  S	S Kn[/        U R                  R                   5      /[        U R                  S   5      -  U R                  S   U R                  S   S
.U R                  S   ESU R                  S   0EnUR1                  U5      n	U R2                  (       a  U	R5                  S/S9n	[(        R                  " S[(        R6                  " U	S905        g g g g g g s  snnf s  snnf )Nr~   r   eval_r`   r   r   r   wandbr   )stepr`   r   	advantage)subsetr  )	dataframe)rR   r7  r   r  r%  r   r   logclearr   r   r   r'   rA   r   r  r  r   rT   	report_tor  runpandasr   	DataFramer   drop_duplicatesTable)r%  r  r  r  r  valmetricspdtabledfr0  s             rf   r  RLOOTrainer.logt  s0   **--w6<@MM$<O<U<U<WX<W3C3s8++<WX 6>:A--/J/hcse}c)/GJ"$"'"D%d!!#+++0D0D0D ""/JJx(JJ|,JJy)JJ|,JJ**11 yy"""w$))2E2E'E%))J_# !!7!789C

8@T<UU"jj2"&**\": jj+	
  L!9 \\%(00++H:+>B		=%++*CDE K`'E" 1E+ Y
 Ks   $JJc                   > U R                   R                  c*  [        U R                   R                  5      R                  nO(U R                   R                  R                  S5      S   nU R                  US9  [        TU ]!  X5        g )Nrl   rm   )r)  )	rT   hub_model_idr   
output_dirrx  r   create_model_cardr   _save_checkpoint)r%  rR   trialr)  r0  s       rf   r  RLOOTrainer._save_checkpoint  sj    99!!)dii22388J//55c:2>J*5 .ri   r)  dataset_nametagsc                    U R                  5       (       d  g[        U R                  R                  S5      (       ac  [        R
                  R                  U R                  R                  R                  5      (       d!  U R                  R                  R                  nOSnUc  [        5       nO$[        U[        5      (       a  U1nO[        U5      n[        U R                  R                  S5      (       a  UR                  S5        S[        R                  ;   a  UR                  S5        UR                  U R                  5        [        R                   " S5      n[#        UUU R$                  UU['        5       (       a+  [(        R*                  b  [(        R*                  R,                  OS[/        5       SUS	S
S9nUR1                  [        R
                  R3                  U R4                  R6                  S5      5        g)a  
Creates a draft of a model card using the information available to the `Trainer`.

Args:
    model_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the model.
    dataset_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the dataset used for training.
    tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
        Tags to be associated with the model card.
Nr   unsloth_versionunslothJOB_IDhf_jobsaD              @inproceedings{ahmadian2024back,
                title        = {{Back to Basics: Revisiting REINFORCE-Style Optimization for Learning from Human Feedback in LLMs}},
                author       = {Arash Ahmadian and Chris Cremer and Matthias Gall{'{e}} and Marzieh Fadaee and Julia Kreutzer and Olivier Pietquin and Ahmet {"{U}}st{"{u}}n and Sara Hooker},
                year         = 2024,
                booktitle    = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), {ACL} 2024, Bangkok, Thailand, August 11-16, 2024},
                pages        = {12248--12267},
                publisher    = {Association for Computational Linguistics},
                editor       = {Lun{-}Wei Ku and Andre Martins and Vivek Srikumar},
            }
            RLOOz`Back to Basics: Revisiting REINFORCE-Style Optimization for Learning from Human Feedback in LLMsz
2402.14740)
base_modelr)  r  r  r  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zeror   rR   r   r  pathisdirr   r  rv   r   r  r  r  r  textwrapdedentr:   r  r"   r  r  urlr;   savejoinrT   r  )r%  r)  r  r  r  citation
model_cards          rf   r  RLOOTrainer.create_model_card  sn   " ))++4::$$o66rww}}TZZM^M^MlMl?m?m**88JJ <5Dc""6Dt9D4::$$&788HHYrzz!HHYDOO$ ??

 )!!**%'9';';		@Ueiimm[_.0%z!

 	TYY%9%9;GHri   ).r   r  r   r   r3  r   r   r   r   r   r   r  r  r  r   r   r   r   r   r  r   r   r   r   r   r   r   r   r   r   r   rS   rX   r   r   r   r   r   r  r   r   r  r   r   r   r   )NNNNNNNN)NNNNNNNNru   )NF)rv  N)FN)NNN)5r   
__module____qualname____firstlineno____doc__r  r   r   r   
RewardFuncr   r
   r6   r   r   r   r   r   r!   tupler   optim	Optimizerlr_schedulerLambdaLRr   r4  rQ  r   rL  r\  r-   Tensorrr  r{  r   r   r  r  r  r   r  r  r  rj  rg  r  r5  r  r  r  __static_attributes____classcell__)r0  s   @rf   rN   rN   d   s   cJ J
 .2<@%)CGnrUYmq59jv.2%g S/)*g JZ(889	g
 z"g  g&> ?@g uWotCwXgOgIhDh?i%ijkg #5)@.)P#QRg $,E2I4PgKh2h,i#jg D12g (5;;#8#898EKKD\D\DeDe;ffgg l+g gR1$X<"
(7*; "
w "
H
 
  /  
c8ELL))	*/  / bHT#Y<O J JC J68 8 I* I*V   $S%c0A*B%B C 	c5s*++	,   D 4  4 lZ4U5<<+<%= =>?Z	c5s*++	,Zx
 1 1
0d PXY]^aYbPc  &FS%Z( &Fhuo &FQU &F &FR/ %)&*,0	FISMFI smFI CcD()	FI FIri   rN   )qr  r   r  r	  r  r   collectionsr   r   
contextlibr   	functoolsr   pathlibr   typingr   r	   r
   r   rC  r   torch.utils.datar   
accelerater   accelerate.utilsr   r   r   r   r   r   r   r   torch.distributed.fsdpr   r  r   r   r   r   r   r   r   r   r   r   r    r!   r"   transformers.trainer_utilsr#   transformers.utilsr$   r%   r&   r'   
data_utilsr)   r*   r+   extras.profilingr,   r-   extras.vllm_clientr.   import_utilsr/   modelsr0   r1   r2   r3   rY   r5   rloo_configr6   rJ  r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   peftrH   rI   vllmrJ   rK   vllm.sampling_paramsrL   r  
get_loggerr   r   r   r   r5  r  rN   r   ri   rf   <module>r     s     	 	   * "   1 1      b b -  C 0    3 u u Z Z E + , e e + #    * * (9 
		H	% 34,U2K)LLM
GI' GIri   