
    h5                    h   S SK r S SKrS SKrS SKrS SKJr  S SKJr  S SKJ	r	  S SK
JrJrJrJr  S SKrS SKrS SKJr  S SKJs  Jr  S SKrS SKJr  S SKJrJrJr  S SKJr  S S	KJ r   S S
K!J"r#  S SKJ$r$J%r%  S SK&J'r'J(r(J)r)J*r*J+r+J,r,J-r-J.r.J/r/J0r0J1r1J2r2  S SK3J4r4  S SK5J6r6J7r7  S SK8J9r9  S SK:J;r;J<r<J=r=  SSK>J?r?J@r@JArA  SSKBJCrC  SSKDJErE  SSKFJGrG  SSKHJIrIJJrJ  SSKKJLrL  SSKMJNrN  SSKOJPrP  SSKQJRrRJSrSJTrTJUrUJVrVJWrWJXrXJYrYJZrZ  \<" 5       (       a  S SK[J\r\J]r]  \1" 5       (       a  S SK^J_r_  \=" 5       (       a,  S SK`Jarb  \ R                  " \b5      \ R                  " S5      :  rdOS rd\G" 5       (       a  S S!KeJfrfJgrg  S S"KhJiri  \2" 5       (       a  S SKjrj\R                  " \l5      rm\\n\,\\o\o/\o\p   4   4   rq " S# S$\/5      rrg)%    N)nullcontext)wraps)Path)AnyCallableOptionalUnion)logging)broadcast_object_listgather_objectis_peft_model)Dataset)version)FullyShardedDataParallel)
DataLoaderIterableDataset)AutoModelForCausalLM"AutoModelForSequenceClassificationAutoTokenizerDataCollatorGenerationConfigPreTrainedModelPreTrainedTokenizerBaseProcessorMixinTrainerTrainerCallbackis_apex_availableis_wandb_available)*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)EvalPredictionseed_worker)OptimizerNames)is_flash_attn_2_availableis_peft_availableis_sagemaker_mp_enabled   )apply_chat_templateis_conversationalmaybe_apply_chat_template)profiling_context)
VLLMClient)is_vllm_available)create_reference_modelprepare_peft_model)unwrap_model_for_generation   )BasePairwiseJudge)OnlineDPOConfig)	SIMPLE_CHAT_TEMPLATEDPODataCollatorWithPaddingdisable_dropout_in_modelempty_cachegenerate_model_cardget_comet_experiment_urlpadprepare_deepspeedtruncate_right)
PeftConfig	PeftModel)amp)__version__z1.10F)LLMSamplingParams)GuidedDecodingParamsc            %         ^  \ rS rSrSrSS/r                S:S\\\R                  \
4   S\\\R                  S4   S\\\\\   4      S	\\   S
\\   S\\   S\\\\4      S\\\\\\
\\\4   4   4      S\\\\4      S\\\\\   4      S\S   S\\\/\4      S\\\      S\\R6                  R8                  \R6                  R:                  R<                  4   S\\\R>                  \R>                  /\R>                  4      S\\\\R                  4      S\\   SS4$U 4S jjjr \!S 5       r"\#S\$S\S\\
\%4   4S j5       r&\'" \(RR                  5      S\*4S j5       r)\'" \(RV                  5      S;S\\\
\4      S\*4S jj5       r+S\S
\S\4S  jr,S;S! jr-S;S" jr.S;S# jr/S$ r0S<S%\R                  S&\
4S' jjr1S%\R                  4S( jr2S;S)\\\
      4S* jjr3 S;S+\\
\\\R>                  4   4   S\\
\\4   4   4S, jjr5S;S- jr6S. r7S;S/ jr8 S;S\R                  S0\\
\\R>                  \%4   4   S1\\4   S\R>                  4S2 jjr9 S;S3 jr:U 4S4 jr;   S=S5\\
   S6\\
   S7\\
\\
   S4   4S8 jjr<S9r=U =r>$ )>OnlineDPOTrainerm   af  
Initialize OnlineDPOTrainer.

Args:
    model (`Union[str, nn.Module, PreTrainedModel]`):
        Model to be trained. Can be either:

        - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
          path to a *directory* containing model weights saved using
          [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
          using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
          `args.model_init_kwargs`.
        - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
    ref_model (`transformers.PreTrainedModel` or `torch.nn.Module` or `None`):
        The reference model to use for training. If None is specified, the reference model will be created from the
        model.
    judge (`BasePairwiseJudge`):
        The judge to use for pairwise comparison of model completions.
    reward_funcs (`Union[RewardFunc, list[RewardFunc]]`, *optional*, defaults to `None`):
        Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
        functions with the prompts and completions and sum the rewards. Can be either:

        - A single reward function: Can be a string (path to model), a [`~transformers.PreTrainedModel`], or a
          custom callable function.
        - A list of reward functions: Must all be of compatible types.

        Note: Only one of `judge`, or `reward_funcs` should be provided.
    args (`OnlineDPOConfig`):
        The online DPO config arguments to use for training.
    data_collator (`transformers.DataCollator`):
        The data collator to use for training. If None is specified, the default data collator
        (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the
        sequences in the batch, given a dataset of paired sequences.
    train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
        The dataset to use for training.
    eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
        The dataset to use for evaluation.
    processing_class ([`~transformers.PreTrainedTokenizerBase`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`):
        Processing class used to process the data. If provided, will be used to automatically process the inputs
        for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
        reuse the fine-tuned model.
    reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
        Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:

        - A single processing class: Used when `reward_funcs` contains only one reward function.
        - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.

        If set to `None`, the tokenizer for each model-based reward function is automatically loaded using
        [`~transformers.AutoTokenizer.from_pretrained`].
    peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
        PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
    compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
        The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
        metric values.
    callbacks (`list[transformers.TrainerCallback]`):
        The callbacks to use for training.
    optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
        The optimizer and scheduler to use for training.
    preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
        The function to use to preprocess the logits before computing the metrics.

.. deprecated:: 0.22.0
    The following parameters are deprecated and will be removed in a future version:

    * `reward_model`: Use `reward_funcs` instead. For example, change `reward_model=model` to `reward_funcs=model`.
    * `reward_processing_class`: Use `reward_processing_classes` instead. For example, change
      `reward_processing_class=tokenizer` to `reward_processing_classes=tokenizer`.
trlz
online-dpoNmodel	ref_modelreward_funcsjudgeargsdata_collatortrain_dataseteval_datasetprocessing_classreward_processing_classespeft_configr<   compute_metrics	callbacks
optimizerspreprocess_logits_for_metricsreward_modelreward_processing_classreturnc                   > X!L a  [        S5      eX l        Ub2  [        R                  " S5        Uc  UnO[        R                  " S5        Ub2  [        R                  " S5        U
c  Un
O[        R                  " S5        [	        S XC4 5       5      nUS:X  a  [        S5      eUS	:  a  Ub  [
        R                  S
[        5        S nX@l        UGb  [        U[        5      (       d  U/n/ U l        UR                  =(       d    0 n[        U5       H  u  nn[        U[        5      (       a  [        R                   " U4SS	0UD6UU'   [        UU   ["        R$                  5      (       aF  U R                  R'                  UU   R(                  R*                  R-                  S5      S   5        M  U R                  R'                  UU   R.                  5        M     X0l        U
c  S /[3        U5      -  n
O<[        U
[        5      (       d  U
/n
O#[3        U
5      [3        U5      :w  a  [        S5      e/ U l        [7        X5       H  u  nn[        U[8        5      (       af  Uc*  [:        R                   " UR(                  R*                  5      nUR<                  c  UR>                  Ul         UR<                  UR(                  l        U R4                  R'                  U5        M     OS U l        / U l        / U l        Ub  URB                  b  [3        URB                  5      [3        U R0                  5      :w  a8  [        S[3        URB                  5       S[3        U R0                  5       S35      e[D        RF                  " URB                  [D        RH                  S9U l!        OC[D        RJ                  " [3        U R0                  5      [D        RH                  S9U l!        OS U l!        URL                  b.  Uc+  Uc(  Ub  [
        R                  S[N        SS9  O[        S5      eUc  [        S5      eU	c  [        S5      eUR                  =(       d    0 n[        U[        5      (       a  UnURQ                  S5      n[        U[D        RR                  5      (       d	  US:X  d  Uc  O:[        U[        5      (       a  [U        [D        U5      nUUS'   O[        SU S35      e[V        R                   " U40 UD6nOUR                  b  [        S5      eUR(                  RX                  U l,        UR(                  RZ                  [\        R^                  " 5       ;   U l0        Uc$  [c        5       (       a!  [        U[d        5      (       a  [g        XU5      nURh                  (       a  U Rk                  X5      nURl                  (       a-  [o        U5        U R                  b  [o        U R                  5        Uc  Uc  [q        U5      U l        O(S U l        O X l        U R                  Rs                  5         Ub0  U H*  n[        U[8        5      (       d  M  URs                  5         M,     URt                  U l:        / / / / / / / / / / / S.U l;        U R0                  b-  / U Rv                  S'   / U Rv                  S '   / U Rv                  S!'   URx                  U l<        SU l=        UR|                  U l>        UR~                  U l?        UR                  U l@        UR                  U lA        UR                  U lB        UR                  U lC        URx                  (       a  UR                  OS U lD        UR                  U lE        UR                  U lF        UR                  U lG        [        U	[        5      (       a  U	R                  nO#[        U	[        5      (       a  U	nO[        S"5      eUR@                  c  UR>                  Ul         UR@                  U l         UR<                  U l        UR                  U lL        [U        U	S#S 5      U lM        [U        U	S$S 5      U lN        [U        U	S%S 5      U lO        S U lP        U R                  b!  UR                  U R                  /5      U lP        Uc  [        U R<                  S&9nS'UR                  S('   [        T U G]U  UUUUUU	UUUUS)9
  [        U R                  S*5      (       a%  U R                  R                  U R                  5        UR                  U l[        U Rx                  (       Ga  [        5       (       d  [        S+5      eU R                  S,:X  a  U R                  R                  (       a  UR                  b  UR                  nOS-UR                   S.UR                   3n[        UUR                  S/9U le        U R                  R                  [D        R                  R                  5       S09  GOS U le        GOU R                  S1:X  Ga  UR                  U R                  U R                  U R                  U R                  R                  U R                  -  URt                  UR                  -   S2U R                  R                  U R                  -  S3S4.	n[        U R                  R                  5      [        R                  S5'   [        U R                  R                  5      [        R                  S6'   [        U R                  R                  5      [        R                  S7'   [        R                  RQ                  S8S95      [        R                  S8'   [        R                  RQ                  S:S;5      [        R                  S:'   [        SL0 UD6U ls        O[        S<U R                   S=35      eUR                  U lu        SU lv        SU R                  U R|                  U R~                  U R                  c  SOU R                  U R                  c  S>OU R                  UR                  S?S@.nUR                  b  UR                  UR                  5        U R                  (       a  [        U R                  SA9USB'   [        SL0 UD6U l{        U R                  R                  5         GOUR                  S'U R<                  UR                  U R                  U R|                  U R                  U R~                  U R                  U R                  Rh                  (       d  S'OS?SC.
nU R                  b  U R                  USD'   UR                  b  UR                  UR                  5        U R                  (       a  SEUSF'   SGUSH'   SIUSJ'   UR                  5        VVs0 s H  u  nnUc  M  UU_M     nnn[        SL0 UD6U l{        U GR                   (       a  U R                  b>  G[        U R                  UR                  UGR                  UGR                  5      U l        U R0                  b[  [        U R0                  5       HA  u  nn[        U[8        5      (       d  M  G[        UU R                  5      U R0                  U'   MC     g g U R                  b6  U R                  GR	                  U R                  GR
                  5      U l        U R0                  b`  [        U R0                  5       HF  u  nn[        U[8        5      (       d  M  U R                  GR                  US'S'SK9U R0                  U'   MH     g g s  snnf )MNz`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the same as `model`, either omit the `ref_model` argument or pass `None`.zThe `reward_model` parameter is deprecated and will be removed in version 0.25.0. Please use `reward_funcs` instead. For example, change `reward_model=model` to `reward_funcs=model`.zfBoth `reward_model` and `reward_funcs` are provided. Using `reward_funcs` and ignoring `reward_model`.zThe `reward_processing_class` parameter is deprecated and will be removed in version 0.25.0. Please use `reward_processing_classes` instead. For example, change `reward_processing_class=tokenizer` to `reward_processing_classes=tokenizer`.zBoth `reward_processing_class` and `reward_processing_classes` are provided. Using `reward_processing_classes` and ignoring `reward_processing_class`.c              3   (   #    U  H  oS Lv   M
     g 7fN ).0xs     X/home/james-whalen/.local/lib/python3.13/site-packages/trl/trainer/online_dpo_trainer.py	<genexpr>,OnlineDPOTrainer.__init__.<locals>.<genexpr>   s     J4Iqd]4Is   r   z2One of `judge` or `reward_funcs` must be provided.r0   zXBoth `judge` and `reward_funcs` are provided. Using `judge` and ignoring `reward_funcs`.
num_labels/zRThe number of reward processing classes must match the number of reward functions.zNumber of reward weights (z)) must match number of reward functions ()dtypezThe `missing_eos_penalty` parameter is deprecated when used with the deprecated `reward_model` parameter. Please use `reward_funcs` instead of `reward_model` to continue using this feature.r&   )
stacklevelzH`missing_eos_penalty` is only supported when `reward_funcs` is provided.z`args` must be provided.z$`processing_class` must be provided.rg   autozInvalid `dtype` passed to `OnlineDPOConfig`. Expected either 'auto' or a string representing a `torch.dtype` (e.g., 'float32'), but got .zYou passed `model_init_kwargs` to the `OnlineDPOConfig`, but your model is already instantiated. This argument can only be used when the `model` argument is a string.)objective/klobjective/entropyobjective/non_score_rewardrewards/chosenrewards/rejectedrewards/accuraciesrewards/marginslogps/chosenlogps/rejectedval/contain_eos_tokenbetaobjective/rlhf_rewardobjective/scores_marginobjective/scoreszWThe `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`image_token_idvision_start_token_idvision_end_token_id)pad_token_idTestimate_tokens)
rG   rK   rL   rM   rN   rO   rR   rS   rT   rU   add_model_tagszkvLLM is not available and `use_vllm` is set to True. Please install vLLM with `pip install vllm` to use it.serverzhttp://:)base_urlconnection_timeoutdevicecolocateexternal_launcheri   )	rG   tensor_parallel_sizegpu_memory_utilization
model_implmax_num_seqsmax_model_lendistributed_executor_backendseedmax_num_batched_tokensRANK
LOCAL_RANK
WORLD_SIZEMASTER_ADDR	localhostMASTER_PORT12345z6vllm_mode must be either 'server' or 'colocate', got 'z'.        F)nrepetition_penaltytemperaturetop_ptop_kmin_p
max_tokens
detokenize)regexguided_decoding)
max_new_tokens	do_sampler|   bos_token_ideos_token_idr   r   r   r   	use_cacher   i   max_batch_tokensi   
num_blocks   
block_size)evaluation_modedevice_placementr\   )
ValueErrorrH   warningswarnsumloggerwarningUserWarningrJ   
isinstancelistreward_func_namesmodel_init_kwargs	enumeratestrr   from_pretrainednnModuleappendconfig_name_or_pathsplit__name__rI   lenrP   zipr   r   r|   	eos_token	pad_tokenreward_weightstorchtensorfloat32onesmissing_eos_penaltyDeprecationWarninggetrg   getattrr   is_encoder_decoder
model_typer   keysis_vision_modelr$   r=   r.   gradient_checkpointing_enable_gradient_checkpointingdisable_dropoutr5   r-   eval
max_lengthstatsuse_vllmnum_generationsr   r   r   r   r   use_transformers_paged	vllm_modevllm_gpu_memory_utilizationvllm_tensor_parallel_sizevllm_model_implr   	tokenizerr   	TypeErrorr   ry   rz   r{   image_tokendecoder4   warnings_issuedsuper__init__hasattrrG   r~   
_tag_namesru   _betar,   ImportErroracceleratoris_main_processvllm_server_base_urlvllm_server_hostvllm_server_portr+   vllm_server_timeoutvllm_clientinit_communicatorcudacurrent_devicename_or_pathrK   per_device_train_batch_sizer   process_indexosenvironlocal_process_indexnum_processesr@   llmvllm_guided_decoding_regexguided_decoding_regex_last_loaded_stepgeneration_kwargsupdaterB   rA   generation_configwait_for_everyoner   itemsr   is_deepspeed_enabledr:   fp16bf16tor   prepare_model)!selfrG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   reward_configsr   ireward_funcreward_processing_class_imodel_idrg   r   r   vllm_kwargsgeneration_paramsr   kv	__class__s!                                   r_   r   OnlineDPOTrainer.__init__   ss   * X 
 # #MMw
 #+&
 #.MM` )0,C)Z JU4IJJQQRRa n  $
 #lD11 ,~%'D" !% 6 6 <""+L"9;k3//&H&X&X#'01'5F'LO l1oryy99**11,q/2H2H2V2V2\2\]`2abd2ef**11,q/2J2JK #: !- )0-1FS5F,F) 94@@-F,G)01S5FF$l  .0D*:=>W:f6);k?;;084A4Q4QR]RdRdRrRr4s10==E>W>a>a1;6O6\6\K&&3..556OP ;g !%D%'D"-/D* #"".t**+s43D3D/EE$4S9L9L5M4N O&&)$*;*;&<%=Q@  ',ll43F3Femm&\#&+jjT5F5F1Gu}}&]#"&D##/L4HU]'j& 	   !!kll<788 #CDD 228beS!!H &))'2E%--&EME3''u--2!'* OOTgUVX 
 )88WEVWE%%1 \  #(,,"A"A$||66:d:i:i:kk"'8':':z%QZ?[?[&u4@E &&77DE $U+~~)(8 "!7!>!%&NNN! #+k?;;$$&  , // !#*,  ""$! %'

 (24DJJ./46DJJ01-/DJJ)*  ++ZZ
ZZ
ZZ
"&"9"9&*&A&A#+/==d+/+K+K()-)G)G&#33 &77(22I(*ABB(Iuvv&"+"5"5I",,%22%22 &&68H$O%,-=?VX\%]"#*+;=RTX#Y *(//1D1D0EFD  6DDUDUVM 48/0''%-+!*G 	 	
 4::/00JJ%%doo6YY
 ===$&&!4 
 ~~)##3300<#'#<#<%,T-B-B,C1TEZEZD[#\'18X\XpXp'qD$$$66ejj>W>W>Y6Z'+D$:- #//,0,J,J.2.N.N"&"6"6$(II$I$IDLjLj$j%)__t7J7J%J4G ,,::d>\>\\.2 &))9)9)G)G%H

6"+.t/?/?/S/S+T

<(+.t/?/?/M/M+N

<(,.JJNN=+,V

=),.JJNN=',R

=)-- #YZ^ZhZhYiik!lmm)-)H)HD&%'D" &*&=&=#//#zz1tzz $

 2

"11#	! %%1!(()?)?@))7KRVRlRl7m!"34%3%H6G%HD"
 ..0 #'"5"5! $ 1 1 ) 6 6 $ 1 1#//&*&=&=)-)I)ITu! zz%-1ZZ!'*%%1!(()?)?@**8;!"4526!,/25!,/2C2I2I2K ]2K$!QqA2K ]%5%J8I%JD"$$$~~)!2NND$D$DdiiQUQZQZ"   ,&/0A0A&BNA{!+??/@dN^N^/_))!, 'C -
 ~~)!%!2!243C3C3J3J!K  ,&/0A0A&BNA{!+??/3/?/?/M/M'PT 0N 0))!, 'C -# !^s   '
}5}c                     [        U R                  [        5      (       aM  U R                  R                  nU[        U R                  5      :  a  U R                  U   $ U R                  S   $ U R                  $ )Nrd   )r   r   r   stateepochr   )r	  r  s     r_   ru   OnlineDPOTrainer.beta[  sX    djj$''JJ$$E(-DJJ(?4::e$STZZPR^S::    r   r   c                 L   U(       dd  U" U S   SS9nUR                   bL  [        US   5      nUS:X  d  UR                   US   S   :w  a"  UR                   /US   -   US'   S/US   -   US'   O
U" U S   SS9nUR                  5        VVs0 s H  u  pVS	U 3U_M     nnnU$ s  snnf )
z2Tokenize a single row from a DPO specific dataset.promptF)add_special_tokens	input_idsr   r0   attention_maskTprompt_)r   r   r  )featurer   r   batchprompt_len_input_idskeyvalues          r_   tokenize_rowOnlineDPOTrainer.tokenize_rowc  s     "gh/EJE%%1'*5+='>$'1,	0F0F%P[J\]^J_0_*3*@*@)AE+DV)VE+&/0cE:J4K.KE*+gh/DIE:?++-H-JC73%%'-H Is   	B c                 J   U R                   c  [        S5      eU R                   nU R                  nU R                  UU R                  R
                  U R                  R                  U R                  R                  S.n[        U[        R                  R                  R                  5      (       dN  U R                  5       US'   U R                  R                  US'   [        US'   U R                  R                   US'   U R"                  R%                  ['        U40 UD65      $ )Nz+Trainer: training requires a train_dataset.
batch_size
collate_fnnum_workers
pin_memorypersistent_workerssampler	drop_lastworker_init_fnprefetch_factor)rM   r   rL   _train_batch_sizerK   dataloader_num_workersdataloader_pin_memorydataloader_persistent_workersr   r   utilsdatar   _get_train_samplerdataloader_drop_lastr!   dataloader_prefetch_factorr   preparer   )r	  rM   rL   dataloader_paramss       r_   get_train_dataloader%OnlineDPOTrainer.get_train_dataloadert  s    %JKK****00'99;;))99"&))"I"I
 -)9)9)I)IJJ+/+B+B+Di(-1YY-K-Kk*2=./37993W3W/0''
=(VDU(VWWr  c                 (   Uc  U R                   c  [        S5      e[        U[        5      (       a  UOSn[	        U S5      (       aR  X R
                  ;   aC  U R                  R                  (       a(  U R                  R                  U R
                  U   5      $ [        U[        5      (       a  U R                   U   OUb  UOU R                   nU R                  nU R                  R                  UU R                  R                  U R                  R                  U R                  R                  S.n[        U[        R                  R                   R"                  5      (       dF  U R%                  U5      US'   U R                  R&                  US'   U R                  R(                  US'   [+        U40 UD6nU R                  R                  (       a(  [	        U S5      (       a  XPR
                  U'   OX%0U l        U R                  R                  U5      $ )Nz-Trainer: evaluation requires an eval_dataset.r   _eval_dataloadersr(  r.  r/  r1  )rN   r   r   r   r   r@  rK   r5  r   r;  rL   eval_batch_sizer3  r4  r   r6  r7  r   _get_eval_samplerr9  r:  r   )r	  rN   dataloader_keyrL   r<  eval_dataloaders         r_   get_eval_dataloader$OnlineDPOTrainer.get_eval_dataloader  s   D$5$5$=LMM *4L#)F)FFD-.."8"88		77##++D,B,B>,RSS ,,, l+ ' "" 	 ** ))33'99;;))99"&))"I"I
 ,(8(8(H(HII+/+A+A,+Oi(-1YY-K-Kk*37993W3W/0 %\G5FG9922t0119H&&~6*8)J&''88r  c                    SUR                   l        [        U5      (       a  UR                  R	                  5         OUR	                  5         UR
                  =(       d    0 nSU;  =(       d    US   nU(       a  UR                  5         U$ )z-Enables gradient checkpointing for the model.Fuse_reentrant)r   r   r   
base_modelgradient_checkpointing_enablegradient_checkpointing_kwargsenable_input_require_grads)r	  rG   rK   rK  rH  s        r_   r   /OnlineDPOTrainer._enable_gradient_checkpointing  s~     "' ::< //1(,(J(J(Pb%#@@rDabqDr 	 ,,.r  c           	      @   U R                   nU R                  nU R                  S:X  a  U R                  X5      u  pVO#U R                  S:X  a  U R	                  X5      u  pV[        S W 5       5      nU Vs/ s H%  nS/U[        U5      -
  -  S/[        U5      -  -   PM'     n	nU Vs/ s H  o/U[        U5      -
  -  U-   PM     nnU R                  R                  n
W Vs/ s H%  nS/[        U5      -  S/U
[        U5      -
  -  -   PM'     nnU Vs/ s H"  nUS   U:w  a  [        U5      U
:  a  X/-   OUPM$     nnU Vs/ s H  oU/U
[        U5      -
  -  -   PM     nn[        R                  " X`R                  R                  S9n[        R                  " XR                  R                  S9n	[        R                  " XPR                  R                  S9n[        R                  " XR                  R                  S9nXiX[4$ s  snf s  snf s  snf s  snf s  snf )Nr   r   c              3   8   #    U  H  n[        U5      v   M     g 7fr[   )r   )r]   idss     r_   r`   2OnlineDPOTrainer._generate_vllm.<locals>.<genexpr>  s     ?JSCJs   r   r0   rd   r   )r   r|   r   _generate_vllm_server_generate_vllm_colocatemaxr   r  r   r   r   r   r   )r	  promptsimagesr   r|   completion_ids
prompt_idsmax_prompt_lengthrP  prompt_maskr   completion_masks               r_   _generate_vllmOnlineDPOTrainer._generate_vllm  s   (((( >>X%)-)C)CG)T&NJ^^z))-)E)Eg)V&N  ?J??XbcXbQTs/#c(:;qcCHnLXbcWabWaPSn(9CH(DEKWa
b++66
UcdUccA3S>QC:C3H,IIUcd &
% %(G|$;C:@UC. [^^% 	 
 UccTbS*s3x2G HHTbc \\*5E5E5L5LM
ll;7G7G7N7NOn=M=M=T=TU,,?O?O?V?VWGG! dbd
 ds   6,H(H",H)HHc                    USLn[        U S5      (       aP  U R                  R                  U R                  :w  a,  U R	                  5         U R                  R                  U l        O<[        U S5      (       d+  U R	                  5         U R                  R                  U l        [        SUS   05      (       a,  U Vs/ s H  n[        SU0U R                  5      S   PM      nnOUn[        U5      nU(       a  [        U5      nU R                  R                  (       Ga  USSU R                  2   nU(       a  WSSU R                  2   n	OSn	U R                  R                  UU	U R                  U R                  U R                  U R                   U R"                  c  SOU R"                  U R$                  c  SOU R$                  U R&                  R(                  [        U S5      (       a  U R*                  OSU R,                  R.                  S9n
U
 VVs/ s H  o  H  o/PM     M     n
nnOS/[1        U5      S	-  -  n
[3        U
SS
9n
[5        U R                  R6                  [1        U5      -  S	-  U R                  R6                  S-   [1        U5      -  S	-  5      nX   n
U R                  USSSSS9n/ nUS    H2  nUR9                  UR;                  5       UR;                  5       /5        M4     X4$ s  snf s  snnf )z+Generate completions using vLLM server modeNr   r  r   rd   r   r   )rU  rV  r   r   r   r   r   r   r   r   r   r&   )from_processr0   ptTleftFtextreturn_tensorspaddingpadding_sider  r  )r   r  global_stepr   _move_model_to_vllmr(   r'   rO   r   r   r   r   r   generater   r   r   r   r   r  r   r   rK   r   r   r   slicer   extendtolist)r	  rU  rV  
has_imagespprompts_textall_prompts
all_imagesordered_set_of_promptsordered_set_of_imagesrW  prompt_completionscomp_idprocess_sliceprompt_inputsrX  prompt_tokenss                    r_   rR  &OnlineDPOTrainer._generate_vllm_server  s   4'
 4,--$**2H2HDLbLb2b$$&%)ZZ%;%;D"233$$&%)ZZ%;%;D" h
344ipqipde/1t?T?TUV^_ipLqL"L#L1&v.J+++ &11HD4H4H1H%I"(23Jd6J6J3J(K%(,%!--66.,&&#'#:#: ,,jj JJ.bDJJ!ZZ/cTZZ11<<DKDRiDjDjd&@&@pt"&))"="= 7 N CQs.,>`rU\i`ri.NsN"Vs;'7!';<N /~AN **S\9A=++a/3w<?!C
 (6 --$ . 
 
*;7M}335}7K7K7MNO 8))q r> ts   ;%K4K9c           	         U R                  5         [        SUS   05      (       a,  U Vs/ s H  n[        SU0U R                  5      S   PM      nnOUnUbC  / n[	        XB5       H1  u  pgUb  UR                  USU0S.5        M   UR                  U5        M3     OUnU R                  R                  XPR                  SS9n[        S5       V	V
s/ s H.  o  H%  n
[        U
R                  U	   R                  5      PM'     M0     nn	n
[        S5       VV
s/ s H!  o  H  n
[        U
R                  5      PM     M#     nnn
X4$ s  snf s  sn
n	f s  sn
nf )z-Generate completions using vLLM colocate moder  r   image)r  multi_modal_dataF)use_tqdmr&   )rh  r(   r'   rO   r   r   r   ri  r  ranger   outputs	token_idsprompt_token_ids)r	  rU  rV  rn  ro  vllm_inputsr  r{  r  r  outputrW  _rX  s                 r_   rS  (OnlineDPOTrainer._generate_vllm_colocate8  sL    	  " h
344ipqipde/1t?T?TUV^_ipLqL"L K!$\!:$&&&wX]N^'_`&&v.	 "; 'K((##K1G1GRW#XEJ1XdX\cRX$v~~a0::;\c;Xd=B1X\XT[&d6223T[3X
\))) r" e\s   %E5E(Ec                 	   U R                   R                  R                  nUSL=(       a    UR                  S:H  nU(       a  SSKnUR
                  R                  nO[        n[        U R                  5      (       Ga0  U" [        U R                  R                  5       5      5         U R                  R                  5         U R                  (       a}  [        U R                   R                  SS5      nU(       a  [        USS5      OSnUS:X  a  U R                  U R                  5        GO]US:X  a  U R!                  U R                  5        GO:U R                  R#                  5        GH  u  pxUR%                  S5      R'                  S	S
5      nU R                  R(                  U;   a  MC  SU;   a  MK  U R+                  US/S9nU R,                  S:X  aB  U R                   R.                  (       a'  U R0                  R3                  XxR4                  5        M  U R,                  S:X  d  M  U R6                  R8                  R:                  R<                  R>                  R                  n	U	RA                  XxR4                  4/5        GM     U R                  RC                  5         SSS5        GOU R                  (       a}  [        U R                   R                  SS5      nU(       a  [        USS5      OSnUS:X  a  U R                  U R                  5        GO%US:X  a  U R!                  U R                  5        GOU R                  R#                  5        H  u  pxU R+                  U5      nU" U/5         U R,                  S:X  aA  U R                   R.                  (       a&  U R0                  R3                  XxR4                  5        OkU R,                  S:X  a[  U R6                  R8                  R:                  R<                  R>                  R                  n	U	RA                  XxR4                  4/5        SSS5        M     U R,                  S:X  a6  U R                   R.                  (       a  U R0                  RE                  5         gU R,                  S:X  a  U R6                  RE                  5         gg! , (       d  f       N= f! , (       d  f       GM{  = f)zSSynchronize model weights to vLLM server with support for PEFT, DeepSpeed, and FSDPN   r   fsdp_pluginfsdp_versionr0   r&   zbase_model.model.z.base_layer original_modulezmodules_to_save.default.extra_prefixesr   r   )#r   r  deepspeed_plugin
zero_stage	deepspeedzeroGatheredParametersr   r   rG   r   
parametersmerge_adapteris_fsdp_enabledr   _sync_fsdp1_params_to_vllm_sync_fsdp2_params_to_vllmnamed_parametersremoveprefixreplaceprefix_fix_param_name_to_vllmr   r   r   update_named_paramr7  r   
llm_enginemodel_executordriver_workermodel_runnerload_weightsunmerge_adapterreset_prefix_cache)
r	  r  zero_stage_3r  gather_if_zero3r  r  nameparam	llm_models
             r_   rh  $OnlineDPOTrainer._move_model_to_vllmU  s     ++11BB't3X8H8S8SWX8X'nn??O)O$$ !djj&;&;&=!>?

((* '' #*$*:*:*@*@-QU"VKNY7;#J_`L#q(77

C%*77

C (,zz'B'B'D#001DEMMm]_`::,,4$,4$#;;DRlQm;n>>X5$:J:J:Z:Z ,,??jjQ!^^z9(,(;(;(J(J(X(X(e(e(k(kI%22T::4F3GH (E  

**,A @?H ##%d&6&6&<&<mTRJUw{NAF[\1$33DJJ?!Q&33DJJ?#'::#>#>#@KD77=D(%1>>X5$:J:J:Z:Z ,,??jjQ!^^z9(,(;(;(J(J(X(X(e(e(k(kI%22T::4F3GH 21 $A >>X%$*:*:*J*J//1^^z)HH'') *q @?\ 21s!   'E?R0*A:R05B=S0
R>
S	moduler  c                 @   Uc
  [        5       nUR                  5        H%  u  pEU(       a  U SU 3OUnU R                  XVUS9  M'     [        U[        5      (       Ga1  [        R
                  " USSS9   UR                  5        H  u  pxU(       a  U SU 3OUn	U R                  U	S/S9n	X;   a  M-  UR                  U	5        U R                  S:X  aB  U R                  R                  (       a'  U R                  R                  XR                  5        M  U R                  S	:X  d  M  U R                  R                   R"                  R$                  R&                  R(                  n
U
R+                  XR                  4/5        M     SSS5        gg! , (       d  f       g= f)
zdMemory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM.Nrj   )r  visitedF)recurse	writebackz_fsdp_wrapped_module.r  r   r   )setnamed_childrenr  r   FSDPsummon_full_paramsr  r  addr   r   r   r   r  r7  r   r  r  r  r  rG   r  )r	  r  r  r  
child_namechild_modulechild_prefix
param_namer  	full_namer  s              r_   r  +OnlineDPOTrainer._sync_fsdp1_params_to_vllm  s_    ?eG(.(=(=(?$J7=fXQzl3:L++7 ,  )@ fd##((%P)/)@)@)B%J<B6(!J< 8
I $ < <YXoWp < qI + KK	*~~1d6F6F6V6V((;;IzzR:5$(HH$7$7$F$F$T$T$a$a$g$g	!..JJ0G/HI *C QP $PPs   3B/F&AF
Fc                 4   UR                  5       R                  5        H  u  p#UR                  (       a%  UR                  [        R
                  " S5      5      nUR                  5       nU R                  S:X  a8  U R                  R                  (       a  U R                  R                  X#5        M  U R                  S:X  d  M  U R                  R                  R                  R                  R                   R"                  nUR%                  X#4/5        M     g )Nr   r   r   )
state_dictr  is_cpur  r   r   full_tensorr   r   r   r   r  r   r  r  r  r  rG   r  )r	  r  r  r  r  s        r_   r  +OnlineDPOTrainer._sync_fsdp2_params_to_vllm  s    !,,.446KD||f!56%%'E~~)d.>.>.N.N  33D@:- HH//>>LLYY__	&&7 7r  r  c                 ^    U=(       d    / nS/U-   nU H  nUR                  US5      nM     U$ )z,Clean parameter names for vLLM compatibilityz_checkpoint_wrapped_module.r  )r  )r	  r  r  prefixesr  s        r_   r  (OnlineDPOTrainer._fix_param_name_to_vllm  s8    '-212^CF<<+D r  featuresc                     U=(       d    U R                   nU" US   /US   SS9nUS   S   nUUS   S   S.nS	U;   a  US	   S   US	'   S
U;   a  US
   S   US
'   SU;   a  US   S   US'   U$ )z@
Process a vision row for VLM models (adapted from DPO trainer)
r{  r  F)rV  rc  r  r  r   r  )prompt_input_idsprompt_attention_maskpixel_valuespixel_attention_maskimage_sizes)rO   )r	  r  rO   	processorprocessed_featuresr  r  s          r_   process_vision_row#OnlineDPOTrainer.process_vision_row  s     %=(=(=	&x/@.AQYHZotu-k:1= !1%78H%I!%L
 //%7%G%JF>"!%77-?@V-WXY-ZF)*..$6}$Ea$HF=!r  c                 V   [        UR                  5       5      R                  nU R                  nU R                  nU Vs/ s H  nSU0PM	     nnUb  [        U5       H  u  pXU	   S'   M     U Vs/ s H  n[        XR                  5      S   PM     nnU R                  Gbn  UGbj  [        R                  " U R                  5      n[        U R                  S5      (       Ga.  U R                  R                  (       Ga  [        R                  " XR                  R                  5      (       a7  U Vs/ s H)  n[        R                  " SU S3U R                  U5      PM+     nnOU R                  bs  [        R                  " U R                  R                   R#                  U R                  /5      5      nU Vs/ s H!  n[        R                  " SU SU 3SU5      PM#     nnO,U Vs/ s H  n[        R                  " SU S3SU5      PM!     nn0 nUb  SU Vs/ s H  nU/PM     sn0nU R                  " S$US	S
SSS.UD6nUR%                  5        VVs0 s H  u  nnUUR'                  U5      _M     nnnSU;   aQ  [)        USS5      nUc'  [        US5      (       a  UR*                  R,                  nUb  US   R'                  U5      US'   US   R/                  SS5      nUS   R/                  SS5      n0 nU R0                  (       a}  Ubz  SU;   a  US   R/                  SSSS5      US'   SU;   a  US   R/                  SS5      US'   SU;   a  US   R/                  SS5      US'   SU;   a  US   R/                  SS5      US'   U R2                  (       Ga  U R4                  R6                  R8                  n[;        5       (       a  SU R4                  R6                  l        OSU R4                  R6                  l        [=        U S5         [?        XR@                  U RB                  RD                  S9 n[F        RH                  " 5          U RJ                  (       a  [L        RN                  " U R4                  SS9O	[Q        5          U RB                  RR                  (       a   UR'                  [F        RT                  5        O:U RB                  RV                  (       a  UR'                  [F        RX                  5        [F        RZ                  " 5          UR]                  UR_                  5       U R`                  SS9nSSS5        SSS5        SSS5        SSS5        SSS5        WRc                  5        Vs/ s H  nURd                  PM     nnU Vs/ s H  n[F        Rf                  " UUS9PM     nn[i        UU R                  SS 9n[F        Rj                  " UU/SS!9nUU R4                  R6                  l        URm                  S5      n USS2U S24   n[o        UXV5      u  nn!UUUU!4$ [=        U S"5         [?        XR@                  U RB                  RD                  S9 n[F        RH                  " 5          U RJ                  (       a  [L        RN                  " U R4                  SS9O	[Q        5          U RB                  Rp                  b%  U RB                  Rp                  UR`                  l8        URr                  " S$UUU R`                  S#.UD6nSSS5        SSS5        SSS5        SSS5        WSS2URm                  S5      S24   n[o        UXV5      u  nn!UUUU!4$ s  snf s  snf s  snf s  snf s  snf s  snf s  snnf ! , (       d  f       GNL= f! , (       d  f       GNV= f! , (       d  f       GN`= f! , (       d  f       GNj= f! , (       d  f       GNt= fs  snf s  snf ! , (       d  f       N= f! , (       d  f       N= f! , (       d  f       N= f! , (       d  f       N= f)%z$Generate completions using the modelr  Nr{  chat_template(z)+r  rV  r`  Tra  Frb  r  rg   r  r  r&   r0   r  r  r  image_grid_thwpaged_attention
sdpa_pagedztransformers.generate_batch)gather_deepspeed3_params)r  )r  progress_barr   right)padding_valuerf  dimztransformers.generate)r  r  r  r\   ):nextr  r   r   r|   r   r)   rO   r   reescaper   r  searchsubr{   r   r   r  r  r   r  rg   repeatr   r   model_wrappedr   _attn_implementationr#   r*   r/   r   rK   ds3_gather_for_generationr   no_gradr  r  r  r   r  bfloat16r  float16inference_modegenerate_batchrl  r  valuesgenerated_tokensr   r9   catsizer;   cache_implementationri  )"r	  rG   rU  rV  r   r   r|   r  inputsr  r{  r^   ro  escaped_img_tokenrc  escaped_eoi_tokenkwargsimgrw  r  r  model_dtyperX  rZ  vision_generation_kwargsprevious_attnunwrapped_modelall_outputsr  rW  rP  prompt_completion_idsprompt_lengthr[  s"                                     r_   	_generateOnlineDPOTrainer._generate  sg   e&&()00(((( 4;;78V$7; %f-%*q	'" . `ff_eZ[1!5J5JKHU_ef 'F,> "		$*:*: ;t,,o>>4CXCXCfCfCf99.0E0E0S0STT `l$_kW[!$5#6b94;K;KTR_k ! $L
 //;,.II 11;;BBDD\D\C]^-) iu(ht`dBFFa(9':"=N<O#PRTVZ[ht % (
 bn'namY]!4E3Fb/I2t(Tam'n & 9&3#& 9:F -- 
$
 
 6C5H5H5JK5JTQADDL5JK]*!%$7K"wuh'?'?#ll00&0=n0M0P0PQ\0]n- #;/66q!<
#$45<<QB $& F$6.;H;X;_;_`acdfgij;k(8%6CPQgChCoCopqstCu()?@-:G:V:]:]^_ab:c(7=0=JK[=\=c=cdegh=i()9:&&& ..55JJM(**AR""))>AM""))>!$(EF+++diiFiFi$NRNbNb''(:(:EJhshuu 99>>#&&u~~6YY^^#&&u}}5))+"1"@"@"))+*.*@*@%* #A #K , v   G$ EPDVDVDXYDX&f55DXNYJXY.3ell3v>.NY t?P?P_fgN$)IIz>.JPQ$R!=JD%%: 'OOA.M21mn3DEN.<^\.h+NO{NOKK "$(?@+++diiFiFi$NRNbNb''(:(:EJhshuu 9911=MQYYMkMkO55J )11 (#.&*&<&< /	 v   A& $Azq'9';$;<N.<^\.h+NO{NOKK[ < g$(
 (o
 !: L\ ,+ vu    GF$ ZY( vu    A@s    ]2"]$30]);(].*&]3]8]=)_8^9;^'	B^	*^>^	^'^9_8__"9)`"`	8;_83A_'	_8`	 `
^^	
^$^''
^61^99
_	_
_'
_51_88
``		
`	`
`(c           	         U R                   R                  n[        R                  " [	        U5      [	        U R
                  5      US9nU R                  US'   [        [        U R
                  U R                  5      5       GH{  u  nu  p[        U[        R                  5      (       a  [        SUS   05      (       aB  [        X5       V
Vs/ s H  u  pSX-   0PM     nn
nU Vs/ s H  n[        X5      S   PM     nnO![        X5       V
Vs/ s H	  u  pX-   PM     nn
nU	" USSS	S
S9nUR                  5        VVs0 s H  u  nnUUR!                  U5      _M     nnn[        R"                  " 5          U" S0 UD6R$                  SS2S4   USS2U4'   SSS5        GM   U" SXUS.UD6nU Vs/ s H  nUb  UO[        R&                  PM     nn[        R(                  " U[        R*                  US9USS2U4'   GM~     U R,                  b;  X`R,                  R!                  U5      R/                  S5      -  R1                  SS9nU$ UR1                  SS9nU$ s  snn
f s  snf s  snn
f s  snnf ! , (       d  f       GM   = fs  snf )z*
Calculate rewards using reward functions
r   trainer_stater  r   messagesrc  r`  Tr  Frb  N)rU  completionsrW  )rg   r   r0   r  r\   )r   r   r   zerosr   rI   r  r   r   rP   r   r   r   r(   r'   r  r  r  logitsnanr   r   r   	unsqueezenansum)r	  rU  r  completion_ids_listreward_kwargsr   rewards_per_funcr  r  rW   rn  cr  r^   textsreward_inputsr  r  output_reward_funcrewardtotal_rewardss                        r_   !_calculate_rewards_from_functions2OnlineDPOTrainer._calculate_rewards_from_functions  s}    !!(( ;;s7|S9J9J5KTZ[ *.o&9B!!4#A#AB:
5A5 +ryy11$h
%;<<@CG@YZ@YQU 3@YHZ^fg^fYZ0LVT^fEgE/27/HI/HtqQU/HEI !8tTPWlq! >K=P=P=R S=RTQADDL=R S))+-8-I=-I-P-PQRTUQU-V$QT* ,+ &1 &#M`&dq&" as%s`rV\0Bf		&Q`r"%s).6HPUP]P]fl)m A&5:
: *-0C0C0F0Fv0N0X0XYZ0[[cchicjM  -333:M9  [gI !T++ &ts*   I%I I%>I+5!I10J1
J	c                 v   [        UR                  S5      UR                  S5      -   U R                  -
  S5      nUS S 2US 24   nUS S 2US 24   n[        R                  " X$4SS9n[        R                  " X54SS9n	SU	0n
Ub8  SU;   a  US   U
S'   SU;   a  US   U
S'   SU;   a  US   U
S'   SU;   a  US   U
S'   U" U40 U
D6nUR                  S5      nUS:  a  US-
  OSnUR
                  S S 2US	24   n[        R                  " UR                  S	S9UR                  S	5      S
S9R                  S	5      nU$ )Nr0   r   r  r  r  r  r  r  rd   r&   )
rT  r  r   r   r  r   take_along_dimlog_softmaxr  squeeze)r	  rG   rX  rZ  rW  r[  vision_inputsnum_tokens_to_truncater  prompt_completion_maskmodel_kwargsr  
prompt_len	start_idxr   logprobss                   r_   _forwardOnlineDPOTrainer._forward  s   !$Z__Q%7.:M:Ma:P%PSWSbSb%bde!f  #9#: :;
!!%;%<"<= !&		:*FA N!&K+Iq!Q )*@A$./<^/L^,%67DE[7\34-.;M.J]+=01>?O1P-. ,==  __Q'
&01nJN!	q)B,/ ''(:(:r(:(BND\D\]_D`fghppqstr  r  num_items_in_batchc                    UR                  5         US   n[        U5      nSU;   nS nU(       a  US   nU H  n[        U[        5      (       d  M  U Hx  n	[        U	[        5      (       d  M  U	R                  S5      n
U	R                  S5      n[        U
[        5      (       d  MS  US:X  a  SS0SU
S./U	S'   Mg  US	:X  d  Mo  SU
S./U	S'   Mz     M     U R                  R                  (       a  U R                  XG5      u  ppOU R                  XU5      u  pp[        R                  " XR                  :H  S
S9nS nU(       Gaw  U R                  (       Gae  U R                  R                  (       GdI  0 nSU Vs/ s H  nU/PM     sn0nU R                  " S5S/[        U5      -  SS.UD6n[!        USS 5      n[!        USS 5      nUc=  [#        US5      (       a,  UR$                  R&                  nUR$                  R(                  nSU;   a(  US   R+                  UUS9R-                  SSSS5      US'   SU;   a'  US   R+                  U5      R-                  SS5      US'   SU;   a'  US   R+                  U5      R-                  SS5      US'   SU;   a'  US   R+                  U5      R-                  SS5      US'   U R/                  XXUU5      n[        R0                  " 5          U R2                  b  U R/                  U R2                  XXU5      nOAU R4                  R7                  5          U R/                  U R4                  XXU5      nS S S 5        S S S 5        UR&                  nU R                  R9                  USS9n[;        SUS   05      (       a  U Vs/ s H	  nSUS./PM     nnU R<                  b  [?        UR@                  S   5       Vs/ s H  nUU   RC                  5       PM     nn0 nU Vs/ s H  nUS;  d  M  UPM     n nU  H6  n[        UU   [        [D        45      (       a  UU   S-  UU'   M.  UU   UU'   M8     U RF                  " S5SU-  UUS .UD6n!U R                  RH                  b"  U!U) ==   U R                  RH                  -  ss'   U!RK                  U5      u  n"n#U"U#:  n$OU RL                  b  [;        SUS   05      (       ah  [N        RP                  " 5       n%U%RS                  [T        5      n&U Vs/ s H  nU&RW                  US!9PM     nnU Vs/ s H  nU&RW                  US!9PM     nnU RL                  RM                  U[        [Y        US U UUS  5      5      5      n'[        RZ                  " U' V(s/ s H  n(U(S:H  PM
     sn(US"9n$[        R\                  " UUS"9n)U)W$) U-  -   n*U)U$U-  -   n+[        R^                  " U*U+4SS9n,UU,   n-WU,   n.URa                  5       ) n/U/U,   n0U-U0) -  Rc                  S5      n1U.U0) -  Rc                  S5      n2[        RJ                  " U1U5      u  n3n4[        RJ                  " U2U5      u  n5n6U3U4-
  n7U5U6-
  n8U7U8-
  n9U R                  Rd                  S#:X  a%  [f        Rh                  " U Rj                  U9-  5      * n:OKU R                  Rd                  S$:X  a  U9SSU Rj                  -  -  -
  S-  n:O[m        S%U Rd                   35      eU:Ro                  5       n;U R<                  b  W!U*   U!U+   -
  n<U Rp                  S&   Rs                  U Rt                  Rw                  U<Ro                  5       5      Ro                  5       Ry                  5       5        U Rp                  S'   Rs                  U Rt                  Rw                  U!Ro                  5       5      Ro                  5       Ry                  5       5        U Rp                  S(   Rs                  UR{                  5       Ro                  5       Ry                  5       5        U Rp                  S)   Rs                  U Rt                  Rw                  U35      Ro                  5       Ry                  5       5        U Rp                  S*   Rs                  U Rt                  Rw                  U45      Ro                  5       Ry                  5       5        UU-
  n=U=Rc                  S5      Ro                  5       n>U Rp                  S+   Rs                  U Rt                  Rw                  U>5      Ro                  5       Ry                  5       5        U Rj                  * U=-  Rc                  S5      n?U?Ro                  5       n@U Rp                  S,   Rs                  U Rt                  Rw                  U@5      Ro                  5       Ry                  5       5        U R<                  bX  W!U?-   nAU Rp                  S-   Rs                  U Rt                  Rw                  UA5      Ro                  5       Ry                  5       5        URc                  S5      Ro                  5       * nBU Rp                  S.   Rs                  U Rt                  Rw                  UB5      Ro                  5       Ry                  5       5        U Rj                  U3U5-
  -  nCU Rt                  Rw                  UC5      nDU Rp                  S/   Rs                  UDRo                  5       Ry                  5       5        U Rj                  U4U6-
  -  nEU Rt                  Rw                  UE5      nFU Rp                  S0   Rs                  UFRo                  5       Ry                  5       5        UDUF-
  nGU Rp                  S1   Rs                  UGRo                  5       Ry                  5       5        UGS:  nHU Rp                  S2   Rs                  UHR{                  5       Ro                  5       Ry                  5       5        U Rp                  S3   Rs                  U Rj                  5        U R                  R|                  b;  U R~                  R                  U R                  R|                  -  S:X  a
  [        5         0 nU R                  R                  [        R                  [        R                  4;   a  U R                  5       US4'   U R                  R                  S:  a  U;Ro                  5       n;U R                  (       a;  [        R                  " U;U R                  5       nIUIR                  5         S S S 5        OU Rt                  R                  " U;40 UD6  U;R                  5       U R                  R                  -  $ s  snf ! , (       d  f       G
N7= f! , (       d  f       G
NA= fs  snf s  snf s  snf s  snf s  snf s  sn(f ! , (       d  f       N|= f)6Nr  r{  contentroleusertyperc  )r!  rc  systemrd   r  rV  r  r`  )rc  rd  r   rg   r  r  rf   r&   r0   r  r  r  T)skip_special_tokensr   	assistant)r  r  )r  )rU  r  r  )r  r   sigmoidipozinvalid loss type rw   rx   rt   rr   rs   rk   rm   rv   rl   rn   ro   rq   rp   ru   learning_rater\   )Otrainr   r   r   dictr   r   rK   r   r\  r  r   anyr   r   rO   r   r   r  r   rg   r  r  r  r  rH   rG   disable_adapterbatch_decoder(   rI   r~  shaperl  tupler  r   r   rJ   jinja2Environmentfrom_stringr3   renderr   r   aranger  boolr   	loss_typeF
logsigmoidru   NotImplementedErrormeanr   r   r   gather_for_metricsitemfloattorch_empty_cache_stepsr  rg  r6   optimr"   LOMOADALOMO_get_learning_raten_gpuuse_apexr>   
scale_loss	optimizerbackwarddetachgradient_accumulation_steps)Jr	  rG   r  r  rU  r)  rm  rV  r  messager  r  rX  rZ  rW  r[  contain_eos_tokenr  r  r  	processedmodel_devicer  r  ref_logprobsr   r  
completionr  r  r  r#  r   rewards
first_halfsecond_halfmaskenvironmenttemplateranks_of_first_completionrankbatch_rangechosen_indicesrejected_indices
cr_indicescr_logprobscr_ref_logprobspadding_maskcr_padding_maskcr_logprobs_sumcr_ref_logprobs_sumchosen_logprobs_sumrejected_logprobs_sumchosen_ref_logprobs_sumrejected_ref_logprobs_sumpi_logratiosref_logratiosr   losseslossscores_marginklmean_klnon_score_rewardmean_non_score_rewardrlhf_rewardmean_entropychosen_rewardsgathered_chosen_rewardsrejected_rewardsgathered_rejected_rewardsmarginaccuracyscaled_losssJ                                                                             r_   training_stepOnlineDPOTrainer.training_step  sJ    	"\
 &
G_F!fd++#))'488$")++i"8&{{62%gs33#v~7=w6GRXbiIj5k	 2!%!1?Ew6W5X	 2 $* " 99GKGZGZ[bGkDJ^_GK~~V[flGmDJ^!IIn8I8I&IrR $...tyy7I7I7I M& 9&3#& 9:F-- TCK'# I #5(D9L!%$7K#x(@(@$||22#ll00 *n-00[0QXXYZ\]_`bcd n- &28ABX8Y8\8\]i8j8q8qrsuv8w45	)/8/G/J/J</X/_/_`acd/em,9,2;<L2M2P2PQ]2^2e2efgij2k./==KQ`bop]]_~~)#}}NNJ^^k  ZZ//1#'==

J^^k$L 2  ++88]a8bh
344\gh\gj[ZHI\gKh (GL^MaMabcMdGe"fGe!>!#4#;#;#=Ge"f M#)C6CS
-BC6DCfSkD%=99)/qM#&)/M#&  << GReivG
 yy,,8**+tyy/L/LL+ '.mmJ&?#J,DZZ#
 !(GAJ!788$002&223GHJQR'8??F?;'RVabVa
x
CVab(,

(8(8c+kz":K
<TUV)% <<7P Q7Pt7P QZ`aDll:f=$
(:;&$*;< YY0@AqI
z*&z2 (,,..&z2&/)99>>qA./1AAFFqI 6;[[R\5]22=B[[I\^h=i:!:*-BB/2KK-99)+ll499v#566FYY  E)qA		M22q8F%(:4>>:J&KLL{{} (#N3g>N6OOMJJ0188  33M4F4F4HINNPUUW JJ)*11$2B2B2U2UV]VbVbVd2e2j2j2l2q2q2st

*+223D3J3J3L3Q3Q3S3X3X3Z[

>"))$*:*:*M*MNa*b*g*g*i*n*n*pq

#$++D,<,<,O,OPe,f,k,k,m,r,r,tu$&&).."

>"))$*:*:*M*Mg*V*[*[*]*b*b*de!YYJO003 0 5 5 7

/077//0EFKKMRRT	
 (!$44KJJ./66t7G7G7Z7Z[f7g7l7l7n7s7s7uv Q,,..

&'..t/?/?/R/RS_/`/e/e/g/l/l/no&9<S&ST"&"2"2"E"En"U

#$++,C,H,H,J,O,O,QR99(=@Y(YZ$($4$4$G$GHX$Y!

%&--.G.L.L.N.S.S.UV(+DD

$%,,V[[]-?-?-ABA:

'(//0@0E0E0G0L0L0NO

6!!$)), II--9

&&)J)JJaOM 99??~22N4J4JKK&*&=&=&?F?#99??Q99;D==dnn5$$& 65 %%d5f5{{}tyyDDDDo !:@ 21 _ i
 #g D8 Sb !Rt 65sa   7u3"Av
)u8v
vv!5
v&v&v+"v0v5v:8
v	v


v:
wc	                     U R                   R                  (       Ga  U R                  R                  U R                  :  Gat  0 n	U R                  U5      R                  5       R                  5       n
X-  n[        XR                  R                  U R                  -
  -  S5      U	S'   UbB  [        U[        R                  5      (       a  UR                  5       R                  5       OUU	S'   Ub  XS'   OU R                  5       U	S'   U R                  R                  5        H  u  p[!        U5      [#        U5      -  X'   M      U R                   Vs0 s H  o/ _M     snU l        U =R$                  U
-  sl        U R                  R                  U l        U R'                  5         U R)                  X5        S nU R                   R*                  (       aJ  U R-                  XF5      nU R/                  XS9nU R0                  R2                  S:X  a  XR                   l        U R                   R4                  (       aR  U R7                  X45        U R8                  R;                  U R0                  U R                  U R                   5      U l         g g s  snf )N   rh  	grad_normr'  )metricstrialbest)control
should_logr  rg  _globalstep_last_logged_nested_gatherr9  r;  roundr   r   TensorrG  rA  r   r  r   r   _total_loss_scalar
store_floslogshould_evaluate	_evaluate_determine_best_metricrK   save_strategyshould_save_save_checkpointcallback_handleron_save)r	  tr_lossr{  rG   r}  r  ignore_keys_for_eval
start_timer'  logstr_loss_scalarr#  valr|  is_new_best_metrics                  r_   _maybe_log_save_evaluate)OnlineDPOTrainer._maybe_log_save_evaluate  s    <<"""tzz'='=@\@\'\%'D "009>>@EEGN G ::3I3IDLhLh3h!iklmDL$AKIW\WcWcAdAdI$4$4$6$;$;$=js[!((5_%(,(?(?(A_% !JJ,,.Hs3x/	 /-1ZZ8Zcr'Z8DJ##~5#+/::+A+AD(OOHHT&<<''nnUAG!%!<!<W!<!Zyy&&&0+=(<<##!!%/0088DJJPTP\P\]DL $ 9s   Jc                   > U R                   R                  c*  [        U R                   R                  5      R                  nO(U R                   R                  R                  S5      S   nU R                  US9  [        TU ]!  X5        g )Nrc   rd   )
model_name)	rK   hub_model_idr   
output_dirr  r   create_model_cardr   r  )r	  rG   r}  r  r  s       r_   r  !OnlineDPOTrainer._save_checkpoint  sj    99!!)dii22388J//55c:2>J*5 .r  r  dataset_nametagsc                    U R                  5       (       d  g[        U R                  R                  S5      (       ac  [        R
                  R                  U R                  R                  R                  5      (       d!  U R                  R                  R                  nOSnUc  [        5       nO$[        U[        5      (       a  U1nO[        U5      n[        U R                  R                  S5      (       a  UR                  S5        S[        R                  ;   a  UR                  S5        UR                  U R                  5        [        R                   " S5      n[#        UUU R$                  UU['        5       (       a+  [(        R*                  b  [(        R*                  R,                  OS[/        5       SUS	S
S9nUR1                  [        R
                  R3                  U R4                  R6                  S5      5        g)a  
Creates a draft of a model card using the information available to the `Trainer`.

Args:
    model_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the model.
    dataset_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the dataset used for training.
    tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
        Tags to be associated with the model card.
Nr   unsloth_versionunslothJOB_IDhf_jobsa          @article{guo2024direct,
            title        = {{Direct Language Model Alignment from Online AI Feedback}},
            author       = {Shangmin Guo and Biao Zhang and Tianlin Liu and Tianqi Liu and Misha Khalman and Felipe Llinares and Alexandre Ram{'{e}} and Thomas Mesnard and Yao Zhao and Bilal Piot and Johan Ferret and Mathieu Blondel},
            year         = 2024,
            eprint       = {arXiv:2402.04792}
        }z
Online DPOz7Direct Language Model Alignment from Online AI Feedbackz
2402.04792)rI  r  r  r  r  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zeror   rG   r   r   pathisdirr   r  r   r   r  r   r   r   textwrapdedentr7   r  r   wandbrunurlr8   savejoinrK   r  )r	  r  r  r  rI  citation
model_cards          r_   r  "OnlineDPOTrainer.create_model_card  sn   " ))++4::$$o66rww}}TZZM^M^MlMl?m?m**88JJ <5Dc""6Dt9D4::$$&788HHYrzz!HHYDOO$ ?? $  )!!**%'9';';		@Ueiimm[_.0%%Q!

 	TYY%9%9;GHr  )&r   r@  r  r   r  r   r  r   r   ry   r   r   rJ   r   r   r   r   r   r|   rH   r   r   rI   rP   r   r   r   r   r   r   r   r{   rz   r   r   r   r   r   )NNNNNNNNNNNN)NNNNNr[   )r  N)NNN)?r   
__module____qualname____firstlineno____doc__r   r	   r   r   r   r   r   
RewardFuncr   r1   r2   r   r   r   r)  r   r   r   r    r   r.  r   r>  	Optimizerlr_schedulerLambdaLRr  r   propertyru   staticmethodr4  r   r%  r   r   r=  r   rE  r   r\  rR  rS  rh  r  r  r  intr  r  r  r  rw  r  r  r  __static_attributes____classcell__)r  s   @r_   rD   rD   m   sq   CJ &J
 >BFJ-1*.04CGnrUYmq.2FJ59VbhlDHEI'd_bii45d "))T9:d uZj1A%ABC	d
 )*d 'd  -d  g&> ?@d uWotCwXgOgIhDh?i%ijkd #5)@.)P#QRd $,E2I4PgKh2h,i#jd l+d "(N+;T+A"BCd D12d %++//1I1I1R1RRSd  (0%,,9UW\WcWc9c0d'e!d$ u_bii%?@A%d& "**A!B'd( 
)d dL   $ CZ _cdgildl_m    7''(Xj X )X. 7&&'-9sG|9L0M -9Yc -9 (-9^O ? _n ,H<F*P*:I*VJ JC J68 8HT#Y<O  PTS%ell(:";;<	c49n	6TLl,\"J rv_EYY_E(,S%c8I2J-J(K_Eaijman_E	_EF hl(^V/ %)&*,0	?ISM?I sm?I CcD()	?I ?Ir  rD   )sr   r  r  r   
contextlibr   	functoolsr   pathlibr   typingr   r   r   r	   r/  r   torch.nnr   torch.nn.functional
functionalr6  torch.utils.data
accelerater
   accelerate.utilsr   r   r   datasetsr   	packagingr   torch.distributed.fsdpr   r  r   r   transformersr   r   r   r   r   r   r   r   r   r   r   r   &transformers.models.auto.modeling_autor   transformers.trainer_utilsr    r!   transformers.training_argsr"   transformers.utilsr#   r$   r%   
data_utilsr'   r(   r)   extras.profilingr*   extras.vllm_clientr+   import_utilsr,   modelsr-   r.   models.utilsr/   judgesr1   online_dpo_configr2   r6  r3   r4   r5   r6   r7   r8   r9   r:   r;   peftr<   r=   apexr>   smdistributed.modelparallelr?   SMP_VERSIONparseIS_SAGEMAKER_MP_POST_1_10vllmr@   rA   vllm.sampling_paramsrB   r  
get_loggerr   r   r   r   r<  r  rD   r\   r  r_   <module>r     sO   
 	   "   1 1        P P   C 8    ^ B 5  [ Z 0 + , ? 6 % .
 
 
 * F 'k :gmmF>S S !& (9			H	% 34,U2K)LLM
zIw zIr  