
    Y:i             
       \   S r SSKJr  SSKrSSKJr  SSKJr  SSKJrJ	r	J
r
JrJrJrJrJr  SSKJrJrJrJrJrJrJrJrJrJrJrJrJrJrJrJrJrJrJ r J
r
J!r!J"r"J#r#J$r$J%r%J&r&J'r'J(r(J)r)JrJ*r*J+r+J,r,J-r-J.r.J/r/J0r0J1r1J2r2J3r3J4r4J5r5J6r6J7r7J8r8J9r9J:r:J;r;JrJ<r<J=r=J>r>J?r?J@r@JArAJBrBJCrCJDrDJrJErEJFrFJGrGJHrHJIrIJrJ+r+J3r3JBrBJrJrJ5r5JrJ<r<J=r=JBrBJGrGJrJ
r
J#r#J(r(J9r9J=r=JBrBJrJrJrJrJ=r=JBrBJrJrJrJBrBJr  SSK=r=SSK7  SSKJJKrKJLrL  SS	KMJNrN  SSKrSSKOrPSS
KQJ<r<  SSKJr  SSKRJSrSJTrU  SSKVJWrW  SSKXrXSSKYJZrZ  S r[ SSSSSS.r\\R                  " SS\\S9S 5       r^S\R                  S\_S\_S\R                  4S jr`S\R                  S\R                  S\_S\_S\R                  4
S jraS\R                  S\_S\R                  4S jrbS  rc\K " S! S"\5      5       rd  " S# S$\(5      re " S% S&\e5      rf \g" \9S'5      (       a3  SSK:r: " S( S)\:R                  5      ri \9R                  " \i" S*5      5        gg)+z;
2025.10.10
2025.10.9
4.56.2
0.23.0
__UNSLOTH_VERSIONING__
    )TensorN)
functional)AnyListOptionalTupleUnionDictSetCallable)^r   AutoModelForCausalLM"AutoModelForSequenceClassificationAutoTokenizerBasePairwiseJudger   DPODataCollatorWithPaddingDataCollator
DataLoaderDatasetEvalPredictionFFSDPGenerationConfigIterableDataset*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMESOnlineDPOConfigOnlineDPOTrainerOptimizerNamesr   Path
PeftConfigPreTrainedModelPreTrainedTokenizerBaseProcessorMixin
RewardFuncSIMPLE_CHAT_TEMPLATETrainerTrainerCallbackr	   
VLLMClientapply_chat_templatebroadcast_object_listcreate_reference_modeldisable_dropout_in_modelempty_cachegather_objectgenerate_model_cardget_comet_experiment_urlis_conversationalis_flash_attn_2_availableis_peft_modelis_vllm_availableis_wandb_availablejinja2loggerloggingmaybe_apply_chat_templatennnullcontextospadprepare_deepspeedprepare_peft_modelprofiling_contextreseed_workertextwraptorchtruncate_rightunwrap_model_for_generationversionwarningswrapsr   r(   r0   r@   r   r   r2   r9   r:   r;   r@   rF   r   r   r    r%   r6   r;   r@   rC   r   r   r9   r;   r@   r   r   r9   r@   rC   )*)	dataclassfield)Version)r:   )DataCollatorForSeq2SeqDataCollatorForLanguageModeling)ParallelMode)
MethodTypec                 F   ^  [         R                  " T 5      U 4S j5       nU$ )Nc                 8  > [        U S5      (       a5  [        U R                  S5      (       a  U R                  R                  5         T" U /UQ70 UD6n[        U S5      (       a5  [        U R                  S5      (       a  U R                  R                  5         U$ )Nmodelfor_trainingfor_inference)hasattrrS   rT   rU   )selfargskwargsoutputfs       N/home/james-whalen/llama.cpp/unsloth_compiled_cache/UnslothOnlineDPOTrainer.pywrapper*prepare_for_training_mode.<locals>.wrapper0   sx     4!!gdjj.&I&IJJ##%4)$)&)4!!gdjj/&J&JJJ$$&    )	functoolsrH   )r[   r]   s   ` r\   prepare_for_training_modera   /   s%    __Q  Nr_   TF)epilogue_fusionmax_autotuneshape_paddingztrace.enabledztriton.cudagraphs)dynamic	fullgraphoptionsc                 d   [         R                  " U R                  SU R                  S   5      SSS9n[         R                  " UR                  S5      SSS9n/ n[	        X#5       H  u  pVUR                  [         R                  5      n[         R                  " USUR                  S5      S9R                  S5      n[         R                  " USS9nXx-
  n	UR                  U	5        M      [         R                  " U5      nUR                  U R                  S   U R                  S   45      nU$ )N   r   )chunksdim)rl   indexrl      )rC   chunkreshapeshapeziptofloat32gather	unsqueezesqueeze	logsumexpappendconcat)
logitsrm   chunked_logitschunked_indexall_per_token_logpschunk_logitschunk_indexselected_logitslogsumexp_valuesper_token_logpss
             r\   chunked_selective_log_softmaxr   E   s    [[FLL4D!EPQYZ[N[[r!2QaHM%(%G!#u}}5,,|2{G\G\]_G`aiijlm ??<rB)<""?3 &H 	,,':;-55v||AUV6XYr_   	input_idslogits_to_keeppad_token_idreturnc                 ~    XR                   S   :  a  [        S5      eU SS2SU* 24   nX2:H  nUR                  SS9nU$ )zr
Given prompt tensor, it returns all the left padded tokens in that sequence. so [pad, pad, pad, cat] = 3 tokens 
ro   z8logits_to_keep must be smaller than the sequence length.Nrn   )rr   
ValueErrorsum)r   r   r   prompt_sectionpadding_maskpad_token_countss         r\   calculate_pad_tokens_in_promptr   W   sX     ++STTq"2N?"223N"2L#''A'.r_   completion_input_idsleft_pad_tokens_per_promptmax_left_padc                     U R                   u  pEU R                  nX!-
  n[        R                  " XVS9R	                  S5      nXR	                  S5      :  n	X:g  n
X-  nU$ )a)  
Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad]

Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens
and pad are pad tokens, this function would make a completion mask that would 0 out the pad
and p tokens. so in this example [0,0,0,1,1,1,0,0,0]
devicer   ro   )rr   r   rC   arangerw   )r   r   r   r   
batch_sizecompletion_lenr   num_tokens_to_maskindices
shift_masknon_padding_mask
final_masks               r\    create_completion_attention_maskr   j   si     "6!;!;J!((F%Bll>9CCAFG88;;J,<.Jr_   tensorpad_idc                 l    X:g  n[         R                  " USSSS9n[         R                  " U SU5      nU$ )zD
Moves all padding tokens in each sequence of a batch to the right.
ro   T)rl   
descendingstable)rC   argsortrv   )r   r   masksorted_indicespacked_tensors        r\   left_pack_paddingr      s8     D]]4Q4MNLLN;Mr_   c                  .    SSK Jn  U" S0 U D6nXl        U$ )Nr   )SamplingParams )vllmr   _set_kwargs)rY   r   sampling_paramss      r\   vLLMSamplingParamsr      s    #$.v.O"(r_   c                     ^  \ rS rSr% Sr\" SSS0S9r\\   \	S'   \" SSS	0S9r
\\   \	S
'   \" SSS0S9r\\   \	S'   SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS SSS!SSSSSSSSS"S"SSSSS#S$SSSSSSSS%SSSSSSSSSSSSSS%SSSSSSS&S'SSSS(SSSSSSSSSSSS)SSSSSSSSS$SSSS%SSSS*S+SSSSSSSSSSSSSSSS,S-SSSSS0 SSSS.SSS/SS0S1SS2S3S4S SSSSSSSS4U 4S5 jjrS6rU =r$ )7UnslothOnlineDPOConfig   u$"  
    
Configuration class for the [`OnlineDPOTrainer`].

This class includes only the parameters that are specific to Online DPO training. For a full list of training
arguments, please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this
class may differ from those in [`~transformers.TrainingArguments`].

Using [`~transformers.HfArgumentParser`] we can turn this class into
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
command line.

Parameters:
    reward_model_path (`str` or `None`, *optional*, defaults to `None`):
        Path to the reward model. Either `judge` or `reward_model_path` must be set, but not both.
    judge (`str` or `None`, *optional*, defaults to `None`):
        Name of the judge to use. Either `judge` or `reward_model_path` must be set, but not both.
    max_new_tokens (`int`, *optional*, defaults to `64`):
        Maximum number of tokens to generate per completion.
    max_length (`int`, *optional*, defaults to `256`):
        Maximum total length of the sequence (prompt + completion) used to compute log probabilities. If the
        sequence exceeds this limit, the leftmost tokens will be truncated to preserve as much of the completion as
        possible.
    temperature (`float`, *optional*, defaults to `0.9`):
        Temperature for sampling. The higher the temperature, the more random the completions.
    missing_eos_penalty (`float` or `None`, *optional*, defaults to `None`):
        Penalty applied to the score when the model fails to generate an EOS token. This is useful to encourage to
        generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be a positive
        value. This parameter only works when using `reward_funcs` and not when using `judge`.
    beta (`float` or `list[float]`, *optional*, defaults to `0.1`):
        Parameter controlling the deviation from the reference model. Higher β means less deviation from the
        reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in
        the [paper](https://huggingface.co/papers/2310.12036). If a list of floats is provided then the β is
        selected for each new epoch and the last β is used for the rest of the epochs.
    loss_type (`str`, *optional*, defaults to `"sigmoid"`):
        Type of loss to use. Possible values are:

            - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper.
            - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper.

    dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
        Number of processes to use for processing the dataset.
    disable_dropout (`bool`, *optional*, defaults to `True`):
        Whether to disable dropout in the model and reference model.

    > Parameters that control generation

    top_p (`float`, *optional*, defaults to `1.0`):
        Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to
        `1.0` to consider all tokens.
    top_k (`int` or `None`, *optional*, defaults to `None`):
        Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is
        disabled and all tokens are considered.
    min_p (`float` or `None`, *optional*, defaults to `None`):
        Minimum token probability, which will be scaled by the probability of the most likely token. It must be a
        value between `0.0` and `1.0`. Typical values are in the `0.01-0.2` range.
    repetition_penalty (`float`, *optional*, defaults to `1.0`):
        Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far.
        Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat
        tokens.
    use_transformers_paged (`bool`, *optional*, defaults to `False`):
        Whether to use the `transformers` paged implementation for generation. If set to `True`, the `transformers`
        paged implementation will be used for generation instead of the default padded implementation. This
        parameter is only effective when `use_vllm` is set to `False`.
    cache_implementation (`str` or `None`, *optional*, defaults to `None`):
        Implementation of the cache method for faster generation when `use_vllm` is set to `False`.
    generation_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
        Additional keyword arguments to pass to `GenerationConfig` (if using transformers) or `SamplingParams` (if
        using vLLM) when sampling completions. This can be used to further customize the generation behavior, such
        as setting `supress_tokens`, `num_beams`, etc. If it contains keys that conflict with the other generation
        parameters (like `min_p`, `top_p`, etc.), they will override them.

    > Parameters that control generation acceleration powered by vLLM

    use_vllm (`bool`, *optional*, defaults to `False`):
        Whether to use vLLM for generating completions. If set to `True`, the trainer will use vLLM for generation
        instead of the default model.generate(). Requires `vllm` to be installed.
    vllm_model_impl (`str`, *optional*, defaults to `"vllm"`):
        Model implementation to use for vLLM. Must be one of `"transformers"` or `"vllm"`. `"transformers"`: Use
        the `transformers` backend for model implementation. `"vllm"`: Use the `vllm` library for model
        implementation.
    vllm_mode (`str`, *optional*, defaults to `"server"`):
        Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `"server"` or
        `"colocate"`.

        - `"server"`: The trainer will send generation requests to a separate vLLM server. Make sure a TRL vLLM
          server is running (start with `trl vllm-serve`).
        - `"colocate"`: vLLM will run in the same process and share the training GPUs. This avoids the need for a
          separate server but may cause resource contention with training.
    vllm_guided_decoding_regex (`str` or `None`, *optional*, defaults to `None`):
        Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled.

    > Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`)

    vllm_server_base_url (`str` or `None`, *optional*, defaults to `None`):
        Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `vllm_server_host` and
        `vllm_server_port` are ignored.
    vllm_server_host (`str`, *optional*, defaults to `"0.0.0.0"`):
        Host of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
    vllm_server_port (`int`, *optional*, defaults to `8000`):
        Port of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided.
    vllm_server_timeout (`float`, *optional*, defaults to `240.0`):
        Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up after the
        timeout, a `ConnectionError` is raised.

    > Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`)

    vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.55`):
        Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set to
        `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
        launching the vLLM server via the `--vllm_gpu_memory_utilization` flag.
    vllm_tensor_parallel_size (`int`, *optional*, defaults to `1`):
        Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set to
        `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when
        launching the vLLM server via the `--vllm_tensor_parallel_size` flag.

    > Other parameters

    ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
        This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
        improving generation speed. However, disabling this option allows training models that exceed the VRAM
        capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible
        with vLLM generation.
    model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
        Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
        string.

    NhelpzvLLM SamplingParams)defaultmetadatavllm_sampling_paramsri   z8Chunk size to reduce memory usage. -1 is most efficient.unsloth_num_chunksz'Maximum sequence length to truncate to.max_seq_lengthFnorj      r      g-C6
?g{Gz?g?g+?g:0yE>g      ?g      @linear皙?passivewarningTstepsro     iO  O1auto         
adamw_8bitlength
every_savelasti  @      sigmoidr   g?colocatez0.0.0.0i@  g      n@c                   > US:  a  [        SU S35        US:  a  [        SU S35        Uc  U#S:X  a
  U$S:X  a  SnS	n#Wc$  S
SKJn  [        [	        U" 5       S-   S5      S5      nWS
::  a  [        S5      eWS:  a  [        S5      e[        TU ]  " S0 SU_SU_SU_SU_SU_SU_SU_SU_SU	_SU
_SU_SU_SU_SU_S U_S!U_S"U_S#U_S$U_S%U_S&U_S'U_S(U_S)U_S*U_S+U_S,U_S-U_S.U_S/U_S0U_S1U _S2U!_S3U"_S4U#_S5U$_S6U%_S7U&_S8U'_S9U(_S:U)_S;U*_S<U+_S=U,_S>U-_S?U._S@U/_SAU0_SBU1_SCU2_SDU3_SEU4_SFU5_SGU6_SHU7_SIU8_SJU9_SKU:_SLU;_SMU<_SNU=_SOU>_SPU?_SQW@_SRWA_SSWB_STWC_SUWD_SVWE_SWWF_SXWG_SYWH_SZWI_S[WJ_S\WK_S]WL_S^WM_S_WN_S`WO_SaWP_SbWQ_ScWR_SdWS_SeWT_SfWU_SgWV_ShWW_SiWX_SjWY_SkWZ_SlW[_SmW\_SnW]_SoW^_SpW__SqW`_SrWa_SsWb_StWc_SuWd_SvWe_SwWf_SxWg_SyWh_SzWi_S{Wj_S|Wk_S}Wl_S~Wm_SWn_SWo_SWp_SWq_SWr_SWs_SWt_SWu_SWv_SWw_SWx_SWy_SWz_SW{_SW|_SW}_SW~_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_WD6  WU l        WU l	        WU l
        g )NgHz>z Unsloth: Your learning rate of `zi` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!ro   za` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!r   r   unsloth_training_checkpointsr   r   )	cpu_countrj   r   r   zUUnsloth: Please set a positive non-zero temperature since your results will be wrong.
   zgUnsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.
output_diroverwrite_output_dirdo_traindo_eval
do_predicteval_strategyprediction_loss_onlyper_device_train_batch_sizeper_device_eval_batch_sizeper_gpu_train_batch_sizeper_gpu_eval_batch_sizegradient_accumulation_stepseval_accumulation_steps
eval_delaytorch_empty_cache_stepslearning_rateweight_decay
adam_beta1
adam_beta2adam_epsilonmax_grad_normnum_train_epochs	max_stepslr_scheduler_typewarmup_ratiowarmup_steps	log_levellog_level_replicalog_on_each_nodelogging_dirlogging_strategylogging_first_steplogging_stepslogging_nan_inf_filtersave_strategy
save_stepssave_total_limitsave_safetensorssave_on_each_nodesave_only_model'restore_callback_states_from_checkpointno_cudause_cpuuse_mps_deviceseed	data_seedjit_mode_evaluse_ipexbf16fp16fp16_opt_levelhalf_precision_backendbf16_full_evalfp16_full_evaltf32
local_rankddp_backendtpu_num_corestpu_metrics_debugdebugdataloader_drop_last
eval_stepsdataloader_num_workersdataloader_prefetch_factor
past_indexrun_namedisable_tqdmremove_unused_columnslabel_namesload_best_model_at_endmetric_for_best_modelgreater_is_betterignore_data_skipfsdpfsdp_min_num_paramsfsdp_config"fsdp_transformer_layer_cls_to_wrapaccelerator_configparallelism_config	deepspeedlabel_smoothing_factoroptim
optim_args	adafactorgroup_by_lengthlength_column_name	report_toddp_find_unused_parametersddp_bucket_cap_mbddp_broadcast_buffersdataloader_pin_memorydataloader_persistent_workersskip_memory_metricsuse_legacy_prediction_looppush_to_hubresume_from_checkpointhub_model_idhub_strategy	hub_tokenhub_private_repohub_always_pushhub_revisiongradient_checkpointinggradient_checkpointing_kwargsinclude_inputs_for_metricseval_do_concat_batchesfp16_backendpush_to_hub_model_idpush_to_hub_organizationpush_to_hub_tokenmp_parametersauto_find_batch_sizefull_determinismtorchdynamo	ray_scopeddp_timeouttorch_compiletorch_compile_backendtorch_compile_modeinclude_tokens_per_secondinclude_num_input_tokens_seenneftune_noise_alphaoptim_target_modulesbatch_eval_metricseval_on_startuse_liger_kernelliger_kernel_configeval_use_gather_objectaverage_tokens_across_devicesreward_model_pathjudgemax_new_tokens
max_lengthtemperaturetop_ptop_kmin_prepetition_penaltygeneration_kwargsuse_transformers_pagedcache_implementationmissing_eos_penalty	loss_typedisable_dropoutuse_vllmvllm_model_implvllm_guided_decoding_regexvllm_gpu_memory_utilization	vllm_modevllm_server_base_urlvllm_server_hostvllm_server_portvllm_server_timeoutvllm_tensor_parallel_sizeds3_gather_for_generationmodel_init_kwargsreward_weightsdataset_num_procgpu_memory_utilizationr   )printmultiprocessingr   minmax	MathErrorsuper__init__r   r   r   )rW   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r	  r
  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r!  r"  r#  r$  r%  r&  r'  r(  r)  r*  r+  r,  r-  r.  r/  r0  r1  r2  r3  r4  r5  r6  r7  r8  r9  r:  r;  r<  r=  r>  r?  r@  rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  rK  rL  rM  rN  rO  rP  rQ  rR  rS  rT  rU  rV  rW  rX  rY  rZ  r[  r\  r]  r^  r_  r`  ra  rb  rc  rd  re  rf  rg  rh  ri  rj  rk  rl  rm  rn  ro  rp  rq  r   r   r   rY   r   	__class__s                                                                                                                                                                        r\   rx  UnslothOnlineDPOConfig.__init__!  sr	   L 4)I-  YB  (C  "D1e&F}o  Vw  %x  y-7":zS?P7J M#1"3y{1}a#8"=!sttB  F  G  G 	 _	F#_	F#7_	F  _	F 	_	F
 $_	F *_	F $8_	F +F_	F *D_	F (@_	F '>_	F +F_	F '>_	F $_	F '>_	F  *!_	F" (#_	F$ $%_	F& $'_	F( ()_	F* *+_	F,  0-_	F. "/_	F0 !21_	F2 (3_	F4 (5_	F6 "7_	F8 !29_	F:  0;_	F< &=_	F>  0?_	F@ "4A_	FB *C_	FD &<E_	FF *G_	FH $I_	FJ  0K_	FL  0M_	FN !2O_	FP .Q_	FR 7^S_	FT U_	FV W_	FX ,Y_	FZ [_	F\ "]_	F^ *__	F`  a_	Fb c_	Fd e_	Ff ,g_	Fh &<i_	Fj ,k_	Fl ,m_	Fn o_	Fp $q_	Fr &s_	Ft *u_	Fv !2w_	Fx y_	Fz $8{_	F| $}_	F~ &<_	F@ *DA_	FB $C_	FD  E_	FF (G_	FH %:I_	FJ &K_	FL &<M_	FN %:O_	FP !2Q_	FR  0S_	FT U_	FV #6W_	FX &Y_	FZ 2T[_	F\ "4]_	F^ "4__	F` "a_	Fb &<c_	Fd e_	Ff $g_	Fh "i_	Fj .k_	Fl "4m_	Fn "o_	Fp *Dq_	Fr !2s_	Ft %:u_	Fv %:w_	Fx -Jy_	Fz #6{_	F| *D}_	F~ &_	F@ &<A_	FB (C_	FD (E_	FF "G_	FH  0I_	FJ .K_	FL (M_	FN &<O_	FP -JQ_	FR *DS_	FT &<U_	FV (W_	FX $8Y_	FZ (@[_	F\ !2]_	F^ *__	F` $8a_	Fb  0c_	Fd &e_	Ff "g_	Fh &i_	Fj *k_	Fl %:m_	Fn "4o_	Fp )Bq_	Fr -Js_	Ft #6u_	Fv $8w_	Fx "4y_	Fz *{_	F|  0}_	F~ #6_	F@ &<A_	FB -JC_	FD !2E_	FF G_	FH ,I_	FJ $K_	FL &M_	FN O_	FP Q_	FR S_	FT "4U_	FV !2W_	FX &<Y_	FZ $8[_	F\ #6]_	F^ "__	F` .a_	Fb  c_	Fd .e_	Ff *Dg_	Fh +Fi_	Fj "k_	Fl $8m_	Fn  0o_	Fp  0q_	Fr #6s_	Ft )Bu_	Fv )Bw_	Fx !2y_	Fz ,{_	F|  0}_	F~ &<f_	F@ %9!"4,r_   )r   r   r   )__name__
__module____qualname____firstlineno____doc__rK   r   r   r   __annotations__r   intr   rx  __static_attributes____classcell__ry  s   @r\   r   r      sj   @ +012+(3-  */VW*#  &+EF&NXc]  #$&'%&#'"&&'"#"%$%""!&!27!'!$!"%) $!& $  -1!!!$%%)  $ $(-"%*!%#!%(,%*!%##' $  $!$)(-"#" "!&(,  !&#" %)&*#$#$%$( !%#GV- V-r_   r   c            %         ^  \ rS rSrSrSS/r                S:S\\\R                  \
4   S\\\R                  S4   S\\\\\   4      S	\\   S
\\   S\\   S\\\\4      S\\\\\\
\\\4   4   4      S\\\\4      S\\\\\   4      S\S   S\\\/\4      S\\\      S\\R6                  R8                  \R6                  R:                  R<                  4   S\\\R>                  \R>                  /\R>                  4      S\\\\R                  4      S\\   SS4$U 4S jjjr \!S 5       r"\#S\$S\S\\
\%4   4S j5       r&\'" \(RR                  5      S\*4S j5       r)\'" \(RV                  5      S;S\\\
\4      S\*4S jj5       r+S\S
\S\4S  jr,S;S! jr-S;S" jr.S;S# jr/S$ r0S<S%\R                  S&\
4S' jjr1S%\R                  4S( jr2S;S)\\\
      4S* jjr3 S;S+\\
\\\R>                  4   4   S\\
\\4   4   4S, jjr5S;S- jr6S. r7S;S/ jr8 S;S\R                  S0\\
\\R>                  \%4   4   S1\\4   S\R>                  4S2 jjr9 S;S3 jr:U 4S4 jr;   S=S5\\
   S6\\
   S7\\
\\
   S4   4S8 jjr<S9r=U =r>$ )>_UnslothOnlineDPOTraineriz  af  
Initialize OnlineDPOTrainer.

Args:
    model (`Union[str, nn.Module, PreTrainedModel]`):
        Model to be trained. Can be either:

        - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
          path to a *directory* containing model weights saved using
          [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
          using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
          `args.model_init_kwargs`.
        - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
    ref_model (`transformers.PreTrainedModel` or `torch.nn.Module` or `None`):
        The reference model to use for training. If None is specified, the reference model will be created from the
        model.
    judge (`BasePairwiseJudge`):
        The judge to use for pairwise comparison of model completions.
    reward_funcs (`Union[RewardFunc, list[RewardFunc]]`, *optional*, defaults to `None`):
        Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
        functions with the prompts and completions and sum the rewards. Can be either:

        - A single reward function: Can be a string (path to model), a [`~transformers.PreTrainedModel`], or a
          custom callable function.
        - A list of reward functions: Must all be of compatible types.

        Note: Only one of `judge`, or `reward_funcs` should be provided.
    args (`OnlineDPOConfig`):
        The online DPO config arguments to use for training.
    data_collator (`transformers.DataCollator`):
        The data collator to use for training. If None is specified, the default data collator
        (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the
        sequences in the batch, given a dataset of paired sequences.
    train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
        The dataset to use for training.
    eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
        The dataset to use for evaluation.
    processing_class ([`~transformers.PreTrainedTokenizerBase`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`):
        Processing class used to process the data. If provided, will be used to automatically process the inputs
        for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
        reuse the fine-tuned model.
    reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
        Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:

        - A single processing class: Used when `reward_funcs` contains only one reward function.
        - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.

        If set to `None`, the tokenizer for each model-based reward function is automatically loaded using
        [`~transformers.AutoTokenizer.from_pretrained`].
    peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
        PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
    compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
        The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
        metric values.
    callbacks (`list[transformers.TrainerCallback]`):
        The callbacks to use for training.
    optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
        The optimizer and scheduler to use for training.
    preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
        The function to use to preprocess the logits before computing the metrics.

.. deprecated:: 0.22.0
    The following parameters are deprecated and will be removed in a future version:

    * `reward_model`: Use `reward_funcs` instead. For example, change `reward_model=model` to `reward_funcs=model`.
    * `reward_processing_class`: Use `reward_processing_classes` instead. For example, change
      `reward_processing_class=tokenizer` to `reward_processing_classes=tokenizer`.
trlz
online-dpoNrS   	ref_modelreward_funcsrU  rX   data_collatortrain_dataseteval_datasetprocessing_classreward_processing_classespeft_configr   compute_metrics	callbacks
optimizerspreprocess_logits_for_metricsreward_modelreward_processing_classr   c                   > [        US5      (       a)  [        US5      (       a  [        USS5      S:X  a  SUl        X!L a  [        S5      eX l        Ub2  [
        R                  " S5        Uc  UnO[
        R                  " S5        Ub2  [
        R                  " S5        U
c  Un
O[
        R                  " S	5        [        S
 XC4 5       5      nUS:X  a  [        S5      eUS:  a   Ub  [        R                  " S[        5        S nX@l        UGb  [        U[        5      (       d  U/n/ U l        UR                  =(       d    0 n[!        U5       H  u  nn[        U["        5      (       a  [$        R&                  " U4SS0UD6UU'   [        UU   [(        R*                  5      (       aF  U R                  R-                  UU   R.                  R0                  R3                  S5      S   5        M  U R                  R-                  UU   R4                  5        M     X0l        U
c  S /[9        U5      -  n
O<[        U
[        5      (       d  U
/n
O#[9        U
5      [9        U5      :w  a  [        S5      e/ U l        [=        X5       H  u  nn[        U[>        5      (       af  Uc*  [@        R&                  " UR.                  R0                  5      nURB                  c  URD                  Ul#        URB                  UR.                  l!        U R:                  R-                  U5        M     OS U l        / U l        / U l        Ub  URH                  b  [9        URH                  5      [9        U R6                  5      :w  a8  [        S[9        URH                  5       S[9        U R6                  5       S35      e[J        RL                  " URH                  [J        RN                  S9U l$        OC[J        RP                  " [9        U R6                  5      [J        RN                  S9U l$        OS U l$        URR                  b/  Uc,  Uc)  Ub  [        R                  " S[T        SS9  O[        S5      eUc  [        S5      eU	c  [        S5      eUR                  =(       d    0 n[        U["        5      (       a  UnURW                  S5      n[        U[J        RX                  5      (       d	  US:X  d  Uc  O:[        U["        5      (       a  [        [J        U5      nUUS'   O[        SU S 35      e[Z        R&                  " U40 UD6nOUR                  b  [        S!5      eUR.                  R\                  U l.        UR.                  R^                  [`        Rb                  " 5       ;   U l2         URh                  (       a  U Rk                  X5      nURl                  (       a-  [o        U5        U R                  b  [o        U R                  5        Uc	   S U l        O X l        U R                  Rs                  5         Ub0  U H*  n[        U[>        5      (       d  M  URs                  5         M,     URt                  U l:        / / / / / / / / / / / S".U l;        U R6                  b-  / U Rv                  S#'   / U Rv                  S$'   / U Rv                  S%'   UR                  U l        SU l<        URz                  U l=        UR|                  U l>        UR~                  U l?        UR                  U l@        UR                  U lA        UR                  U lB        UR                  (       a  UR                  OS U lC        UR                  U lD        UR                  U lE        UR                  U lF        [        U	[        5      (       a  U	R                  nO#[        U	[        5      (       a  U	nO[        S&5      eURF                  c  URD                  Ul#        URF                  U l#        URB                  U l!        UR                  U lK        [        U	S'S 5      U lL        [        U	S(S 5      U lM        [        U	S)S 5      U lN        S U lO        U R                  b!  UR                  U R                  /5      U lO        Uc  [        U RB                  S*9nSUR                  S+'   [        T U G]Q  UUUUUU	UUUUS,9
  [        U R                  S-5      (       a%  U R                  R                  U R                  5        UR                  U lY        U R                  (       Ga  [        5       (       d  [        S.5      eU R                  S/:X  a  U R                  R                  (       a  UR                  b  UR                  nOS0UR                   S1UR                   3n[        UUR                  S29U lc        U R                  R                  [J        R                  R                  5       S39  GOS U lc        GOU R                  S4:X  Ga  UR                  U R                  U R                  U R                  U R                  R                  U R                  -  URt                  UR                  -   S5U R                  R                  U R                  -  S6S7.	n[#        U R                  R                  5      [        R                  S8'   [#        U R                  R                  5      [        R                  S9'   [#        U R                  R                  5      [        R                  S:'   [        R                  RW                  S;S<5      [        R                  S;'   [        R                  RW                  S=S>5      [        R                  S='   UR                  U lq        O[        S?U R                   S@35      eUR                  U ls        SU lt        SU R                  U Rz                  U R|                  U R~                  c  SOU R~                  U R                  c  SAOU R                  UR                  SSB.nUR                  b  UR                  UR                  5        U R                  (       a  [        U R                  SC9USD'   [        SN0 UD6U ly        U R                  R                  5         GOUR                  SU RB                  UR                  U R                  U Rz                  U R~                  U R|                  U R                  U R                  Rh                  (       d  SOSSE.
nU R                  b  U R                  USF'   UR                  b  UR                  UR                  5        U R                  (       a  SGUSH'   SIUSJ'   SKUSL'   UR                  5        VVs0 s H  u  nnUc  M  UU_M     nnn[        SN0 UD6U ly        U R                  (       a  U R                  b=  [        U R                  UR                  UGR                   UGR                  5      U l        U R6                  bZ  [!        U R6                  5       H@  u  nn[        U[>        5      (       d  M  [        UU R                  5      U R6                  U'   MB     g g U R                  b6  U R                  GR                  U R                  GR                  5      U l        U R6                  b`  [!        U R6                  5       HF  u  nn[        U[>        5      (       d  M  U R                  GR	                  USSSM9U R6                  U'   MH     g g s  snnf )ONvllm_enginerc  FTz`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the same as `model`, either omit the `ref_model` argument or pass `None`.zThe `reward_model` parameter is deprecated and will be removed in version 0.25.0. Please use `reward_funcs` instead. For example, change `reward_model=model` to `reward_funcs=model`.zfBoth `reward_model` and `reward_funcs` are provided. Using `reward_funcs` and ignoring `reward_model`.zThe `reward_processing_class` parameter is deprecated and will be removed in version 0.25.0. Please use `reward_processing_classes` instead. For example, change `reward_processing_class=tokenizer` to `reward_processing_classes=tokenizer`.zBoth `reward_processing_class` and `reward_processing_classes` are provided. Using `reward_processing_classes` and ignoring `reward_processing_class`.c              3   (   #    U  H  oS Lv   M
     g 7fNr   ).0xs     r\   	<genexpr>4_UnslothOnlineDPOTrainer.__init__.<locals>.<genexpr>  s     J4Iqd]4Is   r   z2One of `judge` or `reward_funcs` must be provided.ro   zXBoth `judge` and `reward_funcs` are provided. Using `judge` and ignoring `reward_funcs`.
num_labels/ri   zRThe number of reward processing classes must match the number of reward functions.zNumber of reward weights (z)) must match number of reward functions ()dtypezThe `missing_eos_penalty` parameter is deprecated when used with the deprecated `reward_model` parameter. Please use `reward_funcs` instead of `reward_model` to continue using this feature.r   )
stacklevelzH`missing_eos_penalty` is only supported when `reward_funcs` is provided.z`args` must be provided.z$`processing_class` must be provided.r  r   zInvalid `dtype` passed to `OnlineDPOConfig`. Expected either 'auto' or a string representing a `torch.dtype` (e.g., 'float32'), but got .zYou passed `model_init_kwargs` to the `OnlineDPOConfig`, but your model is already instantiated. This argument can only be used when the `model` argument is a string.)objective/klobjective/entropyobjective/non_score_rewardrewards/chosenrewards/rejectedrewards/accuraciesrewards/marginslogps/chosenlogps/rejectedval/contain_eos_tokenbetaobjective/rlhf_rewardobjective/scores_marginobjective/scoreszWThe `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`image_token_idvision_start_token_idvision_end_token_id)r   estimate_tokens)
rS   rX   r  r  r  r  r  r  r  r  add_model_tagszkvLLM is not available and `use_vllm` is set to True. Please install vLLM with `pip install vllm` to use it.serverzhttp://:)base_urlconnection_timeoutr   r   external_launcheri   )	rS   tensor_parallel_sizerq  
model_implmax_num_seqsmax_model_lendistributed_executor_backendr   max_num_batched_tokensRANK
LOCAL_RANK
WORLD_SIZEMASTER_ADDR	localhostMASTER_PORT12345z6vllm_mode must be either 'server' or 'colocate', got 'z'.r   )nr\  rX  rY  rZ  r[  
max_tokens
detokenize)regexguided_decoding)
rV  	do_sampler   bos_token_ideos_token_idrX  rZ  rY  r\  	use_cacher[  r   max_batch_tokensi   
num_blocks   
block_size)evaluation_modedevice_placementr   )rV   getattrrc  r   r  rG   warnr   r6   r   UserWarningrU  
isinstancelistreward_func_namesrn  	enumeratestrr   from_pretrainedr9   Modulerz   config_name_or_pathsplitr{  r  lenr  rs   r    r   r   	eos_token	pad_tokenro  rC   r   ru   onesr`  DeprecationWarninggetr  r   is_encoder_decoder
model_typer   keysis_vision_modelr>   r9  _enable_gradient_checkpointingrb  r+   r*   evalrW  statsnum_generationsrX  rY  rZ  r[  r\  r^  rg  rf  rl  rd  r"   	tokenizerr!   	TypeErrorr  r  r  r  image_tokendecoder   warnings_issuedrw  rx  rS   r  
_tag_namesr  _betar3   ImportErroracceleratoris_main_processrh  ri  rj  r'   rk  vllm_clientinit_communicatorcudacurrent_devicename_or_pathrX   r   rV  process_indexr;   environlocal_process_indexnum_processesr  llmre  guided_decoding_regex_last_loaded_stepr]  updateGuidedDecodingParamsr   generation_configwait_for_everyoner  itemsr   is_deepspeed_enabledr=   r  r  rt   r   prepare_model)!rW   rS   r  r  rU  rX   r  r  r  r  r  r  r  r  r  r  r  r  reward_configsrn  ireward_funcreward_processing_class_imodel_idr  r  r  vllm_kwargsgeneration_paramsr]  kvry  s!                                   r\   rx  !_UnslothOnlineDPOTrainer.__init__  sb   , 5-((WT:-F-Fj%0E9 $X 
 # #MMw
 #+&
 #.MM` )0,C)Z JU4IJJQQRRa n  $
 #lD11 ,~%'D" !% 6 6 <""+L"9;k3//&H&X&X#'01'5F'LO l1oryy99**11,q/2H2H2V2V2\2\]`2abd2ef**11,q/2J2JK #: !- )0-1FS5F,F) 94@@-F,G)01S5FF$l  .0D*:=>W:f6);k?;;084A4Q4QR]RdRdRrRr4s10==E>W>a>a1;6O6\6\K&&3..556OP ;g !%D%'D"-/D* #"".t**+s43D3D/EE$4S9L9L5M4N O&&)$*;*;&<%=Q@  ',ll43F3Femm&\#&+jjT5F5F1Gu}}&]#"&D##/L4HU]'j& 	 !!kll<788 #CDD 228beS!!H &))'2E%--&EME3''u--2!'* OOTgUVX 
 )88WEVWE%%1 \  #(,,"A"A$||66:d:i:i:kk &&77DE $U+~~)(8  "&&NNN! #+k?;;$$&  , // !#*,  ""$! %'

 (24DJJ./46DJJ01-/DJJ)*  ++ZZ
ZZ
ZZ
"&"9"9&*&A&A#+/==d+/+K+K()-)G)G&#33 &77(22I(*ABB(Iuvv&"+"5"5I",,%22%22 &&68H$O%,-=?VX\%]"#*+;=RTX#Y *(//1D1D0EFD  6DDUDUVM 48/0''%-+!*G 	 	
 4::/00JJ%%doo6YY
 ===$&&!4 
 ~~)##3300<#'#<#<%,T-B-B,C1TEZEZD[#\'18X\XpXp'qD$$$66ejj>W>W>Y6Z'+D$:-"//,0,J,J.2.N.N"&"6"6$(II$I$IDLjLj$j%)__t7J7J%J4G ,,::d>\>\\.2
 &))9)9)G)G%H

6"+.t/?/?/S/S+T

<(+.t/?/?/M/M+N

<(,.JJNN=+,V

=),.JJNN=',R

=) ,, #YZ^ZhZhYiik!lmm)-)H)HD&%'D"&*&=&=#//#zz1tzz $

 2

"11#	! %%1!(()?)?@))7KRVRlRl7m!"34%3%H6G%HD"..0 #'"5"5! $ 1 1 ) 6 6 $ 1 1#//&*&=&=)-)I)ITu! zz%-1ZZ!'*%%1!(()?)?@**8;!"4526!,/25!,/2C2I2I2K ]2K$!QqA2K ]%5%J8I%JD"$$~~)!2NND$D$DdiiQUQZQZ"   ,&/0A0A&BNA{!+??/@dN^N^/_))!, 'C -
 ~~)!%!2!243C3C3J3J!K  ,&/0A0A&BNA{!+??/3/?/?/M/M'PT 0N 0))!, 'C -# !^s   
|;-|;c                     [        U R                  [        5      (       aM  U R                  R                  nU[        U R                  5      :  a  U R                  U   $ U R                  S   $ U R                  $ )Nri   )r  r  r  stateepochr  )rW   r  s     r\   r  _UnslothOnlineDPOTrainer.beta[  sX    djj$''JJ$$E(-DJJ(?4::e$STZZPR^S::r_   r  r  c                 L   U(       dd  U" U S   SS9nUR                   bL  [        US   5      nUS:X  d  UR                   US   S   :w  a"  UR                   /US   -   US'   S/US   -   US'   O
U" U S   SS9nUR                  5        VVs0 s H  u  pVS	U 3U_M     nnnU$ s  snnf )
z2Tokenize a single row from a DPO specific dataset.promptF)add_special_tokensr   r   ro   attention_maskTprompt_)r  r  r  )featurer  r  batchprompt_len_input_idskeyvalues          r\   tokenize_row%_UnslothOnlineDPOTrainer.tokenize_rowc  s     "gh/EJE%%1'*5+='>$'1,	0F0F%P[J\]^J_0_*3*@*@)AE+DV)VE+&/0cE:J4K.KE*+gh/DIE:?++-H-JC73%%'-H Is   	B c                 J   U R                   c  [        S5      eU R                   nU R                  nU R                  UU R                  R
                  U R                  R                  U R                  R                  S.n[        U[        R                  R                  R                  5      (       dN  U R                  5       US'   U R                  R                  US'   [        US'   U R                  R                   US'   U R"                  R%                  ['        U40 UD65      $ )Nz+Trainer: training requires a train_dataset.r   
collate_fnnum_workers
pin_memorypersistent_workerssampler	drop_lastworker_init_fnprefetch_factor)r  r   r  _train_batch_sizerX   r  r-  r.  r  rC   utilsdatar   _get_train_samplerr  rA   r  r  preparer   )rW   r  r  dataloader_paramss       r\   get_train_dataloader-_UnslothOnlineDPOTrainer.get_train_dataloadert  s    %JKK****00'99;;))99"&))"I"I
 -)9)9)I)IJJ+/+B+B+Di(-1YY-K-Kk*2=./37993W3W/0''
=(VDU(VWWr_   c                 (   Uc  U R                   c  [        S5      e[        U[        5      (       a  UOSn[	        U S5      (       aR  X R
                  ;   aC  U R                  R                  (       a(  U R                  R                  U R
                  U   5      $ [        U[        5      (       a  U R                   U   OUb  UOU R                   nU R                  nU R                  R                  UU R                  R                  U R                  R                  U R                  R                  S.n[        U[        R                  R                   R"                  5      (       dF  U R%                  U5      US'   U R                  R&                  US'   U R                  R(                  US'   [+        U40 UD6nU R                  R                  (       a(  [	        U S5      (       a  XPR
                  U'   OX%0U l        U R                  R                  U5      $ )Nz-Trainer: evaluation requires an eval_dataset.r  _eval_dataloadersr,  r1  r2  r4  )r  r   r  r  rV   r>  rX   r.  r  r9  r  eval_batch_sizer  r-  rC   r6  r7  r   _get_eval_samplerr  r  r   )rW   r  dataloader_keyr  r:  eval_dataloaders         r\   get_eval_dataloader,_UnslothOnlineDPOTrainer.get_eval_dataloader  s   D$5$5$=LMM *4L#)F)FFD-.."8"88		77##++D,B,B>,RSS ,,, l+ ' "" 	 ** ))33'99;;))99"&))"I"I
 ,(8(8(H(HII+/+A+A,+Oi(-1YY-K-Kk*37993W3W/0 %\G5FG9922t0119H&&~6*8)J&''88r_   c                    SUR                   l        [        U5      (       a  UR                  R	                  5         OUR	                  5         UR
                  =(       d    0 nSU;  =(       d    US   nU(       a  UR                  5         U$ )z-Enables gradient checkpointing for the model.Fuse_reentrant)r  r  r2   
base_modelgradient_checkpointing_enabler:  enable_input_require_grads)rW   rS   rX   r:  rF  s        r\   r  7_UnslothOnlineDPOTrainer._enable_gradient_checkpointing  s~     "' ::< //1(,(J(J(Pb%#@@rDabqDr 	 ,,.r_   c           	      @   U R                   nU R                  nU R                  S:X  a  U R                  X5      u  pVO#U R                  S:X  a  U R	                  X5      u  pV[        S W 5       5      nU Vs/ s H%  nS/U[        U5      -
  -  S/[        U5      -  -   PM'     n	nU Vs/ s H  o/U[        U5      -
  -  U-   PM     nnU R                  R                  n
W Vs/ s H%  nS/[        U5      -  S/U
[        U5      -
  -  -   PM'     nnU Vs/ s H"  nUS   U:w  a  [        U5      U
:  a  X/-   OUPM$     nnU Vs/ s H  oU/U
[        U5      -
  -  -   PM     nn[        R                  " X`R                  R                  S9n[        R                  " XR                  R                  S9n	[        R                  " XPR                  R                  S9n[        R                  " XR                  R                  S9nXiX[4$ s  snf s  snf s  snf s  snf s  snf )Nr  r   c              3   8   #    U  H  n[        U5      v   M     g 7fr  )r  )r  idss     r\   r  :_UnslothOnlineDPOTrainer._generate_vllm.<locals>.<genexpr>  s     ?JSCJs   r   ro   ri   r   )r  r   rg  _generate_vllm_server_generate_vllm_colocateru  r  r  r  rC   r   r  r   )rW   promptsimagesr  r   completion_ids
prompt_idsmax_prompt_lengthrM  prompt_maskr  completion_masks               r\   _generate_vllm'_UnslothOnlineDPOTrainer._generate_vllm  s   (((( >>X%)-)C)CG)T&NJ^^z))-)E)Eg)V&N  ?J??XbcXbQTs/#c(:;qcCHnLXbcWabWaPSn(9CH(DEKWa
b++66
UcdUccA3S>QC:C3H,IIUcd &
% %(G|$;C:@UC. [^^% 	 
 UccTbS*s3x2G HHTbc \\*5E5E5L5LM
ll;7G7G7N7NOn=M=M=T=TU,,?O?O?V?VWGG! dbd
 ds   6,H(H",H)HHc                    USLn[        U S5      (       aP  U R                  R                  U R                  :w  a,  U R	                  5         U R                  R                  U l        O<[        U S5      (       d+  U R	                  5         U R                  R                  U l        [        SUS   05      (       a,  U Vs/ s H  n[        SU0U R                  5      S   PM      nnOUn[        U5      nU(       a  [        U5      nU R                  R                  (       Ga  USSU R                  2   nU(       a  WSSU R                  2   n	OSn	U R                  R                  UU	U R                  U R                  U R                  U R                   U R"                  c  SOU R"                  U R$                  c  SOU R$                  U R&                  R(                  [        U S5      (       a  U R*                  OSU R,                  R.                  S9n
U
 VVs/ s H  o  H  o/PM     M     n
nnOS/[1        U5      S	-  -  n
[3        U
SS
9n
[5        U R                  R6                  [1        U5      -  S	-  U R                  R6                  S-   [1        U5      -  S	-  5      nX   n
U R                  USSSSS9n/ nUS    H2  nUR9                  UR;                  5       UR;                  5       /5        M4     X4$ s  snf s  snnf )z+Generate completions using vLLM server modeNr	  r   r   ri   r   r  )rQ  rR  r  r\  rX  rY  rZ  r[  r  r  r]  r   )from_processro   ptTleftFtextreturn_tensorspaddingpadding_sider!  r   )rV   r  global_stepr	  _move_model_to_vllmr0   r(   r  r-   r  r  r  r  generater\  rX  rY  rZ  r[  r  r  r  rX   r]  r  r)   slicer  extendtolist)rW   rQ  rR  
has_imagespprompts_textall_prompts
all_imagesordered_set_of_promptsordered_set_of_imagesrS  prompt_completionscomp_idprocess_sliceprompt_inputsrT  prompt_tokenss                    r\   rO  ._UnslothOnlineDPOTrainer._generate_vllm_server  s   4'
 4,--$**2H2HDLbLb2b$$&%)ZZ%;%;D"233$$&%)ZZ%;%;D" h
344ipqipde/1t?T?TUV^_ipLqL"L#L1&v.J+++ &11HD4H4H1H%I"(23Jd6J6J3J(K%(,%!--66.,&&#'#:#: ,,jj JJ.bDJJ!ZZ/cTZZ11<<DKDRiDjDjd&@&@pt"&))"="= 7 N CQs.,>`rU\i`ri.NsN"Vs;'7!';<N /~AN **S\9A=++a/3w<?!C
 (6 --$ . 
 
*;7M}335}7K7K7MNO 8))q r> ts   ;%K4K9c           
         U R                  5         [        SUS   05      (       a,  U Vs/ s H  n[        SU0U R                  5      S   PM      nnOUnUbC  / n[	        XB5       H1  u  pgUb  UR                  USU0S.5        M   UR                  U5        M3     OUnU R                  R                  XPR                  SU R                  R                  SSS9S	9n[        S
5       V	V
s/ s H.  o  H%  n
[        U
R                  U	   R                  5      PM'     M0     nn	n
[        S
5       VV
s/ s H!  o  H  n
[        U
R                  5      PM     M#     nnn
X4$ s  snf s  sn
n	f s  sn
nf )z-Generate completions using vLLM colocate moder   r   image)r   multi_modal_dataFonline_dpo_trainer_lora_modelT)load_tensors)use_tqdmlora_requestr   )rd  r0   r(   r  rs   rz   r  re  r  rS   	load_loraranger  outputs	token_idsprompt_token_ids)rW   rQ  rR  rj  rk  vllm_inputsr   rw  r  r  rZ   rS  _rT  s                 r\   rP  0_UnslothOnlineDPOTrainer._generate_vllm_colocate8  sv    	  " h
344ipqipde/1t?T?TUV^_ipLqL"L K!$\!:$&&&wX]N^'_`&&v.	 "; 'K((##K1G1GRWhlhrhrh|h|  ~]  nrh|  is#  tEJ1XdX\cRX$v~~a0::;\c;Xd=B1X\XT[&d6223T[3X
\))) r" e\s   %E'5E -(E&c                 8   U R                   R                  R                  nUSL=(       a    UR                  S:H  nU(       a  SSKnUR
                  R                  nO[        n[        U R                  5      (       Ga  U" [        U R                  R                  5       5      5         U R                  R                  5         U R                  (       a|  [        U R                   R                  SS5      nU(       a  [        USS5      OSnUS:X  a  U R                  U R                  5        GO US:X  a  U R!                  U R                  5        OU R                  R#                  5        H  u  pxUR%                  S5      R'                  S	S
5      nU R                  R(                  U;   a  MB  SU;   a  MJ  U R+                  US/S9nU R,                  S:X  aB  U R                   R.                  (       a'  U R0                  R3                  XxR4                  5        M  U R,                  S:X  d  M   M     U R                  R7                  5         SSS5        GO5U R                  (       a{  [        U R                   R                  SS5      nU(       a  [        USS5      OSnUS:X  a  U R                  U R                  5        OUS:X  a  U R!                  U R                  5        OU R                  R#                  5        H  u  pxU R+                  U5      nU" U/5         U R,                  S:X  aA  U R                   R.                  (       a&  U R0                  R3                  XxR4                  5        OU R,                  S:X  a    SSS5        M     U R,                  S:X  a6  U R                   R.                  (       a  U R0                  R9                  5         gU R,                  S:X  a  U R:                  R9                  5         gg! , (       d  f       N= f! , (       d  f       GM"  = f)zSSynchronize model weights to vLLM server with support for PEFT, DeepSpeed, and FSDPN   r   fsdp_pluginfsdp_versionro   r   zbase_model.model.z.base_layerr   original_modulezmodules_to_save.default.extra_prefixesr  r   )r  r  deepspeed_plugin
zero_stager"  zeroGatheredParametersr:   r2   rS   r  
parametersmerge_adapteris_fsdp_enabledr  _sync_fsdp1_params_to_vllm_sync_fsdp2_params_to_vllmnamed_parametersremoveprefixreplaceprefix_fix_param_name_to_vllmrg  r  r  update_named_paramr7  unmerge_adapterreset_prefix_cacher  )	rW   r  zero_stage_3r"  gather_if_zero3r  r  nameparams	            r\   rd  ,_UnslothOnlineDPOTrainer._move_model_to_vllmU  s#     ++11BB't3X8H8S8SWX8X'nn??O)O$$ !djj&;&;&=!>?

((* '' #*$*:*:*@*@-QU"VKNY7;#J_`L#q(77

C%*77

C (,zz'B'B'D#001DEMMm]_`::,,4$,4$#;;DRlQm;n>>X5$:J:J:Z:Z ,,??jjQ!^^z9  ! (E$ 

**,E @?L ##%d&6&6&<&<mTRJUw{NAF[\1$33DJJ?!Q&33DJJ?#'::#>#>#@KD77=D(%1>>X5$:J:J:Z:Z ,,??jjQ!^^z9   21 $A >>X%$*:*:*J*J//1^^z)HH'') *y @?` 21s    'E=O8(O8A$P	8
P	
P	moduler  c                    Uc
  [        5       nUR                  5        H%  u  pEU(       a  U SU 3OUnU R                  XVUS9  M'     [        U[        5      (       a  [        R
                  " USSS9   UR                  5        H  u  pxU(       a  U SU 3OUn	U R                  U	S/S9n	X;   a  M-  UR                  U	5        U R                  S:X  aB  U R                  R                  (       a'  U R                  R                  XR                  5        M  U R                  S	:X  d  M   M     SSS5        gg! , (       d  f       g= f)
zdMemory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM.Nr  )r  visitedF)recurse	writebackz_fsdp_wrapped_module.r  r  r   )setnamed_childrenr  r  r   summon_full_paramsr  r  addrg  r  r  r  r  r7  )
rW   r  r  r  
child_namechild_modulechild_prefix
param_namer  	full_names
             r\   r  3_UnslothOnlineDPOTrainer._sync_fsdp1_params_to_vllm  s&    ?eG(.(=(=(?$J7=fXQzl3:L++7 ,  )@ fd##((%P)/)@)@)B%J<B6(!J< 8
I $ < <YXoWp < qI + KK	*~~1d6F6F6V6V((;;IzzR:5 *C QP $PPs   2B/D4%D44
Ec                 x   UR                  5        H  u  p#UR                  (       a%  UR                  [        R                  " S5      5      nUR                  5       nU R                  S:X  a8  U R                  R                  (       a  U R                  R                  X#5        M  U R                  S:X  d  M   M     g )Nr   r  r   )r  is_cpurt   rC   r   full_tensorrg  r  r  r  r  )rW   r  r  r  s       r\   r  3_UnslothOnlineDPOTrainer._sync_fsdp2_params_to_vllm  s    !<<>KD||f!56%%'E~~)d.>.>.N.N  33D@:- *r_   r  c                 ^    U=(       d    / nS/U-   nU H  nUR                  US5      nM     U$ )z,Clean parameter names for vLLM compatibilityz_checkpoint_wrapped_module.r   )r  )rW   r  r  prefixesr  s        r\   r  0_UnslothOnlineDPOTrainer._fix_param_name_to_vllm  s8    '-212^CF<<+D r_   featuresc                     U=(       d    U R                   nU" US   /US   SS9nUS   S   nUUS   S   S.nS	U;   a  US	   S   US	'   S
U;   a  US
   S   US
'   SU;   a  US   S   US'   U$ )z@
Process a vision row for VLM models (adapted from DPO trainer)
rw  r   F)rR  r_  r!  r   r   r"  )prompt_input_idsprompt_attention_maskpixel_valuespixel_attention_maskimage_sizes)r  )rW   r  r  	processorprocessed_featuresr  rZ   s          r\   process_vision_row+_UnslothOnlineDPOTrainer.process_vision_row  s     %=(=(=	&x/@.AQYHZotu-k:1= !1%78H%I!%L
 //%7%G%JF>"!%77-?@V-WXY-ZF)*..$6}$Ea$HF=!r_   c                 V   [        UR                  5       5      R                  nU R                  nU R                  nU Vs/ s H  nSU0PM	     nnUb  [        U5       H  u  pXU	   S'   M     U Vs/ s H  n[        XR                  5      S   PM     nnU R                  Gbn  UGbj  [        R                  " U R                  5      n[        U R                  S5      (       Ga.  U R                  R                  (       Ga  [        R                  " XR                  R                  5      (       a7  U Vs/ s H)  n[        R                  " SU S3U R                  U5      PM+     nnOU R                  bs  [        R                  " U R                  R                   R#                  U R                  /5      5      nU Vs/ s H!  n[        R                  " SU SU 3SU5      PM#     nnO,U Vs/ s H  n[        R                  " SU S3SU5      PM!     nn0 nUb  SU Vs/ s H  nU/PM     sn0nU R                  " S$US	S
SSS.UD6nUR%                  5        VVs0 s H  u  nnUUR'                  U5      _M     nnnSU;   aQ  [)        USS5      nUc'  [        US5      (       a  UR*                  R,                  nUb  US   R'                  U5      US'   US   R/                  SS5      nUS   R/                  SS5      n0 nU R0                  (       a}  Ubz  SU;   a  US   R/                  SSSS5      US'   SU;   a  US   R/                  SS5      US'   SU;   a  US   R/                  SS5      US'   SU;   a  US   R/                  SS5      US'   U R2                  (       Ga  U R4                  R6                  R8                  n[;        5       (       a  SU R4                  R6                  l        OSU R4                  R6                  l        [=        U S5         [?        XR@                  U RB                  RD                  S9 n[F        RH                  " 5          U RJ                  (       a  [L        RN                  " U R4                  SS9O	[Q        5          U RB                  RR                  (       a   UR'                  [F        RT                  5        O:U RB                  RV                  (       a  UR'                  [F        RX                  5        [F        RZ                  " 5          UR]                  UR_                  5       U R`                  SS9nSSS5        SSS5        SSS5        SSS5        SSS5        WRc                  5        Vs/ s H  nURd                  PM     nnU Vs/ s H  n[F        Rf                  " UUS9PM     nn[i        UU R                  SS 9n[F        Rj                  " UU/SS!9nUU R4                  R6                  l        URm                  S5      n USS2U S24   n[o        UXV5      u  nn!UUUU!4$ [=        U S"5         [?        XR@                  U RB                  RD                  S9 n[F        RH                  " 5          U RJ                  (       a  [L        RN                  " U R4                  SS9O	[Q        5          U RB                  Rp                  b%  U RB                  Rp                  UR`                  l8        URr                  " S$UUU R`                  S#.UD6nSSS5        SSS5        SSS5        SSS5        WSS2URm                  S5      S24   n[o        UXV5      u  nn!UUUU!4$ s  snf s  snf s  snf s  snf s  snf s  snf s  snnf ! , (       d  f       GNL= f! , (       d  f       GNV= f! , (       d  f       GN`= f! , (       d  f       GNj= f! , (       d  f       GNt= fs  snf s  snf ! , (       d  f       N= f! , (       d  f       N= f! , (       d  f       N= f! , (       d  f       N= f)%z$Generate completions using the modelr   Nrw  chat_template(z)+r   rR  r\  Tr]  Fr^  r  r  r  r   r   ro   r"  r  r  image_grid_thwpaged_attention
sdpa_pagedztransformers.generate_batch)gather_deepspeed3_params)r  )r  progress_barr   right)padding_valuerb  rn   ztransformers.generate)r   r"  r  r   ):nextr  r   r  r   r  r8   r  r  r@   escaperV   r  searchsubr  r  r  r  rt   r  r  r  repeatr  r^  model_wrappedr  _attn_implementationr1   r?   rE   r  rX   rm  rC   no_gradr  r   r  r:   r  bfloat16r  float16inference_modegenerate_batchrh  r  valuesgenerated_tokensr   r<   catsizerD   r_  re  )"rW   rS   rQ  rR  r   r  r   r   inputsr  rw  r  rk  escaped_img_tokenr_  escaped_eoi_tokenrY   imgrs  r  r  model_dtyperT  rV  vision_generation_kwargsprevious_attnunwrapped_modelall_outputsrZ   rS  rM  prompt_completion_idsprompt_lengthrW  s"                                     r\   	_generate"_UnslothOnlineDPOTrainer._generate  sg   e&&()00(((( 4;;78V$7; %f-%*q	'" . `ff_eZ[1!5J5JKHU_ef 'F,> "		$*:*: ;t,,o>>4CXCXCfCfCf99.0E0E0S0STT `l$_kW[!$5#6b94;K;KTR_k ! $L
 //;,.II 11;;BBDD\D\C]^-) iu(ht`dBFFa(9':"=N<O#PRTVZ[ht % (
 bn'namY]!4E3Fb/I2t(Tam'n & 9&3#& 9:F -- 
$
 
 6C5H5H5JK5JTQADDL5JK]*!%$7K"wuh'?'?#ll00&0=n0M0P0PQ\0]n- #;/66q!<
#$45<<QB $& F$6.;H;X;_;_`acdfgij;k(8%6CPQgChCoCopqstCu()?@-:G:V:]:]^_ab:c(7=0=JK[=\=c=cdegh=i()9:&&& ..55JJM(**AR""))>AM""))>!$(EF+++diiFiFi$NRNbNb''(:(:EJhshuu 99>>#&&u~~6YY^^#&&u}}5))+"1"@"@"))+*.*@*@%* #A #K , v   G$ EPDVDVDXYDX&f55DXNYJXY.3ell3v>.NY t?P?P_fgN$)IIz>.JPQ$R!=JD%%: 'OOA.M21mn3DEN.<^\.h+NO{NOKK "$(?@+++diiFiFi$NRNbNb''(:(:EJhshuu 9911=MQYYMkMkO55J )11 (#.&*&<&< /	 v   A& $Azq'9';$;<N.<^\.h+NO{NOKK[ < g$(
 (o
 !: L\ ,+ vu    GF$ ZY( vu    A@s    ]2"]$30]);(].*&]3]8]=)_8^9;^'	B^	*^>^	^'^9_8__"9)`"`	8;_83A_'	_8`	 `
^^	
^$^''
^61^99
_	_
_'
_51_88
``		
`	`
`(c           	         U R                   R                  n[        R                  " [	        U5      [	        U R
                  5      US9nU R                  US'   [        [        U R
                  U R                  5      5       GH{  u  nu  p[        U[        R                  5      (       a  [        SUS   05      (       aB  [        X5       V
Vs/ s H  u  pSX-   0PM     nn
nU Vs/ s H  n[        X5      S   PM     nnO![        X5       V
Vs/ s H	  u  pX-   PM     nn
nU	" USSS	S
S9nUR                  5        VVs0 s H  u  nnUUR!                  U5      _M     nnn[        R"                  " 5          U" S0 UD6R$                  SS2S4   USS2U4'   SSS5        GM   U" SXUS.UD6nU Vs/ s H  nUb  UO[        R&                  PM     nn[        R(                  " U[        R*                  US9USS2U4'   GM~     U R,                  b;  X`R,                  R!                  U5      R/                  S5      -  R1                  SS9nU$ UR1                  SS9nU$ s  snn
f s  snf s  snn
f s  snnf ! , (       d  f       GM   = fs  snf )z*
Calculate rewards using reward functions
r   trainer_stater   r   messagesr_  r\  Tr  Fr^  N)rQ  completionsrS  )r  r   ro   rn   r   )r  r   rC   zerosr  r  r  r  rs   r  r  r9   r  r0   r(   r  rt   r  r|   nanr   ru   ro  rw   nansum)rW   rQ  r  completion_ids_listreward_kwargsr   rewards_per_funcr  r  r  rj  cr  r  textsreward_inputsr  r  output_reward_funcrewardtotal_rewardss                        r\   !_calculate_rewards_from_functions:_UnslothOnlineDPOTrainer._calculate_rewards_from_functions  s}    !!(( ;;s7|S9J9J5KTZ[ *.o&9B!!4#A#AB:
5A5 +ryy11$h
%;<<@CG@YZ@YQU 3@YHZ^fg^fYZ0LVT^fEgE/27/HI/HtqQU/HEI !8tTPWlq! >K=P=P=R S=RTQADDL=R S))+-8-I=-I-P-PQRTUQU-V$QT* ,+ &1 &#M`&dq&" as%s`rV\0Bf		&Q`r"%s).6HPUP]P]fl)m A&5:
: *-0C0C0F0Fv0N0X0XYZ0[[cchicjM  -333:M9  [gI !T++ &ts*   I%I I%>I+5!I10J1
J	c                 v   [        UR                  S5      UR                  S5      -   U R                  -
  S5      nUS S 2US 24   nUS S 2US 24   n[        R                  " X$4SS9n[        R                  " X54SS9n	SU	0n
Ub8  SU;   a  US   U
S'   SU;   a  US   U
S'   SU;   a  US   U
S'   SU;   a  US   U
S'   U" U40 U
D6nUR                  S5      nUS:  a  US-
  OSnUR
                  S S 2US	24   n[        R                  " UR                  S	S9UR                  S	5      S
S9R                  S	5      nU$ )Nro   r   rn   r"  r  r  r  r  ri   r   )
ru  r  rW  rC   r  r|   take_along_dimlog_softmaxrw   rx   )rW   rS   rT  rV  rS  rW  vision_inputsnum_tokens_to_truncater  prompt_completion_maskmodel_kwargsrZ   
prompt_len	start_idxr|   logprobss                   r\   _forward!_UnslothOnlineDPOTrainer._forward  s   !$Z__Q%7.:M:Ma:P%PSWSbSb%bde!f  #9#: :;
!!%;%<"<= !&		:*FA N!&K+Iq!Q )*@A$./<^/L^,%67DE[7\34-.;M.J]+=01>?O1P-. ,==  __Q'
&01nJN!	q)B,/ ''(:(:r(:(BND\D\]_D`fghppqstr_   r  num_items_in_batchc                    UR                  5         US   n[        U5      nSU;   nS nU(       a  US   nU H  n[        U[        5      (       d  M  U Hx  n	[        U	[        5      (       d  M  U	R                  S5      n
U	R                  S5      n[        U
[        5      (       d  MS  US:X  a  SS0SU
S./U	S'   Mg  US	:X  d  Mo  SU
S./U	S'   Mz     M     U R                  R                  (       a  U R                  XG5      u  ppOU R                  XU5      u  pp[        R                  " XR                  :H  S
S9nS nU(       Gaw  U R                  (       Gae  U R                  R                  (       GdI  0 nSU Vs/ s H  nU/PM     sn0nU R                  " S5S/[        U5      -  SS.UD6n[!        USS 5      n[!        USS 5      nUc=  [#        US5      (       a,  UR$                  R&                  nUR$                  R(                  nSU;   a(  US   R+                  UUS9R-                  SSSS5      US'   SU;   a'  US   R+                  U5      R-                  SS5      US'   SU;   a'  US   R+                  U5      R-                  SS5      US'   SU;   a'  US   R+                  U5      R-                  SS5      US'   U R/                  XXUU5      n[        R0                  " 5          U R2                  b  U R/                  U R2                  XXU5      nOAU R4                  R7                  5          U R/                  U R4                  XXU5      nS S S 5        S S S 5        UR&                  nU R                  R9                  USS9n[;        SUS   05      (       a  U Vs/ s H	  nSUS./PM     nnU R<                  b  [?        UR@                  S   5       Vs/ s H  nUU   RC                  5       PM     nn0 nU Vs/ s H  nUS;  d  M  UPM     n nU  H6  n[        UU   [        [D        45      (       a  UU   S-  UU'   M.  UU   UU'   M8     U RF                  " S5SU-  UUS .UD6n!U R                  RH                  b"  U!U) ==   U R                  RH                  -  ss'   U!RK                  U5      u  n"n#U"U#:  n$OU RL                  b  [;        SUS   05      (       ah  [N        RP                  " 5       n%U%RS                  [T        5      n&U Vs/ s H  nU&RW                  US!9PM     nnU Vs/ s H  nU&RW                  US!9PM     nnU RL                  RM                  U[        [Y        US U UUS  5      5      5      n'[        RZ                  " U' V(s/ s H  n(U(S:H  PM
     sn(US"9n$[        R\                  " UUS"9n)U)W$) U-  -   n*U)U$U-  -   n+[        R^                  " U*U+4SS9n,UU,   n-WU,   n.URa                  5       ) n/U/U,   n0U-U0) -  Rc                  S5      n1U.U0) -  Rc                  S5      n2[        RJ                  " U1U5      u  n3n4[        RJ                  " U2U5      u  n5n6U3U4-
  n7U5U6-
  n8U7U8-
  n9U R                  Rd                  S#:X  a%  [f        Rh                  " U Rj                  U9-  5      * n:OKU R                  Rd                  S$:X  a  U9SSU Rj                  -  -  -
  S-  n:O[m        S%U Rd                   35      eU:Ro                  5       n;U R<                  b  W!U*   U!U+   -
  n<U Rp                  S&   Rs                  U Rt                  Rw                  U<Ro                  5       5      Ro                  5       Ry                  5       5        U Rp                  S'   Rs                  U Rt                  Rw                  U!Ro                  5       5      Ro                  5       Ry                  5       5        U Rp                  S(   Rs                  UR{                  5       Ro                  5       Ry                  5       5        U Rp                  S)   Rs                  U Rt                  Rw                  U35      Ro                  5       Ry                  5       5        U Rp                  S*   Rs                  U Rt                  Rw                  U45      Ro                  5       Ry                  5       5        UU-
  n=U=Rc                  S5      Ro                  5       n>U Rp                  S+   Rs                  U Rt                  Rw                  U>5      Ro                  5       Ry                  5       5        U Rj                  * U=-  Rc                  S5      n?U?Ro                  5       n@U Rp                  S,   Rs                  U Rt                  Rw                  U@5      Ro                  5       Ry                  5       5        U R<                  bX  W!U?-   nAU Rp                  S-   Rs                  U Rt                  Rw                  UA5      Ro                  5       Ry                  5       5        URc                  S5      Ro                  5       * nBU Rp                  S.   Rs                  U Rt                  Rw                  UB5      Ro                  5       Ry                  5       5        U Rj                  U3U5-
  -  nCU Rt                  Rw                  UC5      nDU Rp                  S/   Rs                  UDRo                  5       Ry                  5       5        U Rj                  U4U6-
  -  nEU Rt                  Rw                  UE5      nFU Rp                  S0   Rs                  UFRo                  5       Ry                  5       5        UDUF-
  nGU Rp                  S1   Rs                  UGRo                  5       Ry                  5       5        UGS:  nHU Rp                  S2   Rs                  UHR{                  5       Ro                  5       Ry                  5       5        U Rp                  S3   Rs                  U Rj                  5        U R                  R|                  b;  U R~                  R                  U R                  R|                  -  S:X  a
  [        5         0 nU R                  R                  [        R                  [        R                  4;   a  U R                  5       US4'   U R                  R                  S:  a  U;Ro                  5       n;U R                  (       a:  [        R                  U;U R                  5       nIUIR                  5         S S S 5        OU Rt                  R                  " U;40 UD6  U;R                  5       U R                  R                  -  $ s  snf ! , (       d  f       G
N6= f! , (       d  f       G
N@= fs  snf s  snf s  snf s  snf s  snf s  sn(f ! , (       d  f       N|= f)6Nr   rw  contentroleusertyper_  )r  r_  systemri   rn   rR  r   r\  )r_  r`  r   r  r  r  r  r   ro   r  r  r  T)skip_special_tokensr   	assistant)r  r
  )r   )rQ  r  r  )r  r   r   ipozinvalid loss type r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   )Otrainr  r  r  dictr  r  rX   rc  rX  r  rC   anyr  r  r  r  rV   r  r   r  rt   r  r  r  r  rS   disable_adapterbatch_decoder0   r  r~  rr   rh  tupler  r`  r  rU  r5   Environmentfrom_stringr$   renderrs   r   r   r  boolr   ra  r   
logsigmoidr  NotImplementedErrormeanr  rz   r  gather_for_metricsitemfloatr   r  rc  r,   r$  r   LOMOADALOMO_get_learning_raten_gpuuse_apexamp
scale_loss	optimizerbackwarddetachr   )JrW   rS   r  r  rQ  r   ri  rR  r   messager
  r  rT  rV  rS  rW  contain_eos_tokenr  r  rY   	processedmodel_devicer  r  ref_logprobsr   r  
completionr  r  r  r'  r  rewards
first_halfsecond_halfr   environmenttemplateranks_of_first_completionrankbatch_rangechosen_indicesrejected_indices
cr_indicescr_logprobscr_ref_logprobsr   cr_padding_maskcr_logprobs_sumcr_ref_logprobs_sumchosen_logprobs_sumrejected_logprobs_sumchosen_ref_logprobs_sumrejected_ref_logprobs_sumpi_logratiosref_logratiosr|   losseslossscores_marginklmean_klnon_score_rewardmean_non_score_rewardrlhf_rewardmean_entropychosen_rewardsgathered_chosen_rewardsrejected_rewardsgathered_rejected_rewardsmarginaccuracyscaled_losssJ                                                                             r\   training_step&_UnslothOnlineDPOTrainer.training_step  sJ    	"\
 &
G_F!fd++#))'488$")++i"8&{{62%gs33#v~7=w6GRXbiIj5k	 2!%!1?Ew6W5X	 2 $* " 99GKGZGZ[bGkDJ^_GK~~V[flGmDJ^!IIn8I8I&IrR $...tyy7I7I7I M& 9&3#& 9:F-- TCK'# I #5(D9L!%$7K#x(@(@$||22#ll00 *n-00[0QXXYZ\]_`bcd n- &28ABX8Y8\8\]i8j8q8qrsuv8w45	)/8/G/J/J</X/_/_`acd/em,9,2;<L2M2P2PQ]2^2e2efgij2k./==KQ`bop]]_~~)#}}NNJ^^k  ZZ//1#'==

J^^k$L 2  ++88]a8bh
344\gh\gj[ZHI\gKh (GL^MaMabcMdGe"fGe!>!#4#;#;#=Ge"f M#)C6CS
-BC6DCfSkD%=99)/qM#&)/M#&  << GReivG
 yy,,8**+tyy/L/LL+ '.mmJ&?#J,DZZ#
 !(GAJ!788$002&223GHJQR'8??F?;'RVabVa
x
CVab(,

(8(8c+kz":K
<TUV)% <<7P Q7Pt7P QZ`aDll:f=$
(:;&$*;< YY0@AqI
z*&z2 (,,..&z2&/)99>>qA./1AAFFqI 6;[[R\5]22=B[[I\^h=i:!:*-BB/2KK-99)+ll499v#566FYY  E)qA		M22q8F%(:4>>:J&KLL{{} (#N3g>N6OOMJJ0188  33M4F4F4HINNPUUW JJ)*11$2B2B2U2UV]VbVbVd2e2j2j2l2q2q2st

*+223D3J3J3L3Q3Q3S3X3X3Z[

>"))$*:*:*M*MNa*b*g*g*i*n*n*pq

#$++D,<,<,O,OPe,f,k,k,m,r,r,tu$&&).."

>"))$*:*:*M*Mg*V*[*[*]*b*b*de!YYJO003 0 5 5 7

/077//0EFKKMRRT	
 (!$44KJJ./66t7G7G7Z7Z[f7g7l7l7n7s7s7uv Q,,..

&'..t/?/?/R/RS_/`/e/e/g/l/l/no&9<S&ST"&"2"2"E"En"U

#$++,C,H,H,J,O,O,QR99(=@Y(YZ$($4$4$G$GHX$Y!

%&--.G.L.L.N.S.S.UV(+DD

$%,,V[[]-?-?-ABA:

'(//0@0E0E0G0L0L0NO

6!!$)), II--9

&&)J)JJaOM 99??~22N4J4JKK&*&=&=&?F?#99??Q99;D==dnn5$$& 65 %%d5f5{{}tyyDDDDo !:@ 21 _ i
 #g D8 Sb !Rt 65sa   7u2"Av	)u7v	vv 5
v%v%v*"v/v4v97
v	v		
v9
wc	                     U R                   R                  (       Ga  U R                  R                  U R                  :  Gat  0 n	U R                  U5      R                  5       R                  5       n
X-  n[        XR                  R                  U R                  -
  -  S5      U	S'   UbB  [        U[        R                  5      (       a  UR                  5       R                  5       OUU	S'   Ub  XS'   OU R                  5       U	S'   U R                  R                  5        H  u  p[!        U5      [#        U5      -  X'   M      U R                   Vs0 s H  o/ _M     snU l        U =R$                  U
-  sl        U R                  R                  U l        U R'                  5         U R)                  X5        S nU R                   R*                  (       aJ  U R-                  XF5      nU R/                  XS9nU R0                  R2                  S:X  a  XR                   l        U R                   R4                  (       aR  U R7                  X45        U R8                  R;                  U R0                  U R                  U R                   5      U l         g g s  snf )Nrj   rI  	grad_normr   )metricstrialbest)control
should_logr  rc  _globalstep_last_logged_nested_gatherr  r   roundr  rC   r   r+  r$  r  r  r   r  _total_loss_scalar
store_floslogshould_evaluate	_evaluate_determine_best_metricrX   r   should_save_save_checkpointcallback_handleron_save)rW   tr_lossr[  rS   r]  r  ignore_keys_for_eval
start_timer   logstr_loss_scalarr'  valr\  is_new_best_metrics                  r\   _maybe_log_save_evaluate1_UnslothOnlineDPOTrainer._maybe_log_save_evaluate  s    <<"""tzz'='=@\@\'\%'D "009>>@EEGN G ::3I3IDLhLh3h!iklmDL$AKIW\WcWcAdAdI$4$4$6$;$;$=js[!((5_%(,(?(?(A_% !JJ,,.Hs3x/	 /-1ZZ8Zcr'Z8DJ##~5#+/::+A+AD(OOHHT&<<''nnUAG!%!<!<W!<!Zyy&&&0+=(<<##!!%/0088DJJPTP\P\]DL $ 9s   Jc                   > U R                   R                  c*  [        U R                   R                  5      R                  nO(U R                   R                  R                  S5      S   nU R                  US9  [        TU ]!  X5        g )Nr  ri   )
model_name)	rX   r3  r   r   r  r  create_model_cardrw  rk  )rW   rS   r]  rx  ry  s       r\   rk  )_UnslothOnlineDPOTrainer._save_checkpoint  sj    99!!)dii22388J//55c:2>J*5 .r_   rx  dataset_nametagsc                    U R                  5       (       d  g[        U R                  R                  S5      (       ac  [        R
                  R                  U R                  R                  R                  5      (       d!  U R                  R                  R                  nOSnUc  [        5       nO$[        U[        5      (       a  U1nO[        U5      n[        U R                  R                  S5      (       a  UR                  S5        S[        R                  ;   a  UR                  S5        UR                  U R                  5        [        R                   " S5      n[#        UUU R$                  UU['        5       (       a+  [(        R*                  b  [(        R*                  R,                  OS[/        5       SUS	S
S9nUR1                  [        R
                  R3                  U R4                  R6                  S5      5        g)a  
Creates a draft of a model card using the information available to the `Trainer`.

Args:
    model_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the model.
    dataset_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the dataset used for training.
    tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
        Tags to be associated with the model card.
Nr  unsloth_versionunslothJOB_IDhf_jobsa          @article{guo2024direct,
            title        = {{Direct Language Model Alignment from Online AI Feedback}},
            author       = {Shangmin Guo and Biao Zhang and Tianlin Liu and Tianqi Liu and Misha Khalman and Felipe Llinares and Alexandre Ram{'{e}} and Thomas Mesnard and Yao Zhao and Bilal Piot and Johan Ferret and Mathieu Blondel},
            year         = 2024,
            eprint       = {arXiv:2402.04792}
        }z
Online DPOz7Direct Language Model Alignment from Online AI Feedbackz
2402.04792)rG  rx  r3  r{  r|  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zerorV   rS   r  r;   pathisdirr  r  r  r  r  r  r
  r  rB   dedentr.   r3  r4   wandbrunurlr/   savejoinrX   r   )rW   rx  r{  r|  rG  citation
model_cards          r\   ry  *_UnslothOnlineDPOTrainer.create_model_card  sn   " ))++4::$$o66rww}}TZZM^M^MlMl?m?m**88JJ <5Dc""6Dt9D4::$$&788HHYrzz!HHYDOO$ ?? $  )!!**%'9';';		@Ueiimm[_.0%%Q!

 	TYY%9%9;GHr_   )&r  r>  ra  r	  r_  r  r  r  r  r  r  r  rU  r  rW  r[  r  r  r   r  r\  r  r  r  ro  r  rX  rZ  rY  r^  rc  r  r  r  rf  rg  rd  rl  )NNNNNNNNNNNN)NNNNNr  )r   N)NNN)?r{  r|  r}  r~  r  r  r	   r    r9   r  r  r   r#   r  r   r   r   r   r   r  r!   r"   r   r   r&   r  rC   r$  	Optimizerlr_schedulerLambdaLRr   rx  propertyr  staticmethodr  r   r)  rH   r%   r;  r   rC  r  rX  rO  rP  rd  r  r  r  r  r  r  r  r  rX  ru  rk  ry  r  r  r  s   @r\   r  r  z  sn   CJ &J
 >BFJ-1*.04CGnrUYmq.2FJ59VbhlDHEI'W_bii45W "))T9:W uZj1A%ABC	W
 )*W 'W  -W  g&> ?@W uWotCwXgOgIhDh?i%ijkW #5)@.)P#QRW $,E2I4PgKh2h,i#jW l+W "(N+;T+A"BCW D12W %++//1I1I1R1RRSW  (0%,,9UW\WcWc9c0d'e!W$ u_bii%?@A%W& "**A!B'W( 
)W Wr   $ CZ _cdgildl_m    7''(Xj X )X. 7&&'-9sG|9L0M -9Yc -9 (-9^O ? _n ,H<F*P*:M*^ C : HT#Y<O  PTS%ell(:";;<	c49n	6TLl,\"J rv_EYY_E(,S%c8I2J-J(K_Eaijman_E	_EF hl(^V/ %)&*,0	?ISM?I sm?I CcD()	?I ?Ir_   r  c                   N   ^  \ rS rSrSr               SU 4S jjrSrU =r$ )UnslothOnlineDPOTraineri0  ap  
    
Initialize OnlineDPOTrainer.

Args:
    model (`Union[str, nn.Module, PreTrainedModel]`):
        Model to be trained. Can be either:

        - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
          path to a *directory* containing model weights saved using
          [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
          using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
          `args.model_init_kwargs`.
        - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
    ref_model (`transformers.PreTrainedModel` or `torch.nn.Module` or `None`):
        The reference model to use for training. If None is specified, the reference model will be created from the
        model.
    judge (`BasePairwiseJudge`):
        The judge to use for pairwise comparison of model completions.
    reward_funcs (`Union[RewardFunc, list[RewardFunc]]`, *optional*, defaults to `None`):
        Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
        functions with the prompts and completions and sum the rewards. Can be either:

        - A single reward function: Can be a string (path to model), a [`~transformers.PreTrainedModel`], or a
          custom callable function.
        - A list of reward functions: Must all be of compatible types.

        Note: Only one of `judge`, or `reward_funcs` should be provided.
    args (`OnlineDPOConfig`):
        The online DPO config arguments to use for training.
    data_collator (`transformers.DataCollator`):
        The data collator to use for training. If None is specified, the default data collator
        (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the
        sequences in the batch, given a dataset of paired sequences.
    train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
        The dataset to use for training.
    eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
        The dataset to use for evaluation.
    processing_class ([`~transformers.PreTrainedTokenizerBase`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`):
        Processing class used to process the data. If provided, will be used to automatically process the inputs
        for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
        reuse the fine-tuned model.
    reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
        Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:

        - A single processing class: Used when `reward_funcs` contains only one reward function.
        - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.

        If set to `None`, the tokenizer for each model-based reward function is automatically loaded using
        [`~transformers.AutoTokenizer.from_pretrained`].
    peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
        PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
    compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
        The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
        metric values.
    callbacks (`list[transformers.TrainerCallback]`):
        The callbacks to use for training.
    optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
        The optimizer and scheduler to use for training.
    preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
        The function to use to preprocess the logits before computing the metrics.

.. deprecated:: 0.22.0
    The following parameters are deprecated and will be removed in a future version:

    * `reward_model`: Use `reward_funcs` instead. For example, change `reward_model=model` to `reward_funcs=model`.
    * `reward_processing_class`: Use `reward_processing_classes` instead. For example, change
      `reward_processing_class=tokenizer` to `reward_processing_classes=tokenizer`.

    c           
      x  > Uc
  [        5       n[        USS5      n[        U5      [        La  Sn[        USS5      n[        U5      [        La  SnSn[        R
                  R                  SS5      S:H  nU(       d1  [        R
                  R                  SS5      S:X  a  [        S5        S	n[        R
                  R                  S
S5      n[        UR                  SS 5      =(       d    [        UR                  SS 5      nUc  UR                  5       R                  nSSKJn  U" U5      nU[        R                  :H  nU(       d  U(       a  U(       a  [        S5      eU(       d  U(       d  U(       a  [        S5      eU(       a"  SUl        SUl        S[        R
                  S'   OCU(       d<  U(       d5  US:X  a/  UUl        U(       + Ul        U(       a  SOS[        R
                  S'   [        USS 5      b-  [        USS5      S:X  a  SUl        [        USS 5      c  SUl        [        USS 5      nUb/  US:  a)  SSKJn  [-        U5      [-        S5      ::  a  [        S5        [        USS5      S:w  aL  [        USS5      nUS:X  a!  UR.                  U:  a  UR.                  Ul        [        US S 5      c
  Ub  UUl        [        US!S5      n[        U5      [        La  Sn[        US"S5      n[        U5      [        La  SnUR                   (       a  U(       a  SUl        S	Ul        UR"                  (       a  U(       a  S	Ul        SUl        U(       a  SUl        SUl        Oc[        R
                  R                  S
S5      S#:X  a  S	Ul        SUl        O0U(       d)  U(       d"  UR"                  Ul        UR                   Ul        Sn[9        5       R                  S$S 5      b  S	n[9        5       R                  S%S 5      b  S	nU(       a  S[        R
                  S&'   S'[9        5       ;  a  [;        US'5      (       d  OD[        US'S 5      n [        US'S 5      n!U!c'  U b$  UR<                  n"[;        US'5      (       a  U"Ul        Ub!  [;        US(5      (       a  UR?                  5         S)[9        5       ;   a   [;        [@        S*5      (       a  S+[@        l!        S,[9        5       ;   aU  [;        U	S*5      (       a  S+U	l!        [;        U	S)5      (       a,  [;        U	R@                  S*5      (       a  S+U	R@                  l!        S,[9        5       ;   a  U	O[@        n#SS-K"J#n$  [I        UU$5      (       dx  [I        U[J        5      (       a(  S.URL                  ;  a  [O        U#SS/[        US0S 5      S19nO[I        U[N        5      (       a%  S.URL                  ;   a  [K        U#[        US0S 5      S29nOJ[;        US35      (       a  SUl(        [;        US45      (       a  S5Ul)        [;        US65      (       a	  S7S	0Ul*        [I        UU$5      (       dx  [;        U#S85      (       dg  [;        U#S)5      (       aV  [I        U[J        5      (       a   [K        U#R@                  [        US0S 5      S29nO![O        U#R@                  SS/[        US0S 5      S19n/ n%SS9K+J,n&  U&" S:U%5        [        US;S 5      [Z        R\                  :X  a(  UR^                  S:  a  [        US<S5      S:w  a  SUl0        S=[9        5       ;   a!  [;        US(5      (       a  UR?                  5         [b        T)U ]  " SN0 S=U_S>U_S?U_S@U_SAU_SBU_SCU_SU_S,U	_SDU
_SEU_S$U_SFU_S%U_SGU_SHU_UD6  S=[9        5       ;   a!  [;        USI5      (       a  URg                  5         [;        U SJ5      (       a-  U Rh                  Rk                  5         [;        U SJ5      (       a  U ?4[        USKS 5      b  U Rl                  UR                  5       l6         [;        U SL5      (       aV  U Rn                  Rp                  n'Un([;        U(S=5      (       a&  U'U(l9        U(Rt                  n([;        U(S=5      (       a  M&  U'U(l9         [;        U SM5      (       a.  [w        [y        U Rz                  R|                  5      U 5      U l>        g )ONr  Fr  UNSLOTH_ENABLE_FULL_FINETUNING01UNSLOTH_FORCE_FLOAT32zKUnsloth: Switching to float32 training since model cannot work with float16TUNSLOTH_MIXED_PRECISIONru   r  torch_dtyper   )
_get_dtypezuUnsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`zuUnsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`r   ACCELERATE_MIXED_PRECISIONr  r   r   r  r   r   ro   )__version__z4.45.2z**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!
`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`r      r   r  r  r  r  r  UNSLOTH_RETURN_LOGITSr   rT   r  rb  r  r  )UnslothVisionDataCollatorlabelsr   pad_to_multiple_of)mlmmlm_probabilityr  )r  r  dataset_text_fieldr   dataset_kwargsskip_prepare_datasetr<   )PatchRLStatisticsonline_dpo_trainerparallel_mode_n_gpurS   r  r  rU  rX   r  r  r  r  r  r  r  rU   neftune_hook_handlerL  r  r  r   )?r   r  r  r  r;   r  r  rr  r  get_input_embeddingsr  unsloth_zoo.utilsr  rC   r  r  r  r  r   r  transformersr  rL   r   r   r   r  r  localsrV   r   rT   r  rb  unsloth_zoo.vision_utilsr  r  rM   column_names+TransformersDataCollatorForLanguageModelingr  r  r  unsloth_zoo.logging_utilsr  rO   NOT_DISTRIBUTEDr%  r  rw  rx  rU   r  removerL  r  scaleraccelerator_scalerrS   rP   ra   ry  r  )*rW   rS   r  r  rU  rX   r  r  r  r  r  r  r  r  r  r  r  rY   use_bf16use_fp16force_float32full_finetuningmixed_precision_dtyper  r  r  ga_stepstransformers_versioneval_bszr  r  _output_logitsmodel_max_seq_lengthargs_max_seq_lengthr   #_UnslothOnlineDPOTrainer__tokenizerr  other_metricsr  r  current_modelry  s*                                            r\   rx   UnslothOnlineDPOTrainer.__init__w  s>   ( < 6 84/>%%x4/>%%x**..)I3OSVVBJJNN3JC$PTW$W_` M "

/H) Tgt4bm]a8b=%"<"<">"D"D%05!5==('hy  JA  @B  :Bg(9  NE  DF  >FDIDI7;BJJ3481F)1SDI#DIAHvfBJJ344.:wt_^b?cgk?k!(Dt\408C$/4!>EHqLH+,0AA @ A4$/47t%A1EH1}!A!AH!Lpt  qQ  qQdNmt6=E(J^  @H`d`| '7?t+e^ '7?t+e^99u)<\`dFY99t)<[`TEX"'D"'DZZ^^5yAZO"&D"'D"&))D"&))D8<<)40<tn8<<7>J]aN25BJJ./68+GDBR4S4S#*52BD#I #*42BD#I"*/C/O!&!5!54!122.D4G!?!? &("wy.'I'Idk9Ka)'88Za:J:W'55'BRB\B\^l:m:m  Zao  pJ  pJ  pW*<*H&iF-)BCC-)?@@XUbUoUoEo K&))07KT)R	! M+VWW\dhu  iC  iC  ]C 6)07KT)R!
 t455TYt7Qt122bD4Kt-..G]_cFd0C-)BCC;..7;3T3Tm-CDD$:#---4T;OQU-V%M
 %P#--#*--4T;OQU-V	%M ?.> 4$/<3O3OOTXT^T^abTbtXq)Q.fh75.#A#A  	H	H!	H (	H 		H
 	H *	H *	H (	H  0	H )B	H &	H .	H "	H -J	H (	H  '>!	H" fh75/#B#B!4.//$$++-t2339Q4.5A?C?W?WE&&(<4''%%,,F!M-11390 - 3 3 -11 06M,4!!#$=dnn>R>R$SUYZDJr_   )r  )NNNNNNNNNNNNNNN)r{  r|  r}  r~  r  rx  r  r  r  s   @r\   r  r  0  sE    ER $((,"&#o or_   r  	addFilterc                        \ rS rSrS rS rSrg)HideLoggingMessagei-	  c                     Xl         g r  r_  )rW   r_  s     r\   rx  HideLoggingMessage.__init__.	  s    d)r_   c                 <    U R                   UR                  5       ;  $ r  )r_  
getMessage)rW   r  s     r\   filterHideLoggingMessage.filter/	  s    alln)DEr_   r  N)r{  r|  r}  r~  rx  r  r  r   r_   r\   r  r  -	  s    2Er_   r  z`use_cache=True`)kr  rC   r   torch.nnr9   r   r   typingr   r   r   r   r	   r
   r   r   trl.trainer.online_dpo_trainerr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r:   r;   r<   r=   r>   r?   r@   rA   rB   rD   rE   rF   rG   rH   dataclassesrJ   rK   packaging.versionrL   numpynp
contextlibr  rM   rN   r  transformers.training_argsrO   r`   typesrP   ra   torch_compile_optionscompiler   r  r   r   r   r   r   r  r  rV   Filterr  r  r   r_   r\   <module>r     sC  0    $ I I I h  h  h  h  h  h  h  h  h  h  h  h  h  h  h  h  h  h  h  h  h  h  h  h 
  ( %   " $  3      4;PR S"||  \\	&,, %  	
 \\6ell C ELL 
 c-_ c- c-H uIw uIl-v6 vp  6;FW^^ F 	
'(:;<  r_   