
    f:i#             
       v   S r SSKJr  SSKrSSKJr  SSKJr  SSKJrJ	r	J
r
JrJrJrJrJr  SSKJrJrJrJrJrJrJrJrJrJrJrJrJrJ
r
JrJrJ r J!r!J"r"J#r#J$r$J%r%J&r&J'r'J(r(J)r)JrJ*r*J+r+J,r,J-r-J.r.J/r/J0r0J1r1J2r2J3r3J4r4J5r5J6r6J7r7J8r8J9r9J:r:J;r;J<r<J=r=J>r>J?r?J@r@JArAJrJBrBJCrCJDrDJErEJFrFJGrGJHrHJIrIJJrJJKrKJrJLrLJMrMJ
r
J"r"J'r'J;r;JDrDJr  SSKDrDSSK7  SSKNJOrOJPrP  SS	KQJRrR  SSKrSSKSrBSS
KTJCrC  SSKJr  SSKUJVrVJWrX  SSKYJZrZ  SSK[r[SSK\J]r]  S r^ SSSSSS.r_\R                  " SS\_S9S 5       raS\R                  S\bS\bS\R                  4S jrcS\R                  S\R                  S\bS\bS\R                  4
S jrdS\R                  S\bS\R                  4S jre\O " S  S!\5      5       rf  " S" S#\'5      rg " S$ S%\g5      rhg)&z;
2025.10.10
2025.10.9
4.56.2
0.23.0
__UNSLOTH_VERSIONING__
    )TensorN)
functional)AnyListOptionalTupleUnionDictSetCallable)GAcceleratorBaseImageProcessorCallbackHandlerDEFAULT_CALLBACKSDEFAULT_PROGRESS_CALLBACKDataCollatorWithPadding
DataLoaderDatasetExportableStateFeatureExtractionMixinGenerationConfigINVALID_LOGPROBOnlineTrainerStater   	PPOConfig
PPOTrainerPath
PeftConfig	PeftModelPolicyAndValueWrapperPreTrainedTokenizerBasePrinterCallbackProcessorMixinTrainerTrainerCallbackTrainerControlr	   batch_generation	broadcastcontextmanagercreate_reference_modeldefaultdictdisable_dropout_in_modelempty_cache	exact_divfirst_true_indicesforwardgather_objectgcgenerate_model_cardget_comet_experiment_urlget_peft_model#get_reporting_integration_callbacks
get_rewardis_peft_availableis_rich_availableis_wandb_availablelog_table_to_comet_experimentmasked_meanmasked_whitenmathnnnpnullcontextospdpeft_module_casting_to_bf16prepare_deepspeedprint_rich_tableselective_log_softmaxtextwraptimetorchtruncate_responseunwrap_model_for_generationr   r   r#   r7   rA   rI   )*)	dataclassfield)Version)r@   )DataCollatorForSeq2SeqDataCollatorForLanguageModeling)ParallelMode)
MethodTypec                 F   ^  [         R                  " T 5      U 4S j5       nU$ )Nc                 8  > [        U S5      (       a5  [        U R                  S5      (       a  U R                  R                  5         T" U /UQ70 UD6n[        U S5      (       a5  [        U R                  S5      (       a  U R                  R                  5         U$ )Nmodelfor_trainingfor_inference)hasattrrV   rW   rX   )selfargskwargsoutputfs       >/home/james-whalen/unsloth_compiled_cache/UnslothPPOTrainer.pywrapper*prepare_for_training_mode.<locals>.wrapper0   sx     4!!gdjj.&I&IJJ##%4)$)&)4!!gdjj/&J&JJJ$$&    )	functoolswraps)r^   r`   s   ` r_   prepare_for_training_modere   /   s%    __Q  Nrb   TF)epilogue_fusionmax_autotuneshape_paddingztrace.enabledztriton.cudagraphs)dynamic	fullgraphoptionsc                 d   [         R                  " U R                  SU R                  S   5      SSS9n[         R                  " UR                  S5      SSS9n/ n[	        X#5       H  u  pVUR                  [         R                  5      n[         R                  " USUR                  S5      S9R                  S5      n[         R                  " USS9nXx-
  n	UR                  U	5        M      [         R                  " U5      nUR                  U R                  S   U R                  S   45      nU$ )N   r   )chunksdim)rp   indexrp      )rI   chunkreshapeshapeziptofloat32gather	unsqueezesqueeze	logsumexpappendconcat)
logitsrq   chunked_logitschunked_indexall_per_token_logpschunk_logitschunk_indexselected_logitslogsumexp_valuesper_token_logpss
             r_   chunked_selective_log_softmaxr   E   s    [[FLL4D!EPQYZ[N[[r!2QaHM%(%G!#u}}5,,|2{G\G\]_G`aiijlm ??<rB)<""?3 &H 	,,':;-55v||AUV6XYrb   	input_idslogits_to_keeppad_token_idreturnc                 ~    XR                   S   :  a  [        S5      eU SS2SU* 24   nX2:H  nUR                  SS9nU$ )zr
Given prompt tensor, it returns all the left padded tokens in that sequence. so [pad, pad, pad, cat] = 3 tokens 
rs   z8logits_to_keep must be smaller than the sequence length.Nrr   )rv   
ValueErrorsum)r   r   r   prompt_sectionpadding_maskpad_token_countss         r_   calculate_pad_tokens_in_promptr   W   sX     ++STTq"2N?"223N"2L#''A'.rb   completion_input_idsleft_pad_tokens_per_promptmax_left_padc                     U R                   u  pEU R                  nX!-
  n[        R                  " XVS9R	                  S5      nXR	                  S5      :  n	X:g  n
X-  nU$ )a)  
Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad]

Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens
and pad are pad tokens, this function would make a completion mask that would 0 out the pad
and p tokens. so in this example [0,0,0,1,1,1,0,0,0]
devicer   rs   )rv   r   rI   aranger{   )r   r   r   r   
batch_sizecompletion_lenr   num_tokens_to_maskindices
shift_masknon_padding_mask
final_masks               r_    create_completion_attention_maskr   j   si     "6!;!;J!((F%Bll>9CCAFG88;;J,<.Jrb   tensorpad_idc                 l    X:g  n[         R                  " USSSS9n[         R                  " U SU5      nU$ )zD
Moves all padding tokens in each sequence of a batch to the right.
rs   T)rp   
descendingstable)rI   argsortrz   )r   r   masksorted_indicespacked_tensors        r_   left_pack_paddingr      s8     D]]4Q4MNLLN;Mrb   c                     ^  \ rS rSr% Sr\" SSS0S9r\\   \	S'   \" SSS	0S9r
\\   \	S
'                                                                                                                                                                      SU 4S jjrSrU =r$ )UnslothPPOConfig   aV
  
    
Configuration class for the [`PPOTrainer`].

This class includes only the parameters that are specific to PPO training. For a full list of training arguments,
please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default
values in this class may differ from those in [`~transformers.TrainingArguments`].

Using [`~transformers.HfArgumentParser`] we can turn this class into
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
command line.

Parameters:
    exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[:-3]`):
        Name of this experiment.
    reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
        Path to the reward model.
    model_adapter_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the train target PEFT adapter, when using LoRA with multiple adapters.
    ref_adapter_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the reference PEFT adapter, when using LoRA with multiple adapters.
    num_ppo_epochs (`int`, *optional*, defaults to `4`):
        Number of epochs to train.
    whiten_rewards (`bool`, *optional*, defaults to `False`):
        Whether to whiten the rewards.
    kl_coef (`float`, *optional*, defaults to `0.05`):
        KL coefficient.
    kl_estimator (`Literal["k1", "k3"]`, *optional*, defaults to `"k1"`):
        Which estimator for KL-Divergence to use from [Approximating KL
        Divergence](http://joschu.net/blog/kl-approx.html). Defaults to "k1", a straightforward, unbiased
        estimator. Can be set to "k3", an unbiased estimator with lower variance which "appears to be a strictly
        better estimator". Cannot be set to "k2", as it is used for logging purposes.
    cliprange (`float`, *optional*, defaults to `0.2`):
        Clip range.
    vf_coef (`float`, *optional*, defaults to `0.1`):
        Value function coefficient.
    cliprange_value (`float`, *optional*, defaults to `0.2`):
        Clip range for the value function.
    gamma (`float`, *optional*, defaults to `1.0`):
        Discount factor.
    lam (`float`, *optional*, defaults to `0.95`):
        Lambda value for GAE.
    ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
        This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
        improving generation speed. However, disabling this option allows training models that exceed the VRAM
        capacity of a single GPU, albeit at the cost of slower generation.

    NhelpzvLLM SamplingParams)defaultmetadatavllm_sampling_paramsrm   z8Chunk size to reduce memory usage. -1 is most efficient.unsloth_num_chunksc                   > US:  a  [        SU S35        US:  a  [        SU S35        Uc  U#S:X  a
  U$S:X  a  SnS	n#Wc$  S
SKJn  [        [	        U" 5       S-   S5      S5      nWS
::  a  [        S5      eWS:  a  [        S5      e[        TU ]  " S0 SU_SU_SU_SU_SU_SU_SU_SU_SU	_SU
_SU_SU_SU_SU_S U_S!U_S"U_S#U_S$U_S%U_S&U_S'U_S(U_S)U_S*U_S+U_S,U_S-U_S.U_S/U_S0U_S1U _S2U!_S3U"_S4U#_S5U$_S6U%_S7U&_S8U'_S9U(_S:U)_S;U*_S<U+_S=U,_S>U-_S?U._S@U/_SAU0_SBU1_SCU2_SDU3_SEU4_SFU5_SGU6_SHU7_SIU8_SJU9_SKU:_SLU;_SMU<_SNU=_SOU>_SPU?_SQW@_SRWA_SSWB_STWC_SUWD_SVWE_SWWF_SXWG_SYWH_SZWI_S[WJ_S\WK_S]WL_S^WM_S_WN_S`WO_SaWP_SbWQ_ScWR_SdWS_SeWT_SfWU_SgWV_ShWW_SiWX_SjWY_SkWZ_SlW[_SmW\_SnW]_SoW^_SpW__SqW`_SrWa_SsWb_StWc_SuWd_SvWe_SwWf_SxWg_SyWh_SzWi_S{Wj_S|Wk_S}Wl_S~Wm_SWn_SWo_SWp_SWq_SWr_SWs_SWt_SWu_SWv_SWw_SWx_SWy_SWz_SW{_SW|_SW}_SW~_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_WD6  WU l        WU l	        g )NHz>z Unsloth: Your learning rate of `zi` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!rs   za` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!steps  unsloth_training_checkpointsnor   )	cpu_countrn      @   zUUnsloth: Please set a positive non-zero temperature since your results will be wrong.
   zgUnsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.
output_diroverwrite_output_dirdo_traindo_eval
do_predicteval_strategyprediction_loss_onlyper_device_train_batch_sizeper_device_eval_batch_sizeper_gpu_train_batch_sizeper_gpu_eval_batch_sizegradient_accumulation_stepseval_accumulation_steps
eval_delaytorch_empty_cache_stepslearning_rateweight_decay
adam_beta1
adam_beta2adam_epsilonmax_grad_normnum_train_epochs	max_stepslr_scheduler_typewarmup_ratiowarmup_steps	log_levellog_level_replicalog_on_each_nodelogging_dirlogging_strategylogging_first_steplogging_stepslogging_nan_inf_filtersave_strategy
save_stepssave_total_limitsave_safetensorssave_on_each_nodesave_only_model'restore_callback_states_from_checkpointno_cudause_cpuuse_mps_deviceseed	data_seedjit_mode_evaluse_ipexbf16fp16fp16_opt_levelhalf_precision_backendbf16_full_evalfp16_full_evaltf32
local_rankddp_backendtpu_num_corestpu_metrics_debugdebugdataloader_drop_last
eval_stepsdataloader_num_workersdataloader_prefetch_factor
past_indexrun_namedisable_tqdmremove_unused_columnslabel_namesload_best_model_at_endmetric_for_best_modelgreater_is_betterignore_data_skipfsdpfsdp_min_num_paramsfsdp_config"fsdp_transformer_layer_cls_to_wrapaccelerator_configparallelism_config	deepspeedlabel_smoothing_factoroptim
optim_args	adafactorgroup_by_lengthlength_column_name	report_toddp_find_unused_parametersddp_bucket_cap_mbddp_broadcast_buffersdataloader_pin_memorydataloader_persistent_workersskip_memory_metricsuse_legacy_prediction_looppush_to_hubresume_from_checkpointhub_model_idhub_strategy	hub_tokenhub_private_repohub_always_pushhub_revisiongradient_checkpointinggradient_checkpointing_kwargsinclude_inputs_for_metricseval_do_concat_batchesfp16_backendpush_to_hub_model_idpush_to_hub_organizationpush_to_hub_tokenmp_parametersauto_find_batch_sizefull_determinismtorchdynamo	ray_scopeddp_timeouttorch_compiletorch_compile_backendtorch_compile_modeinclude_tokens_per_secondinclude_num_input_tokens_seenneftune_noise_alphaoptim_target_modulesbatch_eval_metricseval_on_startuse_liger_kernelliger_kernel_configeval_use_gather_objectaverage_tokens_across_devicesdataset_num_procnum_mini_batchestotal_episodes local_rollout_forward_batch_sizenum_sample_generationsresponse_length
stop_tokenstop_token_idtemperaturemissing_eos_penaltysft_model_path
world_sizenum_total_batchesmicro_batch_sizelocal_batch_sizer   local_mini_batch_sizemini_batch_sizeexp_namereward_model_pathmodel_adapter_nameref_adapter_namenum_ppo_epochswhiten_rewardskl_coefkl_estimator	cliprangevf_coefcliprange_valuegammalamds3_gather_for_generation )
printmultiprocessingr   minmax	MathErrorsuper__init__r   r   )rZ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r	  r
  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r!  r"  r#  r$  r%  r&  r'  r(  r)  r*  r+  r,  r-  r.  r/  r0  r1  r2  r3  r4  r5  r6  r7  r8  r9  r:  r;  r<  r=  r>  r?  r@  rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  rK  rL  rM  rN  rO  r   rP  rQ  rR  rS  rT  rU  rV  rW  rX  rY  rZ  r[  r\  r]  r^  r_  r   r   r\   r   	__class__s                                                                                                                                                                         r_   rg  UnslothPPOConfig.__init__   s	   P 4)I-  YB  (C  "D1e&F}o  Vw  %x  y-7":zS?P7J M#1"3y{1}a#8"=!sttB  F  G  G 	 a	L#a	L#7a	L  a	L 	a	L
 $a	L *a	L $8a	L +Fa	L *Da	L (@a	L '>a	L +Fa	L '>a	L $a	L '>a	L  *!a	L" (#a	L$ $%a	L& $'a	L( ()a	L* *+a	L,  0-a	L. "/a	L0 !21a	L2 (3a	L4 (5a	L6 "7a	L8 !29a	L:  0;a	L< &=a	L>  0?a	L@ "4Aa	LB *Ca	LD &<Ea	LF *Ga	LH $Ia	LJ  0Ka	LL  0Ma	LN !2Oa	LP .Qa	LR 7^Sa	LT Ua	LV Wa	LX ,Ya	LZ [a	L\ "]a	L^ *_a	L`  aa	Lb ca	Ld ea	Lf ,ga	Lh &<ia	Lj ,ka	Ll ,ma	Ln oa	Lp $qa	Lr &sa	Lt *ua	Lv !2wa	Lx ya	Lz $8{a	L| $}a	L~ &<a	L@ *DAa	LB $Ca	LD  Ea	LF (Ga	LH %:Ia	LJ &Ka	LL &<Ma	LN %:Oa	LP !2Qa	LR  0Sa	LT Ua	LV #6Wa	LX &Ya	LZ 2T[a	L\ "4]a	L^ "4_a	L` "aa	Lb &<ca	Ld ea	Lf $ga	Lh "ia	Lj .ka	Ll "4ma	Ln "oa	Lp *Dqa	Lr !2sa	Lt %:ua	Lv %:wa	Lx -Jya	Lz #6{a	L| *D}a	L~ &a	L@ &<Aa	LB (Ca	LD (Ea	LF "Ga	LH  0Ia	LJ .Ka	LL (Ma	LN &<Oa	LP -JQa	LR *DSa	LT &<Ua	LV (Wa	LX $8Ya	LZ (@[a	L\ !2]a	L^ *_a	L` $8aa	Lb  0ca	Ld &ea	Lf "ga	Lh &ia	Lj *ka	Ll %:ma	Ln "4oa	Lp )Bqa	Lr -Jsa	Lt #6ua	Lv $8wa	Lx "4ya	Lz *{a	L|  0}a	L~ #6a	L@ &<Aa	LB -JCa	LD  0Ea	LF  0Ga	LH ,Ia	LJ 0PKa	LL &<Ma	LN .Oa	LP $Qa	LR *Sa	LT &Ua	LV #6Wa	LX ,Ya	LZ $[a	L\ !2]a	L^  0_a	L`  0aa	Lb $ca	Ld %:ea	Lf .ga	Lh  ia	Lj !2ka	Ll "4ma	Ln  0oa	Lp ,qa	Lr ,sa	Lt ua	Lv (wa	Lx "ya	Lz {a	L| .}a	L~ a	L@ Aa	LB )BFCa	LD %9!"4rb   )r   r   )NNFFFr   Frn   rn   NNr   r   r      g-C6
?g{Gz?g?g+?g:0yE>      ?g      @rm   linear皙?r   passivewarningTNr   Frs   Fr   r   NTFFFFFFO  rp  FFFFO1autoFFNrm   NNF FNr   Nrm   NNTNFNNFrs  r   NNNNN        
adamw_8bitNFFlengthNNNNTFTFFNN
every_saveNNFNTNFTrr  NNNrs  FFNlasti  FNNFFNNFFFNFTNrs   Nr   r   5   NNgffffff?NEleutherAI/pythia-160mNNNNNNN
ppo_configrz  NNrn   Fg?k1皙?rm  r}  rk  gffffff?TNrm   )__name__
__module____qualname____firstlineno____doc__rN   r   r   r   __annotations__r   intrg  __static_attributes____classcell__rh  s   @r_   r   r      sI   /` +012+(3-  */VW*#  #$&'%&#'"&&'"#"%$%""!&!27!'!$!"%) $!& $  -1!!!$%%)  $ $(-"%*!%#!%(,%*!%##' $  $!$)(-"#" "!&(,+-!#"1  $4!$(#IY5 Y5rb   r   c                   .  ^  \ rS rSrSrSS/r     S$S\S\\\	\
\\4      S\R                  S	\\R                     S
\R                  S\S\R                  S\\   S\\\\\\4   4      S\\R*                  R,                  \R*                  R.                  R0                  4   S\\\      S\S   SS4S jjrS\4S jrS\4S jr\S 5       r S%S\\   S\!4U 4S jjjr"S r#S&S\!4S jjr$U 4S jr%   S'S\\   S \\   S!\\\\   S4   4S" jjr&S#r'U =r($ )(_UnslothPPOTraineri&  a  Trainer for Proximal Policy Optimization (PPO).

For details on PPO, see the paper: [Proximal Policy Optimization
Algorithms](https://huggingface.co/papers/1707.06347).

Args:
    args ([`PPOConfig`]):
        Training arguments.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`]):
        Class to process the data.
    model (`torch.nn.Module`):
        Model to be trained. This is the policy model.
    ref_model (`torch.nn.Module`, *optional*):
        Reference model used to compute the KL divergence. If `None`, a copy of the policy model is created.
    reward_model (`torch.nn.Module`):
        Reward model used to compute the rewards.
    train_dataset ([`~datasets.Dataset`]):
        Dataset for training.
    value_model (`torch.nn.Module`):
        Value model used to predict the value of a state.
    data_collator ([`~transformers.DataCollatorWithPadding`], *optional*):
        Data collator to batch and pad samples from the dataset. If `None`, a default data collator is created
        using the `processing_class`.
    eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*):
        Dataset for evaluation.
    optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`):
        Tuple containing the optimizer and the learning rate scheduler to use for training. If `None`, the
        optimizer and the learning rate scheduler are created using the
        [`~transformers.Trainer.create_optimizer_and_scheduler`] method.
    callbacks (`list` of [`~transformers.TrainerCallback`], *optional*):
        Callbacks to use during training.
    peft_config ([`~peft.config.PeftConfig`], *optional*):
        PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the policy `model`
        will be wrapped with the specified PEFT adapter.
trlppoNr[   processing_classrV   	ref_modelreward_modeltrain_datasetvalue_modeldata_collatoreval_dataset
optimizers	callbackspeft_configr   r   c                    XCL a  [        S5      eXl        X l        X0l        Uc  [	        U R                  5      nUR
                  (       a  UR                  (       a  [        S5      eUR
                  (       aV  UR
                  S:X  a-  UR                  =U R                  R                  l        U l        OE[        SUR
                   S35      eUR                  =U R                  R                  l        U l        U R                  R                  S;  a  [        S5      e[        5       (       d  Ub  [        S5      e[        5       (       a  Ub  [        U R                  [        5      (       a  U R                  R                  5       U l        [        U R                  U5      U l        UR                   (       a1  [#        U R                  S	S
5      (       a  [%        U R                  5        [        5       =(       a    [        U R                  [        5      U l        UR(                  U l        UR*                  U l        U(       a  X@l        O3U R&                  (       a  S U l        O[/        U R                  5      U l        XPl        X`l        [5        U5      U l        Xpl        Xl        Xl        U
u  U l        U l         S U l!        URD                  c'  [G        URH                  U R6                  -  5      Ul"        [K        URL                  S9nXl'        URP                  Ul)        URT                  URL                  -  Ul+        [G        URT                  URR                  -  5      Ul,        [G        URV                  URR                  -  5      Ul-        []        URZ                  UR^                  S5      Ul0        []        URV                  UR^                  S5      Ul1        URd                  (       a%  URb                  S:  d   SURb                   S35       e[f        Rh                  " URD                  URZ                  -  5      Ul5        [l        Rn                  " [G        [p        Rp                  " 5       5      URr                  S9n[u        US5      Rw                  5       nURx                   SURz                   SU 3Ul>        URz                  UR~                  S-  -   U l@        UR                  S:  a(  [        SURj                  UR                  -  5      U lC        URV                  U lD        U R                  U R,                  U R8                  U R0                  4 H  nUc  M  [        U5        M     [        U R                  U R8                  5      U lG        U R                  R                  U R                  lH        U R                  URj                  S9  [        [        U R                  R                  5      -   nUc  UOUU-   U lM        [        U R                  U R                  U R                  U R>                  U R@                  5      U lO        U R                  U R                  R                  (       a  [        O[        5        [        5       U lU        [        U R                  5       U R                  5       U R                  R                  U R                  /-    Vs/ s H  n[        U[        5      (       d  M  UPM     snS9U lZ        SU l[        S U l\        [#        U RN                  R                  SS 5      S LU l]        [#        U RN                  R                  SS 5      S LU l^        S U l_        U R                  R                  (       a  U R                  5         U R                  R                  (       a)  [        R                  " U R                  R                  SS9  [        U R                  S5      (       a%  U R                  R                  U R                  5        [        U R2                  U R                  SU R:                  SS9U lj        [l        R                  " URz                  5        UR                  U R                  U R>                  U R                  5      u  U lG        U l        U lj        [l        R                  " U R                  5        [        U R<                  UR                  U R:                  SS9U ln        UR                  U R                  5      U ln        U R                  (       a  [        U R0                  URT                  UR                  UR                   5      U l        U R,                  c  U R&                  (       d  [        S5      eg [        U R,                  URT                  UR                  UR                   5      U l        g U R,                  c  U R&                  (       d  [        S5      eO4U R,                  R                  U RN                  Rr                  5      U l        U R0                  R                  U RN                  Rr                  5      U l        g s  snf ) Nz`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the same as `model`, you must make a copy of it, or `None` if you use peft.z5You cannot set both `stop_token` and `stop_token_id`.eoszUnknown `stop_token` z9. Allowed values are: `'eos'` and `None` (no stop token).>   r|  k3zkl_estimator must be either 'k1' (straightforward, unbiased) or 'k3' (lower variance, unbiased, appears to be a strictly better estimator). See [Approximating KL Divergence](http://joschu.net/blog/kl-approx.html) for details.zvPEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT modelsis_loaded_in_4bitF)r   z5`batch_size` must be a multiple of `num_mini_batches`z;`local_batch_size` must be a multiple of `num_mini_batches`   zPer-rank minibatch size z is insufficient for whiteningr   r   __i rs   )num_training_steps)is_local_process_zerois_world_process_zerostateful_callbacksdeepspeed_pluginfsdp_pluginT)exist_okadd_model_tags)r   shuffle
collate_fn	drop_last)r   r  r  z1No reference model and model is not a Peft model.)rr   r[   r  policy_modelr   rG  rH  eos_token_idgeneration_configrY  r7   ImportError
isinstancer   merge_and_unloadr4   r   getattrrC   is_peft_modelrT  rU  r  r)   r  r  lentrain_dataset_lenr  r  r  	optimizerlr_scheduleroptimizer_cls_and_kwargsrC  r  r   r   r   acceleratornum_processesrL  r   rO  rN  r   r-   rB  rQ  rP  rW  r=   ceilrM  rI   r   rH   r   r'   itemrR  r   r  process_index
local_seedrE  rd  sample_generations_freqlocal_dataloader_batch_sizer+   r   rV   configcreate_optimizer_and_schedulerr   r5   r  r  r   callback_handleradd_callbackr  r!   r   r%   controlr   r  r  r   statecurrent_floshp_search_backendis_deepspeed_enabledis_fsdp_enabledr   r  init_hf_reposhould_saverA   makedirsr   rY   r  
_tag_namesr   
dataloadermanual_seedpreparer   eval_dataloaderrD   r   rx   )rZ   r[   r  rV   r  r  r  r  r  r  r  r  r  r  time_tensortime_intmoduledefault_callbackscbs                      r_   rg  _UnslothPPOTrainer.__init__M  sF   $ Z 
 	 0!  3D4I4IJM ??t11TUU__%'XhXuXuu!!33@4CU +DOO+<<uv  UYTfTffD//<t?Q 99!!5d  !""{'> I    [%<$++Y77$($5$5$F$F$H! !/t/@/@+ NDyyWT%6%68KUSS+D,=,=>.0]Z@Q@QS\5]"&"9"9 $ 5 5&N!DN3D4E4EFDN(*!$]!3&*(,6))(,%
 &"%d&;&;d>T>T&T"UD!d>^>^_&%33 $ @ @4CcCc c #D$D$Dt$V Wd33dooEF(OOT224k 
 &/!!4#8#8:w&
" --2 *4+E+E*FFde2
 "&$//1"
 ll3tyy{#3K<N<NO[!,113==/DII;b
C))k&?&?&&HH&&*+.q$2H2HDLgLg2g+hD(+/+@+@(
 (($..$:J:JDL]L]^F!(0 _ +4+<+<d>N>NO
 --44

++#55 	, 	
 .0STXT]T]TgTg0hh.7.?*EVYbEb /NNDJJ(=(=t~~tO`O`!
 	TYY-C-C/Ibc%''"&"<"<">"&"<"<">!22<<~M MrQ[\^`oQpM 

 !%$+D,<,<,B,BDVX\$]ei$i!&t'7'7'='=}dS[__ 99  99  KK		,,t< 4::/00JJ%%doo6
 %77))
 	$))$6A6I6I$**VZVdVdfjfufu6v3
DNDO$//*)66))	 
  +2243G3GH$$ 1!!4#C#CTYYPTPYPY!D ~~%))$%XYY * "3NND$D$DdiiQUQZQZ" ~~%))$%XYY * "&!2!243C3C3J3J!K $ 1 1 4 4T5E5E5L5L MD{ s   i.ic                     U R                   $ Nr  rZ   s    r_   get_train_dataloader'_UnslothPPOTrainer.get_train_dataloader  s    rb   c                     U R                   $ r  )r  r  s    r_   get_eval_dataloader&_UnslothPPOTrainer.get_eval_dataloader"  s    ###rb   c              #   ,  #    U R                   (       aN  U R                  (       d=  U R                  R                  U R                  R
                  5      R                  5       O	[        5          U R                  (       a/  U R                  R
                  R                  U R                  5        Sv   U R                  (       a8  U R                  R
                  R                  U R                  =(       d    S5        SSS5        g! , (       d  f       g= f7f)zWContext manager for handling null reference model (that is, peft adapter manipulation).Nr   )
r  rU  r  unwrap_modelrV   policydisable_adapterr@   set_adapterrT  r  s    r_   null_ref_context#_UnslothPPOTrainer.null_ref_context%  s     
 !!$*?*? ))$***;*;<LLN $$

!!--d.C.CD$$

!!--d.E.E.RS  s   A*D,BD:	D
DDr   _internal_callc                   > U R                   nU R                   R                  U l         U R                  (       a  U R                  nU R                   U l        [        TU ]  X5        X0l         U R                  (       a  WU l        g g r  )rV   r  r  r  rf  
save_model)rZ   r   r  backup_modelbackup_deepspeedrh  s        r_   r  _UnslothPPOTrainer.save_model3  sb    zzZZ&&
$$#~~!ZZDN:6!
$$-DN %rb   c                 ,  ^r U R                   nU R                  nU R                  nU R                  nU R                  nU R
                  nU R                  nU R                  mrUR                  nUr4S jn	[        U	" 5       5      n
[        UR                  UR                  S-   SSSS9nUR                  S5        [        R                  " 5       nUR                  UR                   UR"                  4n[$        R&                  " XS9n[$        R&                  " XS9n[$        R&                  " XS9n[$        R&                  " XS9n[$        R&                  " XS9n[$        R&                  " XS9n[$        R&                  " XS9nUR)                  5         S	U R*                  l        S	U R*                  l        UR0                  U R*                  l        UR4                  U R6                  -  U R*                  l        UR:                  br  UR:                  S
:  aG  [<        R>                  " U R*                  R2                  UR:                  -  5      U R*                  l        OUR:                  U R*                  l        UR@                  br  UR@                  S
:  aG  [<        R>                  " U R*                  R2                  UR@                  -  5      U R*                  l         OUR@                  U R*                  l         URB                  br  URB                  S
:  aG  [<        R>                  " U R*                  R2                  URB                  -  5      U R*                  l!        OURB                  U R*                  l!        U RD                  RG                  XR*                  U RH                  5      U l$        U RJ                  (       a"  U R                  U l&        U R                  U l'        [Q        S
UR0                  S
-   5       GH]  nU R*                  =R.                  S
URR                  -  -  sl        [U        U
5      n[$        RV                  " 5          US   RY                  U5      nURZ                  S
   n/ n/ n/ n/ n/ n/ n/ n[]        U R                  U R                  U R                   R^                  S9 n [a        U Rb                  UURd                  URf                  U5      u  n!n"S S S 5        [Q        S	URZ                  S	   URd                  5       GH>  n#UU#U#URd                  -    n$W!U#U#URd                  -    n%U%S S 2US 24   n&W"U#U#URd                  -    n'[i        U'U&5      n(A'[k        5         Uc;  U Rm                  5          [o        URb                  U%URf                  5      n)S S S 5        O[o        UU%URf                  5      n)W)Rp                  S S 2US
-
  S24   n*U*UR                  S-   -  n*[i        U*U&5      n+A)A*[k        5         U&n,U Rr                  b!  [u        U Rr                  URf                  U&5      n,[$        Rv                  " U$U,4S
5      n-[y        U,URf                  :H  5      S
-
  n.UR{                  U5      R|                  n/[        U/U%URf                  U5      u  n0  n1U0S S 2US
-
  S24   R                  S5      n2[        UU-URf                  U5      u  n1n3n1UR                  U&5        UR                  U,5        UR                  U(5        UR                  U+5        UR                  U.5        UR                  U35        UR                  U25        GMA     [$        Rv                  " US	5      n[$        Rv                  " US	5      n[$        Rv                  " US	5      n[$        Rv                  " US	5      n[$        Rv                  " US	5      n[$        Rv                  " US	5      n[$        Rv                  " US	5      nA(A+A0A2A3A [k        5         [        R                  " 5         [$        R                  " UU R                  R                  :H  SS9n4U R                   R                  b"  UU4) ==   U R                   R                  -  ss'   [$        R                  " URZ                  S
   UR                  S9R                  URZ                  S	   S
5      n5U5UR                  S
5      :  n6[$        R                  " UU6[        5      n[$        R                  " UU6[        5      nUS
-   n7U5U7R                  S
5      :  n8[$        R                  " UU8S	5      nUU-
  n9UR                  S:X  a  U9* OU9R                  5       S
-
  U9-
  n:UR                  * U:-  n;U;R                  5       n<[$        R                  " U<R                  S	5      U<R                  S9n=[$        R                  " U7U<R                  S
5      :  U7U5      n>U<U=U>/==   U-  ss'   UR                  (       a$  [        U<U8) SS9n<[$        R                  " U<U8S	5      n<S	n?/ n@URZ                  S
   nA[        [Q        UA5      5       Hm  nBUBWAS
-
  :  a  US S 2WBS
-   4   OSnCU<S S 2WB4   UR                  UC-  -   US S 2UB4   -
  nDUDUR                  UR                  -  U?-  -   n?W@R                  U?5        Mo     [$        R                  " W@S S S2   S
S9nEUEU-   nF[        UEU6) 5      nE[$        R                  " UEU6S	5      nE[k        5         S S S 5        [Q        UR                  5       GHR  nG[        R                  R                  UR                  5      nHS	nI[Q        S	UR                  UR                  5       GH  nJUJUR                  -   nKWHUJUK nLS	nM[Q        S	UR                  UR                  5       GH  nNUR                  U5         UNUR                  -   nOWLUNUO nPWEUP   nQWUP   nRW!UP   nSWUP   nTWFUP   nUWUP   nV[o        UUSURf                  5      u  nWnXUWRp                  S S 2WS
-
  S24   n'U'UR                  S-   -  n'[i        U'UR5      nY[$        R                  " UYW6UP   [        5      nYUXS S 2US
-
  S24   R                  S5      nZ[$        R                  " UZW8UP   S	5      nZ[$        R                  " UZUVUR                  -
  UVUR                  -   5      n[[$        R                  " UZUU-
  5      n\[$        R                  " U[UU-
  5      n][$        R                  " U\U]5      n^S[        U^U8UP   ) 5      -  n_[        U]U\:  R                  5       U8UP   ) 5      n`UYUT-
  na[$        R                  " Ua5      nbUQ* Ub-  ncUQ* [$        R                  " UbSUR                  -
  SUR                  -   5      -  nd[$        R                  " UcUd5      ne[        UeU6UP   ) 5      nfUfUR                  U_-  -   ngUR                  Ug5        UR                  5         UR                  5         [$        RV                  " 5          [        UdUc:  R                  5       U6UP   ) 5      nh[$        R                  R                  R                  U'S[$        R                  S9RY                  U'R                  5      ni[$        R                  " U'SS9[$        R                  " UiU'-  SS9-
  njSUaS-  R                  5       -  nkUkUWGWIWM4'   UhUUGUIUM4'   UfUUGUIUM4'   U_UUGUIUM4'   U`UUGUIUM4'   UjR                  5       UUGUIUM4'   UbR                  5       UUGUIUM4'   S S S 5        S S S 5        WMS
-  nMGM     WIS
-  nIAWAXA'AYAZA[A\A]A_A`AaAbAcAdAeAfAgAhAiAjAkAUAQAVARASAT[k        5         GM      GMU     [$        RV                  " 5          W:R                  S
5      R                  5       nlW* R                  S
5      R                  5       nmW;R                  S
5      R                  5       nnUnWR                  5       -   no[        U R*                  R.                  [        R                  " 5       U-
  -  5      np0 nqUpUqS'   U R                  R                  Ul5      R                  5       R                  5       UqS'   U R                  R                  Um5      R                  5       R                  5       UqS'   U R                  R                  Un5      R                  5       R                  5       UqS'   U R                  R                  Uo5      R                  5       R                  5       UqS'   U R                  R                  UR                  5       5      R                  5       R                  5       UqS'   U R                  R                  U5      R                  5       R                  5       UqS'   U R                  R                  U5      R                  5       R                  5       UqS'   U R                  R                  U5      R                  5       R                  5       UqS'   U R                  R                  U5      R                  5       R                  5       UqS'   U R                  R                  U5      R                  5       R                  5       UqS '   U R                  R                  U5      R                  5       R                  5       UqS!'   U R                  R                  U5      R                  5       R                  5       UqS"'   U R                  R                  U5      R                  5       R                  5       UqS#'   WUR                  :H  R                  5       R                  5       UqS$'   U R                  R                  5       S	   UqS%'   U R*                  R.                  UqS&'   U R*                  R.                  U R6                  -  U R*                  lx        U R*                  =R,                  S
-  sl        U R                  Uq5        S S S 5        U R                  R                  5         U RD                  R                  XR*                  U RH                  5      U l$        U RH                  R                  (       aP  U R                  US S'9  U RD                  R                  U R                   U R*                  U RH                  5      U l$        A:AlAmAnAAqA;[k        5         [        R                  " 5         UR                  S	:  a0  US
-
  U R                  -  S	:X  a  U GR                  SS(9  [k        5         A!AAAAAAA4A7A5A6A8A<A=A>AEAF[k        5         GM`     U RD                  GR                  XR*                  U RH                  5      U l$        U RH                  R                  (       aQ  U R                  US S'9  U RD                  R                  U R                   U R*                  U RH                  5      U l$        g g ! , (       d  f       GN&= f! , (       d  f       GNc= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GNL= f))Nc               3   *   >#     T  S h  vN   M   N7fr  r`  r  s   r_   repeat_generator2_UnslothPPOTrainer.train.<locals>.repeat_generatorM  s     %%% %s   r   rt  rk  Tmax_new_tokensrI  top_ktop_p	do_samplez===training policy===r   r   rs   r   gather_deepspeed3_paramsrm   rr   r|  F)r   
shift_mean)axisg      ?)rp   dtyper   epszobjective/klzobjective/entropyzobjective/non_score_rewardzobjective/rlhf_rewardzobjective/scoreszpolicy/approxkl_avgzpolicy/clipfrac_avgzloss/policy_avgzloss/value_avgzval/clipfrac_avgzpolicy/entropy_avgz	val/ratiozval/ratio_varzval/num_eos_tokenslrepisode)trial)sampling)r[   r  r  rV   r  r  r  r  r   iterr   rF  rI  ra  rH   rV  rB  r   rI   zerostrainr  global_stepr  rM  r   rC  r  r   r   r=   r  r   r   r  on_train_beginr  r  r  model_wrappedranger   nextno_gradrx   rv   rK   r_  r&   r  rD  r   rF   r,   r  r/   r   rH  rJ   catr.   r  r  r6   r|   r~   r1   collectanyr  rJ  r   repeatr{   masked_fillr   rY  exprX  clonesizewhererW  r<   reversedr]  r^  stackr?   randompermutationrO  rP  r   
accumulateclampr\  squarerd  r;   floatrZ  r[  backwardstep	zero_gradr>   r   softmaxry   r  r}   r   meanr  gather_for_metricsr  varr  get_last_lrepochlogon_step_endr  _save_checkpointon_saverE  r  generate_completionson_train_end)srZ   r[   r  r  rV   
ref_policyr  r  r   r  iter_dataloaderr  
start_timestats_shapeapproxkl_statspg_clipfrac_statspg_loss_statsvf_loss_statsvf_clipfrac_statsentropy_statsratio_statsupdatedataqueriescontext_length	responsespostprocessed_responseslogprobsref_logprobsscoressequence_lengthsvaluesunwrapped_modelquery_responseslogitssiqueryquery_responseresponser   logprob
ref_output
ref_logitsref_logprobpostprocessed_responsepostprocessed_query_responsesequence_lengthunwrapped_value_model
full_value_valuescorecontain_eos_tokenresponse_idxsr   sequence_lengths_p1padding_mask_p1logrklnon_score_rewardrewardsactual_start
actual_end
lastgaelamadvantages_reversed
gen_lengtht
nextvaluesdelta
advantagesreturnsppo_epoch_idxb_indsminibatch_idxmini_batch_startmini_batch_endmini_batch_indsgradient_accumulation_idxmicro_batch_startmicro_batch_endmicro_batch_indsmb_advantagemb_responsesmb_query_responsesmb_logprobs	mb_return	mb_valuesr]   
vpred_tempnew_logprobsvpredvpredclipped
vf_losses1
vf_losses2vf_loss_maxvf_lossvf_clipfraclogprobs_diffratio	pg_losses
pg_losses2pg_loss_maxpg_losslosspg_clipfrac	prob_distentropyapproxklmean_klmean_entropymean_non_score_rewardrlhf_rewardr  metricsr  ss                                                                                                                     @r_   r  _UnslothPPOTrainer.trainB  s   yy&&NN	

^^
((00__
##	& /12,//))D0
 	12YY[
**D,A,A4CcCcd[@!KKCK?K?!KKCK?kk+= "#



#55

&*&9&9D<R<R&R

#)!!A%+/99TZZ5I5IDL^L^5^+_

(+/+=+=

(??&"(,		$**2F2F2X(Y

%(,

%??&"(,		$**2F2F2X(Y

%(,

%,,;;D**dll[ $$!ZZDN!%DAt559:FJJ!doo"55(D{+..v6!(q!1	*,'!#% 0JJ 0 0499KnKn$/?'..==(55)0,OW q'--"2D4Y4YZA#AD,Q,Q(QRE%4QT=b=b9b%cN-a.@AH$QT-R-R)RSF3FHEGM!)!224)0~O_OlOl)mJ 54 &-ZIYIfIf%g
!+!2!21nq6H26M3M!NJ$"2"2T"99J"7
H"MK"JM .6*))51B ..0@0M0Mx2.
 4999eE[=\^_3`0&89OScSpSp9p&qtu&uO,7,D,DU,K,W,W)'1-~?O?\?\^l($J1 'q.1*<r*A'ABJJ2NE",$&BDTDaDacq#KAua $$X.+223IJOOG, ''4$++O<MM%(MM%([ [\ "IIi3	*/))4KQ*O' 99Xq1$yyq9#(99-=q#A 61-61-k:ue_

 %*II.EI^I^IkIk.kqs$t!9900<--.$))2O2OO. !&Y__Q-?	HXHX Y ` `ajapapqrasuv w,/?/I/I!/LL ,,X|_U$00|_]&6&:#"/3F3P3PQR3S"T**6?AF $h."//47dUdhhj1nPT=T$(LL=2#5 *002$||GLLOGNNS"[[)<w||A)NPceuv
z23v=3 &&+G?:JW\]G#//!LG 
&(#&__Q/
!%
"34A56a5G1q5!1SJ#AqDMDJJ,CCfQPQTlRE!&dhh)>)K!KJ'..z:	 5
 #[[)<TrT)BK
$v-*:}E
"..z<K
u !z "'t':':!;..t/D/DE !(-a1F1FHbHb(c$%58R8R%RN&,-=n&MO01--21d6P6PRVRrRr-s)(33E:.?$BbBb.bO/>?PQ`/a,+56F+GL+45E+FL1@AQ1R.*23C*DK(/0@(AI(./?(@I18@RTdTqTq1r.FJ%+]]1nq6H26M3M%NF"d&6&6&==F+@+VL+0+<+< ,l;K.Lo,L %/q.12Dr2I/I$J$R$RSU$VE$)$5$5e_M]=^`a$bE+0;; % )D,@,@ @ )D,@,@ @,L
 */ei6G)HJ).lY6N)OJ*/))J
*KK&)KoVfFgEg,h&hG*5!+j!8 ? ? AOTdDeCe+K -9;,FM$)IIm$<E)5(=I*6UCRVR`R`L`behlhvhvbv9w)wJ*/))Iz*JK&1+M]@^?^&_G#*T\\G-C#CD'006%NN,%//1!&.9%/)%;$B$B$D|TdGeFe/" -2HH,?,?,G,GTV`e`m`m,G,n,q,qrxr~r~,	*///&b*IEIIV_bhVhnpLq*q+.-2B1H1H1J+Jjr}mMf/f g$/ !2-Pi2i j jqm]Le.e fipm]Le.e f$/ !2-Pi2i j jqiuiuiwm]Le.e fglgqgqgsM=Jc,c d# "1S ;v 2Q61y .tz "Q&M 
FL%"JmUZ\egqs~{IwR[$i?QS^  MW )d "<^ &&)..*!)	q1668(8(<(<Q(?(D(D(F%3fkkmC$**,,		j0HIJ!$*.*:*:*M*Mg*V*[*[*]*b*b*d'/3/?/?/R/RS_/`/e/e/g/l/l/n+,$$778MNSSUZZ\ 45 483C3C3V3VWb3c3h3h3j3o3o3q/0.2.>.>.Q.QRXR]R]R_.`.e.e.g.l.l.n*+151A1A1T1TUc1d1i1i1k1p1p1r-.151A1A1T1TUf1g1l1l1n1s1s1u-.-1-=-=-P-PQ^-_-d-d-f-k-k-m)*,0,<,<,O,OP],^,c,c,e,j,j,l().2.>.>.Q.QRc.d.i.i.k.p.p.r*+040@0@0S0STa0b0g0g0i0n0n0p,-'+'7'7'J'J;'W'\'\'^'c'c'e$+/+;+;+N+N{+[+_+_+a+f+f+h(1:>N>[>[1[0`0`0b0g0g0i,- $ 1 1 = = ? B%)ZZ%7%7	"#'::#5#58N8N#N

 

&&!+&!9 !< ""$00<<T::t||\DL||''%%e4%8#44<<TYY

TXT`T`aG\+@&'ScMJJL**Q.FQJ$B^B^3^bc3c))4)8' !#MY ;^ ,,99$

DLLY<<##!!%t!40088DJJPTP\P\]DL $G * 54? !\ "1S ;:N !s    A%AX%1AW5BAX1"AX	U4AXIAX=#D
AX+-AX=Q:AYW5
AXW?AXX
AXXAXX
AX(	X+
AX:X5AX=X=
AYY
AY	r  c                    U R                   nU R                  n[        U R                   R                  SSSSS9n[	        [
        5      n[        U R                  U R                  U R                   R                  S9 nU R                   GH  nUS   n[        R                  " 5          UR                  S   n	[        UR                  UUR                  S	   UR                   U5      u  pU
S S 2U	S 24   nUnU R"                  b!  [%        U R"                  UR                   U5      nUS
   R'                  [)        UR+                  USS95      5        US   R'                  [)        UR+                  U5      5      5        [        R,                  " X4S5      n[/        U R0                  XR                   U	5      u  pnUS   R'                  U R                  R3                  U5      R5                  5       R7                  5       R9                  5       5        S S S 5        U(       d  GM    O   S S S 5        [:        R<                  " U5      nU R                  R>                  (       a  [A        5       (       a  [C        URD                  S	S 5        SURF                  ;   a1  S	S K$nURJ                  b   URM                  SURO                  US905        SURF                  ;   a  [Q        SUS9  g g g ! , (       d  f       N= f! , (       d  f       N= f)Ngaz?rt  rk  Tr  r  r   rs   r   r@  )skip_special_tokenszmodel responserN     wandbcompletions)	dataframecomet_mlzcompletions.csv)nametable))r[   r  r   rF  r*   listrK   rV   r  r_  r  rI   r  rv   r&   r  r   rH  rJ   extendr0   batch_decoder  r6   r  r  r  cpunumpyrB   	DataFrameis_main_processr8   rE   ilocr  r  runr   Tabler:   )rZ   r  r[   r  r  r  r<  batchr@  r4  rA  rL  rB  rG  rH  rN  dfr  s                     r_   r$  '_UnslothPPOTrainer.generate_completions  s   yy00,9944$
 D!(JJ((499CfCf
--k*]]_%*[[^N(8'..A(55))%N  .a.@AH-5*))51B ..0@0M0Mx2. 'N))%&6&C&CE_c&C&de *+22%&6&C&CDZ&[\ 4999e=\^_3`0",))+GIfIfhv#KAa 'N))$*:*:*M*Me*T*Z*Z*\*`*`*b*h*h*jk5 %8 8? .
F \\% ++ "" U!34$..(99(II}ekkBk.GHIT^^+-* , ,A %_
 
s+   <+K3'E'K"K3 K3"
K0,K33
Lc                   > U R                   R                  c*  [        U R                   R                  5      R                  nO(U R                   R                  R                  S5      S   nU R                  US9  [        TU ]!  X5        g )N/rm   )
model_name)	r[   r   r   r   r  splitcreate_model_cardrf  r"  )rZ   rV   r  r  rh  s       r_   r"  #_UnslothPPOTrainer._save_checkpoint  sj    99!!)dii22388J//55c:2>J*5 .rb   r  dataset_nametagsc                    U R                  5       (       d  g[        U R                  R                  S5      (       ac  [        R
                  R                  U R                  R                  R                  5      (       d!  U R                  R                  R                  nOSnUc  [        5       nO$[        U[        5      (       a  U1nO[        U5      n[        U R                  R                  S5      (       a  UR                  S5        S[        R                  ;   a  UR                  S5        UR                  U R                  5        [        R                   " S5      n[#        UUU R$                  UU['        5       (       a+  [(        R*                  b  [(        R*                  R,                  OS[/        5       SUS	S
S9nUR1                  [        R
                  R3                  U R4                  R6                  S5      5        g)a  
Creates a draft of a model card using the information available to the `Trainer`.

Args:
    model_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the model.
    dataset_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the dataset used for training.
    tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
        Tags to be associated with the model card.
N_name_or_pathunsloth_versionunslothJOB_IDhf_jobsa          @article{mziegler2019fine-tuning,
            title        = {{Fine-Tuning Language Models from Human Preferences}},
            author       = {Daniel M. Ziegler and Nisan Stiennon and Jeffrey Wu and Tom B. Brown and Alec Radford and Dario Amodei and Paul F. Christiano and Geoffrey Irving},
            year         = 2019,
            eprint       = {arXiv:1909.08593}
        }PPOz2Fine-Tuning Language Models from Human Preferencesz
1909.08593)
base_modelr  r   r  r  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)r  rY   rV   r  rA   pathisdirr  setr  straddenvironr1  r  rG   dedentr2   r   r9   r  r  urlr3   savejoinr[   r   )rZ   r  r  r  r  citation
model_cards          r_   r  $_UnslothPPOTrainer.create_model_card  sn   " ))++4::$$o66rww}}TZZM^M^MlMl?m?m**88JJ <5Dc""6Dt9D4::$$&788HHYrzz!HHYDOO$ ?? $  )!!**%'9';';		@Ueiimm[_.0%L!

 	TYY%9%9;GHrb   )#r  r[   r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  rV   rT  r  r  r  r  r  rU  r  r  r  r  rH  r  r  r  )NN)NNNN)NF)F)NNN))r~  r  r  r  r  r  r   r   r	   r    r   r   r"   r>   Moduler   r   dictr  tuplerI   r  	Optimizerr  LambdaLRr  r$   rg  r   r  r  r(   r  boolr  r  r$  r"  r  r  r  r  s   @r_   r  r  &  s   "H J <@EIVb59.2!PNPN #)+=?UWeef
PN yyPN BII&PN iiPN PN YYPN   78PN uWd3<.@%@ABPN %++//1I1I1R1RRSPN D12PN  l+!PN" 
#PNdj $Z $ T T.Xc] .4 . .R^h
>T >B/ %)&*,0	@ISM@I sm@I CcD()	@I @Irb   r  c                   8   ^  \ rS rSrSr    SU 4S jjrSrU =r$ )UnslothPPOTraineri   a  
    Trainer for Proximal Policy Optimization (PPO).

For details on PPO, see the paper: [Proximal Policy Optimization
Algorithms](https://huggingface.co/papers/1707.06347).

Args:
    args ([`PPOConfig`]):
        Training arguments.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`]):
        Class to process the data.
    model (`torch.nn.Module`):
        Model to be trained. This is the policy model.
    ref_model (`torch.nn.Module`, *optional*):
        Reference model used to compute the KL divergence. If `None`, a copy of the policy model is created.
    reward_model (`torch.nn.Module`):
        Reward model used to compute the rewards.
    train_dataset ([`~datasets.Dataset`]):
        Dataset for training.
    value_model (`torch.nn.Module`):
        Value model used to predict the value of a state.
    data_collator ([`~transformers.DataCollatorWithPadding`], *optional*):
        Data collator to batch and pad samples from the dataset. If `None`, a default data collator is created
        using the `processing_class`.
    eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*):
        Dataset for evaluation.
    optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`):
        Tuple containing the optimizer and the learning rate scheduler to use for training. If `None`, the
        optimizer and the learning rate scheduler are created using the
        [`~transformers.Trainer.create_optimizer_and_scheduler`] method.
    callbacks (`list` of [`~transformers.TrainerCallback`], *optional*):
        Callbacks to use during training.
    peft_config ([`~peft.config.PeftConfig`], *optional*):
        PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the policy `model`
        will be wrapped with the specified PEFT adapter.

    c                 0  > Uc
  [        5       n[        USS5      n[        U5      [        La  Sn[        USS5      n[        U5      [        La  SnSn[        R
                  R                  SS5      S:H  nU(       d1  [        R
                  R                  SS5      S:X  a  [        S5        S	n[        R
                  R                  S
S5      n[        UR                  SS 5      =(       d    [        UR                  SS 5      nUc  UR                  5       R                  nSSKJn  U" U5      nU[        R                  :H  nU(       d  U(       a  U(       a  [        S5      eU(       d  U(       d  U(       a  [        S5      eU(       a"  SUl        SUl        S[        R
                  S'   OCU(       d<  U(       d5  US:X  a/  UUl        U(       + Ul        U(       a  SOS[        R
                  S'   [        USS 5      b-  [        USS5      S:X  a  SUl        [        USS 5      c  SUl        [        USS 5      nUb/  US:  a)  SSKJn  [-        U5      [-        S5      ::  a  [        S5        [        USS5      S:w  aL  [        USS5      nUS:X  a!  UR.                  U:  a  UR.                  Ul        [        US S 5      c
  Ub  UUl        [        US!S5      n[        U5      [        La  Sn[        US"S5      n[        U5      [        La  SnUR                   (       a  U(       a  SUl        S	Ul        UR"                  (       a  U(       a  S	Ul        SUl        U(       a  SUl        SUl        Oc[        R
                  R                  S
S5      S#:X  a  S	Ul        SUl        O0U(       d)  U(       d"  UR"                  Ul        UR                   Ul        Sn[9        5       R                  S$S 5      b  S	n[9        5       R                  S%S 5      b  S	nU(       a  S[        R
                  S&'   S'[9        5       ;  a  [;        US'5      (       d  OD[        US'S 5      n[        US'S 5      nUc'  Ub$  UR<                  n[;        US'5      (       a  UUl        Ub!  [;        US(5      (       a  UR?                  5         S)[9        5       ;   a   [;        [@        S*5      (       a  S+[@        l!        S,[9        5       ;   aU  [;        US*5      (       a  S+Ul!        [;        US)5      (       a,  [;        UR@                  S*5      (       a  S+UR@                  l!        S,[9        5       ;   a  UO[@        nSS-K"J#n  [I        UU5      (       dx  [I        U[J        5      (       a(  S.URL                  ;  a  [O        USS/[        US0S 5      S19nO[I        U[N        5      (       a%  S.URL                  ;   a  [K        U[        US0S 5      S29nOJ[;        US35      (       a  SUl(        [;        US45      (       a  S5Ul)        [;        US65      (       a	  S7S	0Ul*        [I        UU5      (       dx  [;        US85      (       dg  [;        US)5      (       aV  [I        U[J        5      (       a   [K        UR@                  [        US0S 5      S29nO![O        UR@                  SS/[        US0S 5      S19n/ n SS9K+J,n!  U!" S:U 5        [        US;S 5      [Z        R\                  :X  a(  UR^                  S:  a  [        US<S5      S:w  a  SUl0        S=[9        5       ;   a!  [;        US(5      (       a  UR?                  5         [b        T$U ]  " SDUUUUUUUUU	U
US>.UD6  S=[9        5       ;   a!  [;        US?5      (       a  URg                  5         [;        U S@5      (       a-  U Rh                  Rk                  5         [;        U S@5      (       a  U ?4[        USAS 5      b  U Rl                  UR                  5       l6         [;        U SB5      (       aV  U Rn                  Rp                  n"Un#[;        U#S=5      (       a&  U"U#l9        U#Rt                  n#[;        U#S=5      (       a  M&  U"U#l9         [;        U SC5      (       a.  [w        [y        U Rz                  R|                  5      U 5      U l>        g )ENr   Fr   UNSLOTH_ENABLE_FULL_FINETUNING01UNSLOTH_FORCE_FLOAT32zKUnsloth: Switching to float32 training since model cannot work with float16TUNSLOTH_MIXED_PRECISIONry   r  torch_dtyper   )
_get_dtypezuUnsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`zuUnsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`r   ACCELERATE_MIXED_PRECISIONr  r   r   r   rm  r   rs   )__version__z4.45.2z**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!
`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`r   r  r   r   r   bfloat16compute_metricspreprocess_logits_for_metricsUNSLOTH_RETURN_LOGITSmax_seq_lengthrW   	tokenizerpadding_siderightr  )UnslothVisionDataCollatorlabelsrt  pad_to_multiple_of)mlmmlm_probabilityr  )r  r  dataset_text_fieldrs  dataset_kwargsskip_prepare_datasetpad)PatchRLStatisticsppo_trainerparallel_mode_n_gpurV   )r[   r  rV   r  r  r  r  r  r  r  r  rX   neftune_hook_handler9  r  r  r`  )?r   r  typer  rA   r  getra  r  get_input_embeddingsr  unsloth_zoo.utilsr  rI   float16	TypeErrorr   r   r   r   transformersr  rO   r   r   r   r   r   localsrY   r  rW   r  r  unsloth_zoo.vision_utilsr  r  rP   column_names+TransformersDataCollatorForLanguageModelingr  r  r  unsloth_zoo.logging_utilsr  rR   NOT_DISTRIBUTEDn_gpur  rf  rg  rX   r  remover9  r  scaleraccelerator_scalerrV   rS   re   rh  r  )%rZ   r[   r  rV   r  r  r  r  r  r  r  r  r\   use_bf16use_fp16force_float32full_finetuningmixed_precision_dtyper  r  r  ga_stepstransformers_versioneval_bszr   r   _output_logitsmodel_max_seq_lengthargs_max_seq_lengthr  _UnslothPPOTrainer__tokenizerr  other_metricsr  r  current_modelrh  s%                                       r_   rg  UnslothPPOTrainer.__init__F  s    < 0 24/>%%x4/>%%x**..)I3OSVVBJJNN3JC$PTW$W_` M "

/H) Tgt4bm]a8b=%"<"<">"D"D%05!5==('hy  JA  @B  :Bg(9  NE  DF  >FDIDI7;BJJ3481F)1SDI#DIAHvfBJJ344.:wt_^b?cgk?k!(Dt\408C$/4!>EHqLH+,0AA @ A4$/47t%A1EH1}!A!AH!Lpt  qQ  qQdNmt6=E(J^  @H`d`| '7?t+e^ '7?t+e^99u)<\`dFY99t)<[`TEX"'D"'DZZ^^5yAZO"&D"'D"&))D"&))D8<<)40<tn8<<7>J]aN25BJJ./68+GDBR4S4S#*52BD#I #*42BD#I"*/C/O!&!5!54!122.D4G!?!? &("wy.'I'Idk9Ka)'88Za:J:W'55'BRB\B\^l:m:m  Zao  pJ  pJ  pW*<*H&iF-)BCC-)?@@XUbUoUoEo K&))07KT)R	! M+VWW\dhu  iC  iC  ]C 6)07KT)R!
 t455TYt7Qt122bD4Kt-..G]_cFd0C-)BCC;..7;3T3Tm-CDD$:#---4T;OQU-V%M
 %P#--#*--4T;OQU-V	%M ?-7 4$/<3O3OOTXT^T^abTbtXq)Q.fh75.#A#A  	0/!')%)'!%	0 )/	0 fh75/#B#B!4.//$$++-t2339Q4.5A?C?W?WE&&(<4''%%,,F!M-11390 - 3 3 -11 06M,4!!#$=dnn>R>R$SUYZDJrb   )r  )NNNN)r~  r  r  r  r  rg  r  r  r  s   @r_   r  r     s#    $\ e erb   r  )ir  rI   r   torch.nnr>   r   Ftypingr   r   r   r   r	   r
   r   r   trl.trainer.ppo_trainerr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rJ   rK   dataclassesrM   rN   packaging.versionrO   r  
contextlibr  rP   rQ   r  transformers.training_argsrR   rc   typesrS   re   torch_compile_optionscompiler   r  r   r   r   r   r  r  r`  rb   r_   <module>r     s  0    $ I I I k  k  k  k  k  k  k  k  k  k  k  k  k  k  k  k  k  k  k 
  ( %   " $  3      4;PR S"||  \\	&,, %  	
 \\6ell C ELL  S5y S5 S5j yI yItK* KZ rb   