
    h                     8   S SK r S SKrS SKrS SKrS SKrS SKJr  S SKJrJ	r	  S SK
Jr  S SKJrJr  S SKrS SKrS SKrS SKJr  S SKJr  S SKJrJr  S SKJr  S S	KJr  S S
KJ r J!r!J"r"J#r#J$r$J%r%J&r&J'r'J(r(J)r)  S SK*J+r+  S SK,J-r-J.r.  S SK/J0r0J1r1J2r2  S SK3J4r4J5r5  SSK6J7r7J8r8  SSK9J:r:  SSK;J<r<  SSK=J>r>  SSK?J@r@JArAJBrBJCrCJDrDJErEJFrFJGrGJHrHJIrIJJrJJKrKJLrLJMrMJNrNJOrO  \4" 5       (       a
  S SKPJQrQJRrRJSrS  \)" 5       (       a  S SKTrTSrU " S S\R                  5      rW " S S\&5      rXg)    N)defaultdict)contextmanagernullcontext)Path)OptionalUnion)Accelerator)	broadcastgather_object)Dataset)
DataLoader)
BaseImageProcessorDataCollatorWithPaddingFeatureExtractionMixinGenerationConfigPreTrainedTokenizerBaseProcessorMixinTrainerTrainerCallbackTrainerControlis_wandb_available)#get_reporting_integration_callbacks)DEFAULT_CALLBACKSDEFAULT_PROGRESS_CALLBACK)CallbackHandlerExportableStatePrinterCallback)is_peft_availableis_rich_available   )masked_meanmasked_whiten)create_reference_model)unwrap_model_for_generation   )	PPOConfig)OnlineTrainerStatebatch_generationdisable_dropout_in_modelempty_cache	exact_divfirst_true_indicesforwardgenerate_model_cardget_comet_experiment_url
get_rewardlog_table_to_comet_experimentpeft_module_casting_to_bf16prepare_deepspeedprint_rich_tableselective_log_softmaxtruncate_response)
PeftConfig	PeftModelget_peft_model      ?c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )PolicyAndValueWrapperV   c                    > [         TU ]  5         Xl        X l        [	        X"R
                  5      U l        UR                  U l        g N)super__init__policyvalue_modelgetattrbase_model_prefixcritic_backboneis_gradient_checkpointing)selfrB   rC   	__class__s      Q/home/james-whalen/.local/lib/python3.13/site-packages/trl/trainer/ppo_trainer.pyrA   PolicyAndValueWrapper.__init__W   s:    &&{4Q4QR)/)I)I&    c                     U R                   " S0 UD6nU R                  R                  UR                  S   5      nU R                  " S0 UD6U4$ )N )rF   rC   scorehidden_statesrB   )rH   kwargsoutputlogitss       rJ   r-   PolicyAndValueWrapper.forward^   sM    %%//!!''(<(<R(@A{{$V$f,,rL   )rF   rG   rB   rC   )returnN)__name__
__module____qualname____firstlineno__rA   r-   __static_attributes____classcell__rI   s   @rJ   r<   r<   V   s    J- -rL   r<   c                   .  ^  \ rS rSrSrSS/r     S$S\S\\\	\
\\4      S\R                  S	\\R                     S
\R                  S\S\R                  S\\   S\\\\\\4   4      S\\R*                  R,                  \R*                  R.                  R0                  4   S\\\      S\S   SS4S jjrS\4S jrS\4S jr\S 5       r S%S\\   S\!4U 4S jjjr"S r#S&S\!4S jjr$U 4S jr%   S'S\\   S \\   S!\\\\   S4   4S" jjr&S#r'U =r($ )(
PPOTrainerd   a  Trainer for Proximal Policy Optimization (PPO).

For details on PPO, see the paper: [Proximal Policy Optimization
Algorithms](https://huggingface.co/papers/1707.06347).

Args:
    args ([`PPOConfig`]):
        Training arguments.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`]):
        Class to process the data.
    model (`torch.nn.Module`):
        Model to be trained. This is the policy model.
    ref_model (`torch.nn.Module`, *optional*):
        Reference model used to compute the KL divergence. If `None`, a copy of the policy model is created.
    reward_model (`torch.nn.Module`):
        Reward model used to compute the rewards.
    train_dataset ([`~datasets.Dataset`]):
        Dataset for training.
    value_model (`torch.nn.Module`):
        Value model used to predict the value of a state.
    data_collator ([`~transformers.DataCollatorWithPadding`], *optional*):
        Data collator to batch and pad samples from the dataset. If `None`, a default data collator is created
        using the `processing_class`.
    eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*):
        Dataset for evaluation.
    optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`):
        Tuple containing the optimizer and the learning rate scheduler to use for training. If `None`, the
        optimizer and the learning rate scheduler are created using the
        [`~transformers.Trainer.create_optimizer_and_scheduler`] method.
    callbacks (`list` of [`~transformers.TrainerCallback`], *optional*):
        Callbacks to use during training.
    peft_config ([`~peft.config.PeftConfig`], *optional*):
        PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the policy `model`
        will be wrapped with the specified PEFT adapter.
trlppoNargsprocessing_classmodel	ref_modelreward_modeltrain_datasetrC   data_collatoreval_dataset
optimizers	callbackspeft_configr7   rV   c                    XCL a  [        S5      eXl        X l        X0l        Uc  [	        U R                  5      nUR
                  (       a  UR                  (       a  [        S5      eUR
                  (       aV  UR
                  S:X  a-  UR                  =U R                  R                  l        U l        OE[        SUR
                   S35      eUR                  =U R                  R                  l        U l        U R                  R                  S;  a  [        S5      e[        5       (       d  Ub  [        S5      e[        5       (       a  Ub  [        U R                  [        5      (       a  U R                  R                  5       U l        [        U R                  U5      U l        UR                   (       a1  [#        U R                  S	S
5      (       a  [%        U R                  5        [        5       =(       a    [        U R                  [        5      U l        UR(                  U l        UR*                  U l        U(       a  X@l        O3U R&                  (       a  S U l        O[/        U R                  5      U l        XPl        X`l        [5        U5      U l        Xpl        Xl        Xl        U
u  U l        U l         S U l!        URD                  c'  [G        URH                  U R6                  -  5      Ul"        [K        URL                  S9nXl'        URP                  Ul)        URT                  URL                  -  Ul+        [G        URT                  URR                  -  5      Ul,        [G        URV                  URR                  -  5      Ul-        []        URZ                  UR^                  S5      Ul0        []        URV                  UR^                  S5      Ul1        URd                  (       a%  URb                  S:  d   SURb                   S35       e[f        Rh                  " URD                  URZ                  -  5      Ul5        [l        Rn                  " [G        [p        Rp                  " 5       5      URr                  S9n[u        US5      Rw                  5       nURx                   SURz                   SU 3Ul>        URz                  UR~                  S-  -   U l@        UR                  S:  a(  [        SURj                  UR                  -  5      U lC        URV                  U lD        U R                  U R,                  U R8                  U R0                  4 H  nUc  M  [        U5        M     [        U R                  U R8                  5      U lG        U R                  R                  U R                  lH        U R                  URj                  S9  [        [        U R                  R                  5      -   nUc  UOUU-   U lM        [        U R                  U R                  U R                  U R>                  U R@                  5      U lO        U R                  U R                  R                  (       a  [        O[        5        [        5       U lU        [        U R                  5       U R                  5       U R                  R                  U R                  /-    Vs/ s H  n[        U[        5      (       d  M  UPM     snS9U lZ        SU l[        S U l\        [#        U RN                  R                  SS 5      S LU l]        [#        U RN                  R                  SS 5      S LU l^        S U l_        U R                  R                  (       a  U R                  5         U R                  R                  (       a)  [        R                  " U R                  R                  SS9  [        U R                  S5      (       a%  U R                  R                  U R                  5        [        U R2                  U R                  SU R:                  SS9U lj        [l        R                  " URz                  5        UR                  U R                  U R>                  U R                  5      u  U lG        U l        U lj        [l        R                  " U R                  5        [        U R<                  UR                  U R:                  SS9U ln        UR                  U R                  5      U ln        U R                  (       a  [        U R0                  URT                  UR                  UR                   5      U l        U R,                  c  U R&                  (       d  [        S5      eg [        U R,                  URT                  UR                  UR                   5      U l        g U R,                  c  U R&                  (       d  [        S5      eO4U R,                  R                  U RN                  Rr                  5      U l        U R0                  R                  U RN                  Rr                  5      U l        g s  snf ) Nz`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the same as `model`, you must make a copy of it, or `None` if you use peft.z5You cannot set both `stop_token` and `stop_token_id`.eoszUnknown `stop_token` z9. Allowed values are: `'eos'` and `None` (no stop token).>   k1k3zkl_estimator must be either 'k1' (straightforward, unbiased) or 'k3' (lower variance, unbiased, appears to be a strictly better estimator). See [Approximating KL Divergence](http://joschu.net/blog/kl-approx.html) for details.zvPEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT modelsis_loaded_in_4bitF)gradient_accumulation_stepsz5`batch_size` must be a multiple of `num_mini_batches`z;`local_batch_size` must be a multiple of `num_mini_batches`   zPer-rank minibatch size z is insufficient for whiteningdevicer   __i r%   )num_training_steps)is_local_process_zerois_world_process_zerostateful_callbacksdeepspeed_pluginfsdp_pluginT)exist_okadd_model_tags)
batch_sizeshuffle
collate_fn	drop_last)r   r   r   z1No reference model and model is not a Peft model.)r
ValueErrorrc   rd   policy_modelr   
stop_tokenstop_token_ideos_token_idgeneration_configkl_estimatorr   ImportError
isinstancer8   merge_and_unloadr9   bf16rD   r2   is_peft_modelmodel_adapter_nameref_adapter_namerf   r#   rg   rh   lentrain_dataset_lenrC   ri   rj   	optimizerlr_scheduleroptimizer_cls_and_kwargstotal_episodesintnum_train_epochsr	   rs   acceleratornum_processes
world_sizeper_device_train_batch_sizelocal_batch_sizemicro_batch_sizer   r+   num_mini_batchesmini_batch_sizelocal_mini_batch_sizewhiten_rewardsmathceilnum_total_batchestorchtensortimerv   r
   itemexp_nameseedrun_nameprocess_index
local_seednum_sample_generationsmaxsample_generations_freqlocal_dataloader_batch_sizer)   r<   re   configcreate_optimizer_and_schedulerr   r   	report_torl   r   callback_handleradd_callbackdisable_tqdmr   r   r   controlr'   ry   rz   r   statecurrent_floshp_search_backendis_deepspeed_enabledis_fsdp_enabledhub_model_idpush_to_hubinit_hf_reposhould_saveosmakedirs
output_dirhasattrr   
_tag_namesr   
dataloadermanual_seedprepareper_device_eval_batch_sizeeval_dataloaderr3   fp16to)rH   rc   rd   re   rf   rg   rh   rC   ri   rj   rk   rl   rm   r   time_tensortime_intmoduledefault_callbackscbs                      rJ   rA   PPOTrainer.__init__   sF   $ Z 
 	 0!  3D4I4IJM ??t11TUU__%'XhXuXuu!!33@4CU +DOO+<<uv  UYTfTffD//<t?Q 99!!5d  !""{'> I    [%<$++Y77$($5$5$F$F$H! !/t/@/@+ NDyyWT%6%68KUSS+D,=,=>.0]Z@Q@QS\5]"&"9"9 $ 5 5&N!DN3D4E4EFDN(*!$]!3&*(,6))(,%
 &"%d&;&;d>T>T&T"UD!d>^>^_&%33 $ @ @4CcCc c #D$D$Dt$V Wd33dooEF(OOT224k 
 &/!!4#8#8:w&
" --2 *4+E+E*FFde2
 "&$//1"
 ll3tyy{#3K<N<NO[!,113==/DII;b
C))k&?&?&&HH&&*+.q$2H2HDLgLg2g+hD(+/+@+@(
 (($..$:J:JDL]L]^F!(0 _ +4+<+<d>N>NO
 --44

++#55 	, 	
 .0STXT]T]TgTg0hh.7.?*EVYbEb /NNDJJ(=(=t~~tO`O`!
 	TYY-C-C/Ibc%''"&"<"<">"&"<"<">!22<<~M MrQ[\^`oQpM 

 !%$+D,<,<,B,BDVX\$]ei$i!&t'7'7'='=}dS[__ 99  99  KK		,,t< 4::/00JJ%%doo6
 %77))
 	$))$6A6I6I$**VZVdVdfjfufu6v3
DNDO$//*)66))	 
  +2243G3GH$$ 1!!4#C#CTYYPTPYPY!D ~~%))$%XYY * "3NND$D$DdiiQUQZQZ" ~~%))$%XYY * "&!2!243C3C3J3J!K $ 1 1 4 4T5E5E5L5L MD{ s   i.ic                     U R                   $ r?   r   rH   s    rJ   get_train_dataloaderPPOTrainer.get_train_dataloader]  s    rL   c                     U R                   $ r?   )r   r   s    rJ   get_eval_dataloaderPPOTrainer.get_eval_dataloader`  s    ###rL   c              #   ,  #    U R                   (       aN  U R                  (       d=  U R                  R                  U R                  R
                  5      R                  5       O	[        5          U R                  (       a/  U R                  R
                  R                  U R                  5        Sv   U R                  (       a8  U R                  R
                  R                  U R                  =(       d    S5        SSS5        g! , (       d  f       g= f7f)zWContext manager for handling null reference model (that is, peft adapter manipulation).Ndefault)
r   r   r   unwrap_modelre   rB   disable_adapterr   set_adapterr   r   s    rJ   null_ref_contextPPOTrainer.null_ref_contextc  s     
 !!$*?*? ))$***;*;<LLN $$

!!--d.C.CD$$

!!--d.E.E.RS  s   A*D,BD:	D
DDr   _internal_callc                   > U R                   nU R                   R                  U l         U R                  (       a  U R                  nU R                   U l        [        TU ]  X5        X0l         U R                  (       a  WU l        g g r?   )re   rB   r   	deepspeedr@   
save_model)rH   r   r   backup_modelbackup_deepspeedrI   s        rJ   r   PPOTrainer.save_modelq  sb    zzZZ&&
$$#~~!ZZDN:6!
$$-DN %rL   c                 n,  ^r U R                   nU R                  nU R                  nU R                  nU R                  nU R
                  nU R                  nU R                  mrUR                  nUr4S jn	[        U	" 5       5      n
[        UR                  UR                  S-   SSSS9nUR                  S5        [        R                  " 5       nUR                  UR                   UR"                  4n[$        R&                  " XS9n[$        R&                  " XS9n[$        R&                  " XS9n[$        R&                  " XS9n[$        R&                  " XS9n[$        R&                  " XS9n[$        R&                  " XS9nUR)                  5         S	U R*                  l        S	U R*                  l        UR0                  U R*                  l        UR4                  U R6                  -  U R*                  l        UR:                  br  UR:                  S
:  aG  [<        R>                  " U R*                  R2                  UR:                  -  5      U R*                  l        OUR:                  U R*                  l        UR@                  br  UR@                  S
:  aG  [<        R>                  " U R*                  R2                  UR@                  -  5      U R*                  l         OUR@                  U R*                  l         URB                  br  URB                  S
:  aG  [<        R>                  " U R*                  R2                  URB                  -  5      U R*                  l!        OURB                  U R*                  l!        U RD                  RG                  XR*                  U RH                  5      U l$        U RJ                  (       a"  U R                  U l&        U R                  U l'        [Q        S
UR0                  S
-   5       GH4  nU R*                  =R.                  S
URR                  -  -  sl        [U        U
5      n[$        RV                  " 5          US   RY                  U5      nURZ                  S
   n/ n/ n/ n/ n/ n/ n/ n[]        U R                  U R                  U R                   R^                  S9 n [a        U Rb                  UURd                  URf                  U5      u  n!n"S S S 5        [Q        S	URZ                  S	   URd                  5       GH>  n#UU#U#URd                  -    n$W!U#U#URd                  -    n%U%S S 2US 24   n&W"U#U#URd                  -    n'[i        U'U&5      n(A'[k        5         Uc;  U Rm                  5          [o        URb                  U%URf                  5      n)S S S 5        O[o        UU%URf                  5      n)W)Rp                  S S 2US
-
  S24   n*U*UR                  S-   -  n*[i        U*U&5      n+A)A*[k        5         U&n,U Rr                  b!  [u        U Rr                  URf                  U&5      n,[$        Rv                  " U$U,4S
5      n-[y        U,URf                  :H  5      S
-
  n.UR{                  U5      R|                  n/[        U/U%URf                  U5      u  n0  n1U0S S 2US
-
  S24   R                  S5      n2[        UU-URf                  U5      u  n1n3n1UR                  U&5        UR                  U,5        UR                  U(5        UR                  U+5        UR                  U.5        UR                  U35        UR                  U25        GMA     [$        Rv                  " US	5      n[$        Rv                  " US	5      n[$        Rv                  " US	5      n[$        Rv                  " US	5      n[$        Rv                  " US	5      n[$        Rv                  " US	5      n[$        Rv                  " US	5      nA(A+A0A2A3A [k        5         [        R                  " 5         [$        R                  " UU R                  R                  :H  SS9n4U R                   R                  b"  UU4) ==   U R                   R                  -  ss'   [$        R                  " URZ                  S
   UR                  S9R                  URZ                  S	   S
5      n5U5UR                  S
5      :  n6[$        R                  " UU6[        5      n[$        R                  " UU6[        5      nUS
-   n7U5U7R                  S
5      :  n8[$        R                  " UU8S	5      nUU-
  n9UR                  S:X  a  U9* OU9R                  5       S
-
  U9-
  n:UR                  * U:-  n;U;R                  5       n<[$        R                  " U<R                  S	5      U<R                  S9n=[$        R                  " U7U<R                  S
5      :  U7U5      n>U<U=U>/==   U-  ss'   UR                  (       a$  [        U<U8) SS9n<[$        R                  " U<U8S	5      n<S	n?/ n@URZ                  S
   nA[        [Q        UA5      5       Hm  nBUBWAS
-
  :  a  US S 2WBS
-   4   OSnCU<S S 2WB4   UR                  UC-  -   US S 2UB4   -
  nDUDUR                  UR                  -  U?-  -   n?W@R                  U?5        Mo     [$        R                  " W@S S S2   S
S9nEUEU-   nF[        UEU6) 5      nE[$        R                  " UEU6S	5      nE[k        5         S S S 5        [Q        UR                  5       GH*  nG[        R                  R                  UR                  5      nHS	nI[Q        S	UR                  UR                  5       GH  nJUJUR                  -   nKWHUJUK nLS	nM[Q        S	UR                  UR                  5       GHk  nNUR                  U5         UNUR                  -   nOWLUNUO nPWEUP   nQWUP   nRW!UP   nSWUP   nTWFUP   nUWUP   nV[o        UUSURf                  5      u  nWnXUWRp                  S S 2WS
-
  S24   n'U'UR                  S-   -  n'[i        U'UR5      nY[$        R                  " UYW6UP   [        5      nYUXS S 2US
-
  S24   R                  S5      nZ[$        R                  " UZW8UP   S	5      nZ[$        R                  " UZUVUR                  -
  UVUR                  -   5      n[[$        R                  " UZUU-
  5      n\[$        R                  " U[UU-
  5      n][$        R                  " U\U]5      n^S[        U^U8UP   ) 5      -  n_[        U]U\:  R                  5       U8UP   ) 5      n`UYUT-
  na[$        R                  " Ua5      nbUQ* Ub-  ncUQ* [$        R                  " UbSUR                  -
  SUR                  -   5      -  nd[$        R                  " UcUd5      ne[        UeU6UP   ) 5      nfUfUR                  U_-  -   ngUR                  Ug5        UR                  5         UR                  5         [$        RV                  " 5          [        UdUc:  R                  5       U6UP   ) 5      nh[$        R                  R                  R                  U'SS9ni[$        R                  " U'SS9[$        R                  " UiU'-  SS9-
  njSUaS-  R                  5       -  nkUkUWGWIWM4'   UhUUGUIUM4'   UfUUGUIUM4'   U_UUGUIUM4'   U`UUGUIUM4'   UjR                  5       UUGUIUM4'   UbR                  5       UUGUIUM4'   S S S 5        S S S 5        WMS
-  nMGMn     WIS
-  nIAWAXA'AYAZA[A\A]A_A`AaAbAcAdAeAfAgAhAiAjAkAUAQAVARASAT[k        5         GM     GM-     [$        RV                  " 5          W:R                  S
5      R                  5       nlW* R                  S
5      R                  5       nmW;R                  S
5      R                  5       nnUnWR                  5       -   no[        U R*                  R.                  [        R                  " 5       U-
  -  5      np0 nqUpUqS'   U R                  R                  Ul5      R                  5       R                  5       UqS'   U R                  R                  Um5      R                  5       R                  5       UqS'   U R                  R                  Un5      R                  5       R                  5       UqS'   U R                  R                  Uo5      R                  5       R                  5       UqS'   U R                  R                  UR                  5       5      R                  5       R                  5       UqS'   U R                  R                  U5      R                  5       R                  5       UqS'   U R                  R                  U5      R                  5       R                  5       UqS'   U R                  R                  U5      R                  5       R                  5       UqS'   U R                  R                  U5      R                  5       R                  5       UqS'   U R                  R                  U5      R                  5       R                  5       UqS'   U R                  R                  U5      R                  5       R                  5       UqS '   U R                  R                  U5      R                  5       R                  5       UqS!'   U R                  R                  U5      R                  5       R                  5       UqS"'   WUR                  :H  R                  5       R                  5       UqS#'   U R                  R                  5       S	   UqS$'   U R*                  R.                  UqS%'   U R*                  R.                  U R6                  -  U R*                  lv        U R*                  =R,                  S
-  sl        U R                  Uq5        S S S 5        U R                  R                  5         U RD                  R                  XR*                  U RH                  5      U l$        U RH                  R                  (       aP  U R                  US S&9  U RD                  R                  U R                   U R*                  U RH                  5      U l$        A:AlAmAnAAqA;[k        5         [        R                  " 5         UR                  S	:  a/  US
-
  U R                  -  S	:X  a  U R                  SS'9  [k        5         A!AAAAAAA4A7A5A6A8A<A=A>AEAF[k        5         GM7     U RD                  R                  XR*                  U RH                  5      U l$        U RH                  R                  (       aQ  U R                  US S&9  U RD                  R                  U R                   U R*                  U RH                  5      U l$        g g ! , (       d  f       GN= f! , (       d  f       GN9= f! , (       d  f       G
N= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GNJ= f)(Nc               3   *   >#     T  S h  vN   M   N7fr?   rO   r   s   rJ   repeat_generator*PPOTrainer.train.<locals>.repeat_generator  s     %%% %s   gHz>        r:   Tmax_new_tokenstemperaturetop_ktop_p	do_samplez===training policy===ru   r   r%   	input_idsgather_deepspeed3_paramsrN   )dimrp   F)mask
shift_mean)axisg      ?r    epszobjective/klzobjective/entropyzobjective/non_score_rewardzobjective/rlhf_rewardzobjective/scoreszpolicy/approxkl_avgzpolicy/clipfrac_avgzloss/policy_avgzloss/value_avgzval/clipfrac_avgzpolicy/entropy_avgz	val/ratiozval/ratio_varzval/num_eos_tokenslrepisode)trial)sampling)rc   r   r   re   rf   rg   rd   r   rv   iterr   response_lengthr   printr   num_ppo_epochsr   rs   r   zerostrainr   global_stepr   r   	max_stepsr   r   r   logging_stepsr   r   
eval_steps
save_stepsr   on_train_beginr   r   r   model_wrappedranger   nextno_gradr   shaper$   ds3_gather_for_generationr(   rB    local_rollout_forward_batch_sizepad_token_idr5   r*   r   r-   rT   r   r6   catr,   r   rC   r0   squeezeappendgccollectanyr   missing_eos_penaltyarangerepeat	unsqueezemasked_fillINVALID_LOGPROBr   expkl_coefclonesizewherer   r"   reversedgammalamstacknprandompermutationr   r   r   
accumulateclampcliprange_valuesquarer   r!   float	cliprangevf_coefbackwardstep	zero_gradnn
functionalsoftmax	logsumexpsummeanr   gather_for_metricsr   varr   get_last_lrepochlogon_step_endr   _save_checkpointon_saver   r   generate_completionson_train_end)srH   rc   r   r   re   
ref_policyrg   rd   rv   r   iter_dataloaderr   
start_timestats_shapeapproxkl_statspg_clipfrac_statspg_loss_statsvf_loss_statsvf_clipfrac_statsentropy_statsratio_statsupdatedataqueriescontext_length	responsespostprocessed_responseslogprobsref_logprobsscoressequence_lengthsvaluesunwrapped_modelquery_responseslogitssiqueryquery_responseresponserT   logprob
ref_output
ref_logitsref_logprobpostprocessed_responsepostprocessed_query_responsesequence_lengthunwrapped_value_model
full_value_valuerP   contain_eos_tokenresponse_idxspadding_masksequence_lengths_p1padding_mask_p1logrklnon_score_rewardrewardsactual_start
actual_end
lastgaelamadvantages_reversed
gen_lengtht
nextvaluesdelta
advantagesreturnsppo_epoch_idxb_indsminibatch_idxmini_batch_startmini_batch_endmini_batch_indsgradient_accumulation_idxmicro_batch_startmicro_batch_endmicro_batch_indsmb_advantagemb_responsesmb_query_responsesmb_logprobs	mb_return	mb_valuesrS   
vpred_tempnew_logprobsvpredvpredclipped
vf_losses1
vf_losses2vf_loss_maxvf_lossvf_clipfraclogprobs_diffratio	pg_losses
pg_losses2pg_loss_maxpg_losslosspg_clipfrac	prob_distentropyapproxklmean_klmean_entropymean_non_score_rewardrlhf_rewardr   metricsr   ss                                                                                                                     @rJ   r  PPOTrainer.train  s   yy&&NN	

^^
((00__
##	& /12,//))D0
 	12YY[
**D,A,A4CcCcd[@!KKCK?K?!KKCK?kk+= "#



#55

&*&9&9D<R<R&R

#)!!A%+/99TZZ5I5IDL^L^5^+_

(+/+=+=

(??&"(,		$**2F2F2X(Y

%(,

%??&"(,		$**2F2F2X(Y

%(,

%,,;;D**dll[ $$!ZZDN!%DAt559:FJJ!doo"55(D{+..v6!(q!1	*,'!#% 0JJ 0 0499KnKn$/?'..==(55)0,OW q'--"2D4Y4YZA#AD,Q,Q(QRE%4QT=b=b9b%cN-a.@AH$QT-R-R)RSF3FHEGM!)!224)0~O_OlOl)mJ 54 &-ZIYIfIf%g
!+!2!21nq6H26M3M!NJ$"2"2T"99J"7
H"MK"JM .6*))51B ..0@0M0Mx2.
 4999eE[=\^_3`0&89OScSpSp9p&qtu&uO,7,D,DU,K,W,W)'1-~?O?\?\^l($J1 'q.1*<r*A'ABJJ2NE",$&BDTDaDacq#KAua $$X.+223IJOOG, ''4$++O<MM%(MM%([ [\ "IIi3	*/))4KQ*O' 99Xq1$yyq9#(99-=q#A 61-61-k:ue_

 %*II.EI^I^IkIk.kqs$t!9900<--.$))2O2OO. !&Y__Q-?	HXHX Y ` `ajapapqrasuv w,/?/I/I!/LL ,,X|_U$00|_]&6&:#"/3F3P3PQR3S"T**6?AF $h."//47dUdhhj1nPT=T$(LL=2#5 *002$||GLLOGNNS"[[)<w||A)NPceuv
z23v=3 &&+G?:JW\]G#//!LG 
&(#&__Q/
!%
"34A56a5G1q5!1SJ#AqDMDJJ,CCfQPQTlRE!&dhh)>)K!KJ'..z:	 5
 #[[)<TrT)BK
$v-*:}E
"..z<K
u !z "'t':':!;..t/D/DE !(-a1F1FHbHb(c$%58R8R%RN&,-=n&MO01--21d6P6PRVRrRr-s)(33E:.?$BbBb.bO/>?PQ`/a,+56F+GL+45E+FL1@AQ1R.*23C*DK(/0@(AI(./?(@I18@RTdTqTq1r.FJ%+]]1nq6H26M3M%NF"d&6&6&==F+@+VL+0+<+< ,l;K.Lo,L %/q.12Dr2I/I$J$R$RSU$VE$)$5$5e_M]=^`a$bE+0;; % )D,@,@ @ )D,@,@ @,L
 */ei6G)HJ).lY6N)OJ*/))J
*KK&)KoVfFgEg,h&hG*5!+j!8 ? ? AOTdDeCe+K -9;,FM$)IIm$<E)5(=I*6UCRVR`R`L`behlhvhvbv9w)wJ*/))Iz*JK&1+M]@^?^&_G#*T\\G-C#CD'006%NN,%//1!&.9%/)%;$B$B$D|TdGeFe/" -2HH,?,?,G,GTV,G,W	*///&b*IEIIV_bhVhnpLq*q+.-2B1H1H1J+Jjr}mMf/f g$/ !2-Pi2i j jqm]Le.e fipm]Le.e f$/ !2-Pi2i j jqiuiuiwm]Le.e fglgqgqgsM=Jc,c d# "1S ;v 2Q61y .tz "Q&M 
FL%"JmUZ\egqs~{IwR[$i?QS^  MW )d "<^ &&)..*!)	q1668(8(<(<Q(?(D(D(F%3fkkmC$**,,		j0HIJ!$*.*:*:*M*Mg*V*[*[*]*b*b*d'/3/?/?/R/RS_/`/e/e/g/l/l/n+,$$778MNSSUZZ\ 45 483C3C3V3VWb3c3h3h3j3o3o3q/0.2.>.>.Q.QRXR]R]R_.`.e.e.g.l.l.n*+151A1A1T1TUc1d1i1i1k1p1p1r-.151A1A1T1TUf1g1l1l1n1s1s1u-.-1-=-=-P-PQ^-_-d-d-f-k-k-m)*,0,<,<,O,OP],^,c,c,e,j,j,l().2.>.>.Q.QRc.d.i.i.k.p.p.r*+040@0@0S0STa0b0g0g0i0n0n0p,-'+'7'7'J'J;'W'\'\'^'c'c'e$+/+;+;+N+N{+[+_+_+a+f+f+h(1:>N>[>[1[0`0`0b0g0g0i,- $ 1 1 = = ? B%)ZZ%7%7	"#'::#5#58N8N#N

 

&&!+&!9 !< ""$00<<T::t||\DL||''%%e4%8#44<<TYY

TXT`T`aG\+@&'ScMJJL**Q.FQJ$B^B^3^bc3c))4)8' !#MY ;^ ,,99$

DLLY<<##!!%t!40088DJJPTP\P\]DL $G * 54? !\ "1S ;:N !s    A%AW/%1AWBAW/1"AW	U4AW/IAX#C"AXAX(Q:AX%W
AWWAW/W
AW,W'AW/W/
AW>	X
AXXAXX
AX"X%
AX4	r  c                    U R                   nU R                  n[        U R                   R                  SSSSS9n[	        [
        5      n[        U R                  U R                  U R                   R                  S9 nU R                   GH  nUS   n[        R                  " 5          UR                  S   n	[        UR                  UUR                  S	   UR                   U5      u  pU
S S 2U	S 24   nUnU R"                  b!  [%        U R"                  UR                   U5      nUS
   R'                  [)        UR+                  USS95      5        US   R'                  [)        UR+                  U5      5      5        [        R,                  " X4S5      n[/        U R0                  XR                   U	5      u  pnUS   R'                  U R                  R3                  U5      R5                  5       R7                  5       R9                  5       5        S S S 5        U(       d  GM    O   S S S 5        [:        R<                  " U5      nU R                  R>                  (       a  [A        5       (       a  [C        URD                  S	S 5        SURF                  ;   a3  S	S K$nURJ                  b"  URL                  " SURN                  " US905        SURF                  ;   a  [Q        SUS9  g g g ! , (       d  f       N= f! , (       d  f       N= f)Ngaz?r   r:   Tr   r   r   r%   r   rc  )skip_special_tokenszmodel responserP      wandbcompletions)	dataframecomet_mlzcompletions.csv)nametable))rc   rd   r   r  r   listr$   re   r   r  r   r   r  r  r(   rB   r  r   r6   extendr   batch_decoder  r0   rg   r?  r3  cpunumpypd	DataFrameis_main_processr   r4   ilocr   r  runrC  Tabler1   )rH   r  rc   rd   r   r  r_  batchrc  rW  rd  ro  re  rj  rk  rP   dfr  s                     rJ   rG  PPOTrainer.generate_completions  s   yy00,9944$
 D!(JJ((499CfCf
--k*]]_%*[[^N(8'..A(55))%N  .a.@AH-5*))51B ..0@0M0Mx2. 'N))%&6&C&CE_c&C&de *+22%&6&C&CDZ&[\ 4999e=\^_3`0",))+GIfIfhv#KAa 'N))$*:*:*M*Me*T*Z*Z*\*`*`*b*h*h*jk5 %8 8? .
F \\% ++ "" U!34$..(99(II}ekkB.GHIT^^+-* , ,A %_
 
s+   <+K5'E'K$K5 K5$
K2.K55
Lc                   > U R                   R                  c*  [        U R                   R                  5      R                  nO(U R                   R                  R                  S5      S   nU R                  US9  [        TU ]!  X5        g )N/rN   )
model_name)	rc   r   r   r   r  splitcreate_model_cardr@   rE  )rH   re   r  r  rI   s       rJ   rE  PPOTrainer._save_checkpoint  sj    99!!)dii22388J//55c:2>J*5 .rL   r  dataset_nametagsc                    U R                  5       (       d  g[        U R                  R                  S5      (       ac  [        R
                  R                  U R                  R                  R                  5      (       d!  U R                  R                  R                  nOSnUc  [        5       nO$[        U[        5      (       a  U1nO[        U5      n[        U R                  R                  S5      (       a  UR                  S5        S[        R                  ;   a  UR                  S5        UR                  U R                  5        [        R                   " S5      n[#        UUU R$                  UU['        5       (       a+  [(        R*                  b  [(        R*                  R,                  OS[/        5       SUS	S
S9nUR1                  [        R
                  R3                  U R4                  R6                  S5      5        g)a  
Creates a draft of a model card using the information available to the `Trainer`.

Args:
    model_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the model.
    dataset_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the dataset used for training.
    tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
        Tags to be associated with the model card.
N_name_or_pathunsloth_versionunslothJOB_IDhf_jobsa          @article{mziegler2019fine-tuning,
            title        = {{Fine-Tuning Language Models from Human Preferences}},
            author       = {Daniel M. Ziegler and Nisan Stiennon and Jeffrey Wu and Tom B. Brown and Alec Radford and Dario Amodei and Paul F. Christiano and Geoffrey Irving},
            year         = 2019,
            eprint       = {arXiv:1909.08593}
        }PPOz2Fine-Tuning Language Models from Human Preferencesz
1909.08593)
base_modelr  r   r  r  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)rz   r   re   r   r   pathisdirr  setr   straddenvironrT  r   textwrapdedentr.   r   r   r  r  urlr/   savejoinrc   r   )rH   r  r  r  r  citation
model_cards          rJ   r  PPOTrainer.create_model_card  sn   " ))++4::$$o66rww}}TZZM^M^MlMl?m?m**88JJ <5Dc""6Dt9D4::$$&788HHYrzz!HHYDOO$ ?? $  )!!**%'9';';		@Ueiimm[_.0%L!

 	TYY%9%9;GHrL   )#r   rc   r   rl   r   r   ri   r   r   r   rj   r   r   r   r   r   r   r   r   re   r   r  r   r   r   rd   r   rf   rg   r   r   r   rh   r   rC   )NN)NNNN)NF)F)NNN))rW   rX   rY   rZ   __doc__r   r&   r   r   r   r   r   r   r9  Moduler   r   dictr  tupler   optim	Optimizerr   LambdaLRr  r   rA   r   r   r   r   r   boolr   r  rG  rE  r  r[   r\   r]   s   @rJ   r_   r_   d   s   "H J <@EIVb59.2!PNPN #)+=?UWeef
PN yyPN BII&PN iiPN PN YYPN   78PN uWd3<.@%@ABPN %++//1I1I1R1RRSPN D12PN  l+!PN" 
#PNdj $Z $ T T.Xc] .4 . .R^h
>T >B/ %)&*,0	@ISM@I sm@I CcD()	@I @IrL   r_   )Yr  r   r   r  r   collectionsr   
contextlibr   r   pathlibr   typingr   r   r  r,  pandasr  r   torch.nnr9  
accelerater	   accelerate.utilsr
   r   datasetsr   torch.utils.datar   transformersr   r   r   r   r   r   r   r   r   r   transformers.integrationsr   transformers.trainerr   r   transformers.trainer_callbackr   r   r   transformers.utilsr   r   corer!   r"   modelsr#   models.utilsr$   
ppo_configr&   utilsr'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   peftr7   r8   r9   r  r"  r  r<   r_   rO   rL   rJ   <module>r     s    
  	   # 2  "     " 5  '   J M [ [ C - + 6 !    ( :: 
-BII -yI yIrL   