
    hk                        S SK r S SKrS SKJrJrJrJr  S SKrS SKrS SK	J
r
  S SKJ
s  Jr  S SKJrJr  S SKJrJrJrJrJrJrJrJr  S SKJr  S SKJr  S SKJr  SS	K J!r!J"r"  SS
K#J$r$  SSK%J&r&  SSK'J(r(  SSK)J*r*J+r+J,r,J-r-J.r.J/r/J0r0  SSK1J2r2  \" 5       (       a  S SK3J4r4  \" 5       (       a  S SK5r5\" 5       (       a  S SK6J7r7   " S S\(5      r8g)    N)AnyCallableOptionalUnion)DatasetIterableDataset)BaseImageProcessorFeatureExtractionMixinPreTrainedModelPreTrainedTokenizerBaseProcessorMixinTrainerCallbackis_apex_availableis_wandb_available)EvalPrediction)OptimizerNames)is_peft_available   )is_conversationalmaybe_apply_chat_template)unwrap_model_for_generation   )BasePairwiseJudge)OnlineDPOTrainer)SIMPLE_CHAT_TEMPLATEempty_cachegenerate_model_cardget_comet_experiment_url
get_rewardselective_log_softmaxtruncate_right)	XPOConfig)amp)	PeftModelc            $       X  ^  \ rS rSrSrSS/r                S)S\\\R                  4   S\\\R                  4   S	\
\R                     S
\
\   S\
\   S\
\   S\
\\\4      S\
\\\\\4   4      S\
\\\\\4      S\
\\\\   4      S\
\   S\
\\/\4      S\
\\      S\\R6                  R8                  \R6                  R:                  R<                  4   S\
\\R>                  \R>                  /\R>                  4      S\
\\\R                  4      SS4"U 4S jjjr \!S 5       r"S r#S r$S r%S r&S r'S r(  SS  jr) S*S\R                  S!\\\\R>                  \*4   4   S"\
\+   S\R>                  4S# jjr,   S+S$\
\   S%\
\   S&\\\\   S4   4S' jjr-S(r.U =r/$ ),
XPOTrainerB   aK  
Initialize XPOTrainer as a subclass of [`OnlineDPOConfig`].

Args:
    model (`transformers.PreTrainedModel`):
        The model to train, preferably an `AutoModelForCausalLM`.
    ref_model (`PreTrainedModelWrapper`):
        Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation
        and loss. If no reference model is provided, the trainer will create a reference model with the same
        architecture as the model to be optimized.
    reward_funcs (`transformers.PreTrainedModel`):
        The reward model to score completions with, preferably an `AutoModelForSequenceClassification`.
    judge (`BasePairwiseJudge`):
        The judge to use for pairwise comparison of model completions.
    args (`XPOConfig`):
        The XPO config arguments to use for training.
    data_collator (`transformers.DataCollator`):
        The data collator to use for training. If None is specified, the default data collator
        (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the
        sequences in the batch, given a dataset of paired sequences.
    train_dataset (`datasets.Dataset`):
        The dataset to use for training.
    eval_dataset (`datasets.Dataset`):
        The dataset to use for evaluation.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`):
        Processing class used to process the data. If provided, will be used to automatically process the inputs
        for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
        reuse the fine-tuned model.
    peft_config (`dict`):
        The peft config to use for training.
    compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
        The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
        metric values.
    callbacks (`list[transformers.TrainerCallback]`):
        The callbacks to use for training.
    optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
        The optimizer and scheduler to use for training.
    preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
        The function to use to preprocess the logits before computing the metrics.

.. deprecated:: 0.22.0
    The following parameters are deprecated and will be removed in a future version:

    * `reward_model`: Use `reward_funcs` instead. For example, change `reward_model=model` to `reward_funcs=model`.
    * `reward_processing_class`: Use `reward_processing_classes` instead. For example, change
      `reward_processing_class=tokenizer` to `reward_processing_classes=tokenizer`.
trlxpoNNNmodel	ref_modelreward_funcsjudgeargsdata_collatortrain_dataseteval_datasetprocessing_classreward_processing_classespeft_configcompute_metrics	callbacks
optimizerspreprocess_logits_for_metricsreward_modelreturnc                   > [         TU ]  UUUUUUUUUU	U
UUUUUS9  U R                  R                  U l        / / / / / / / / / / / / / / S.U l        U R                  bf  [        U R                  5      S:w  a  [        S5      eU R                  S   U l        / U R
                  S'   / U R
                  S'   / U R
                  S'   g g )	N)r+   r,   r.   r-   r:   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   )loss/dpoloss/xpoobjective/klobjective/entropyrewards/chosenrewards/rejectedrewards/accuraciesrewards/marginslogps/chosenlogps/rejectedval/model_contain_eos_tokenval/ref_contain_eos_tokenalphabetar   z3XPOTrainer only supports one reward function/model.r   objective/model_scoresobjective/ref_scoresobjective/scores_margin)	super__init__r/   rI   _alphastatsr-   len
ValueError)selfr+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   	__class__s                    Q/home/james-whalen/.local/lib/python3.13/site-packages/trl/trainer/xpo_trainer.pyrO   XPOTrainer.__init__u   s    , 	%%''%-&?#+!*G! 	 	
& iioo !#  ""$! +-)+#

& (4$$%* !VWW $ 1 1! 4D35DJJ/013DJJ-.46DJJ01 )    c                     [        U R                  [        5      (       aM  U R                  R                  nU[        U R                  5      :  a  U R                  U   $ U R                  S   $ U R                  $ )N)
isinstancerP   liststateepochrR   )rT   r^   s     rV   rI   XPOTrainer.alpha   sY    dkk4((JJ$$E).T[[1A)A4;;u%Vt{{SUV;;rX   c                 Z   [        X R                  5       nUR                  US   US   U R                  S9nS S S 5        U R                  cS  U R                  R                  U5      n[        5       (       a&  [        U[        5      (       a  UR                  5       nO(UnO%U R                  R                  U R                  5      n[        X`R                  5       nUR                  US   US   U R                  S9nS S S 5        WU4$ ! , (       d  f       N= f! , (       d  f       WW4$ = f)N	input_idsattention_mask)ra   rb   generation_config)
r   acceleratorgeneraterc   r,   unwrap_modelr   r[   r$   get_base_model)	rT   promptsr+   unwrapped_policy_model_for_genmodel_output"unwrapped_main_model_for_ref_logicactual_model_for_ref_generationfinal_ref_model_for_gen
ref_outputs	            rV   _generate_completions XPOTrainer._generate_completions   s(   (0@0@AEc9BB!+.&'78"&"8"8 C L B >>!151A1A1N1Nu1U. ""z2TV_'`'`2T2c2c2e/2T/.2.>.>.K.KDNN.[+()HJZJZ[_v099!+.&'78"&"8"8 : J \ Z''3 BA$ \[ Z''s   "D"D
D
D*c                    US   R                   S   nUS S 2US 24   n[        XPR                  R                  U R                  R                  5      u  pV[
        R                  " US   U4SS9[
        R                  " US   U4SS9US   S.nUS S 2US 24   n[        XR                  R                  U R                  R                  5      u  p[
        R                  " US   U4SS9[
        R                  " US   U	4SS9US   S.n
Xz4$ )Nra   r   dimrb   rawra   rb   rt   )shaper!   r3   eos_token_idpad_token_idtorchcat)rT   rj   rn   rh   context_lengthmodel_completion_idsmodel_completion_mask
model_dataref_completion_idsref_completion_maskref_datas              rV   _process_completionsXPOTrainer._process_completions   s*    -33A6  ,A~,>?6D "7"7"D"DdF[F[FhFh7
3 GK$8:N#OUVW#ii1A)BDY(Z`ab5>

 (>?(:;2@ 5 5 B BDDYDYDfDf3
/ GK$8:L#MSTU#ii1A)BDW(X^_`5>
 ##rX   c                    [         R                  " 5          [        U R                  US   U R                  R
                  U5      u  pEn[        U R                  US   U R                  R
                  U5      u  pFnS S S 5        U R                  R                  b  [         R                  " US   U R                  R                  :H  SS9n[         R                  " US   U R                  R                  :H  SS9nWU) ==   U R                  R                  -  ss'   WU) ==   U R                  R                  -  ss'   WW4$ ! , (       d  f       N= f)Nra   rZ   rr   )
ry   no_gradr   r-   r3   rx   r/   missing_eos_penaltyanyrw   )	rT   r~   r   r{   _model_scores
ref_scoresmodel_contain_eosref_contain_eoss	            rV   _compute_rewardsXPOTrainer._compute_rewards   s!   ]]_!+!!:k#:D<Q<Q<^<^`n"AQ  *!!8K#8$:O:O:\:\^l A1	  99((4 %		*[*ATEZEZEgEg*gmo p#ii(=AVAVAcAc(ciklO++,		0M0MM,'(DII,I,II(Z'' _s   A%E  
Ec           	         US   nU R                   R                  US   S S 2US 24   SS9nU Vs/ s H  ofR                  5       PM     nnU R                   R                  US   S S 2US 24   SS9nU Vs/ s H  ofR                  5       PM     nn[        SUS   05      (       a  U Vs/ s H	  nSUS./PM     nn[        R
                  " 5       nUR                  [        5      n	U V
s/ s H  oR                  U
S	9PM     nn
U Vs/ s H  oiR                  US	9PM     nnU Vs/ s H	  nSUS./PM     nnU Vs/ s H  oiR                  US	9PM     nnU R                  R                  U[        [        XW5      5      5      n[        R                  " U Vs/ s H  oS:H  PM	     snUS   R                  S
9$ s  snf s  snf s  snf s  sn
f s  snf s  snf s  snf s  snf )Nrt   ra   T)skip_special_tokenspromptr   	assistant)rolecontent)messages)device)r3   batch_decodestripr   jinja2Environmentfrom_stringr   renderr.   r\   zipry   tensorr   )rT   r~   r   r{   rh   model_data_completions
completionref_data_completionsenvironmenttemplatemessageranks_of_first_completionranks                rV   _compute_judgeXPOTrainer._compute_judge  s   U#!%!6!6!C!C{#A~$67T "D "
 H^!^G]"2"2"4G]!^#44AA[!!^_"454  B  
 FZZEYz 0 0 2EYZh
344Qg&Qg:+*=>Qg # & !,,.K"../CDHHOPW8GP]s%t]szoozo&J]s"%t Rf$Qe:+*=>Qe ! $ \p#p[oZOOZO$H[o #p$(JJ$4$4+BC%
! ||3LM3L4QY3LMV`alVmVtVtuu9 "_
  [&
 Q%t$ $q Ns/   F85F=)G)GG%G;GGc                 F  ^ U4S jnU" X5      nU" X5      n[         R                  " 5          U R                  c*  UR                  5          U" X5      nU" X5      n	S S S 5        O&U" U R                  U5      nU" U R                  U5      n	S S S 5        US   S S 2TS 24   S:H  n
US   S S 2TS 24   S:H  nUR	                  U
S5      nUR	                  US5      nW	R	                  US5      n	WR	                  U
S5      nXgX4$ ! , (       d  f       N= f! , (       d  f       N= f)Nc                    > U " US   US   S9nUR                   S S 2TS-
  S24   n[        X1S   S S 2TS 24   5      nU$ )Nra   rb   )rb   r   rZ   )logitsr    )mdataoutputr   token_logprobsr{   s        rV   compute_logprobs_for_data?XPOTrainer._compute_logprobs.<locals>.compute_logprobs_for_data1  sZ    tK(>N9OPF]]1nq&82&=#=>F26;LQP^P_M_;`aN!!rX   rb   r   g        )ry   r   r,   disable_adaptermasked_fill)rT   r+   r~   r   r{   r   model_logprobs_model_datamodel_logprobs_ref_dataref_logprobs_model_dataref_logprobs_ref_datamodel_padding_maskref_padding_masks       `       rV   _compute_logprobsXPOTrainer._compute_logprobs0  s9   	" %>e$P!";E"L ]]_~~%**,.G.Z+,Ee,V) -, +DDNNT^*_'(A$..RZ([%  ((89!^_:LMQRR#$45a6HIQN$=$I$IJ\^a$b!"9"E"EFVX["\ 5 A ABRTW X"9"E"EFXZ]"^(CXqq -, _s#   DD/D
D	D
D c                    UR                  S5      nUR                  S5      nUR                  S5      nUR                  S5      n	[        R                  " XVU5      n
[        R                  " XYU5      nX-
  n[        R                  " U) Xg5      n[        R                  " U) X5      nX-
  nX-
  nU R                  R                  S:X  a%  [
        R                  " U R                  U-  5      * nOUU R                  R                  S:X  a  USSU R                  -  -  -
  S-  nO"[        SU R                  R                   35      eU R                  U-  nUU-   R                  5       nUUU4$ )Nr   sigmoidipor   zinvalid loss type )sumry   wherer/   	loss_typeF
logsigmoidrJ   NotImplementedErrorrI   mean)rT   r   r   r   r   chosen_maskmodel_logprobs_model_data_summodel_logprobs_ref_data_sumref_logprobs_ref_data_sumref_logprobs_model_data_sumchosen_model_logprobschosen_ref_logprobschosen_log_ratiosrejected_model_logprobsrejected_ref_logprobsrejected_log_ratiosr   
dpo_losses
xpo_losseslosss                       rV   _compute_lossesXPOTrainer._compute_lossesP  s\    )B(E(Ea(H%&=&A&A!&D#$9$=$=a$@!&=&A&A!&D# %KXs t#kk+Tmn1G"'++{l<Y"w %[L:U q5M #899)+,,tyy6'9::JYY  E) 1DII#661<J%(:499;N;N:O&PQQ ZZ"==
 Z'--/Z++rX   c                 	  ^  U 4S jnT R                   S   R                  U" U5      5        T R                   S   R                  U" U	5      5        T R                  bn  T R                   S   R                  U" U5      5        T R                   S   R                  U" U5      5        T R                   S   R                  U" X-
  5      5        UR                  S5      nUR                  S5      nUR                  S5      nUR                  S5      n[        R
                  " X~U5      n[        R
                  " UUU5      nUU-
  n[        R
                  " U) X5      n[        R
                  " U) UU5      nUU-
  nT R                   S   R                  U" UR                  5       UR                  5       -   5      5        T R                   S	   R                  U" UR                  5       UR                  5       -   5      5        UT R                  -  nUT R                  -  nT R                   S
   R                  U" UR                  5       5      5        T R                   S   R                  U" UR                  5       5      5        X6-
  nXE-
  nUR                  S5      UR                  S5      -   R                  5       S-  nT R                   S   R                  U" U5      5        UR                  S5      * nUR                  S5      * nUR                  5       UR                  5       -   S-  nT R                   S   R                  U" U5      5        UU-
  n T R                   S   R                  U" U R                  5       5      5        U S:  R                  5       n!T R                   S   R                  U" U!R                  5       5      5        US   S S 2U
S 24   T R                  R                  :H  R                  SS9n"US   S S 2U
S 24   T R                  R                  :H  R                  SS9n#T R                   S   R                  U" U"R                  5       5      5        T R                   S   R                  U" U#R                  5       5      5        T R                   S   R                  T R                  5        T R                   S   R                  T R                  5        g )Nc                 r   > TR                   R                  U 5      R                  5       R                  5       $ N)rd   gather_for_metricsr   item)r   rT   s    rV   gather_mean/XPOTrainer._log_statistics.<locals>.gather_mean  s,    ##66v>CCEJJLLrX   r=   r>   rK   rL   rM   r   rE   rF   rA   rB   r   r?   r@   rD   r   rC   ra   rr   rG   rH   rI   rJ   )rQ   appendr-   r   ry   r   r   rJ   floatr3   rw   r   rI   )$rT   r~   r   r   r   r   r   r   r   r   r{   r   r   r   r   r   r   r   r   r   r   r   r   r   chosen_rewardsrejected_rewardskl_model_datakl_ref_datamean_klentropy_model_dataentropy_ref_datamean_entropymarginaccuracy	model_eosref_eoss$   `                                   rV   _log_statisticsXPOTrainer._log_statisticsx  sJ    	M 	

:%%k*&=>

:%%k*&=> (JJ/077L8QRJJ-.55k*6MNJJ0188\E^9_` )B(E(Ea(H%&=&A&A!&D#$9$=$=a$@!&=&A&A!&D# %KXs t#kk+7RTmn14GG"'++{l<Y"w %[L:UWp q58MM

>"))+6K6P6P6RUhUmUmUo6o*pq

#$++K8O8T8T8VYnYsYsYu8u,vw +TYY6.:

#$++K8K8K8M,NO

%&--k:J:O:O:Q.RS 2K-E $$Q'+//!*<<BBDqH

>"))+g*>? 8;;A>>377::*//14D4I4I4KKqP

&'..{</HI  "22

$%,,[-GH QJ%%'

'(//HMMO0LM  ,Q-?@DDYDYDfDffkkpqkr	K(NO);<@U@U@b@bbgglmgn

0188Y__EV9WX

./66{7==?7ST 	

7""4::.

6!!$)),rX   inputsnum_items_in_batchc                 >   UR                  5         [        [        [        UR	                  5       5      5      5      nUS   n[        U5       VVVs/ s H*  obR                  5        VVs0 s H
  u  pxXxU   _M     snnPM,     nnnnU V	s/ s H  n	[        XR                  5      PM     nn	U V	s/ s H<  oR                  XR                  R                  R                  U R                  5      PM>     nn	U R                  U5      nU R                  U5      nUS   R                  S   n
US   US   US.nAU R!                  XQ5      u  pU R#                  XU5      u  pU R$                  b  U R'                  XU
5      u  nnUU:  nOSu  nnU R)                  XU
5      nU R+                  XX5      u  nnnnU R-                  UUUUU5      u  nnnU R/                  UUUR1                  5       UR1                  5       UUUUR1                  5       UR1                  5       U
UU5        U R2                  R4                  b;  U R6                  R8                  U R2                  R4                  -  S:X  a
  [;        5         0 nU R2                  R<                  [>        R@                  [>        RB                  4;   a  U RE                  5       US'   U R2                  RF                  S:  a  URI                  5       nU RJ                  (       a;  [L        RN                  " UU RP                  5       nURS                  5         S S S 5        OU RT                  RR                  " U40 UD6  UR1                  5       U R2                  RV                  -  $ s  snnf s  snnnf s  sn	f s  sn	f ! , (       d  f       NL= f)	Nr   prompt_input_idsr   prompt_attention_maskru   r*   r   learning_rate),trainrR   nextitervaluesrangeitemsr   r3   tokenize_rowr+   configis_encoder_decoderr0   _prepare_inputsrv   ro   r   r-   r   r   r   r   r   detachr/   torch_empty_cache_stepsr]   global_stepr   optimr   LOMOADALOMO_get_learning_raten_gpur   use_apexr#   
scale_loss	optimizerbackwardrd   gradient_accumulation_steps)rT   r+   r   r   
batch_sizerh   ikvxr{   rj   rn   r~   r   r   r   r   r   r   r   r   r   r   r   kwargsscaled_losss                              rV   training_stepXPOTrainer.training_step  sA    	 d6==?345
"@Ej@QR@Q1||~6~tq1d7~6@QROUVv!+A/D/DEvVmstmshi##Azz'8'8'K'KTMbMbcmst##F+ %%f- 2399!< 23$%<=

  $(#=#=g#M   $88SZ[
 ('+'<'<ZSa'b$L*&*4K'1$L*--jNSK ""5hO 	k!#:<QSj
 (,';';%#!#(
$j* 	%,,.#**,!#	
  II--9

&&)J)JJaOM99??~22N4J4JKK&*&=&=&?F?#99??Q99;D==dnn5$$& 65 %%d5f5{{}tyyDDDD_ 7RVtP 65s1   M=(M79M=	N.AN	N7M=
N
model_namedataset_nametagsc                    U R                  5       (       d  g[        U R                  R                  S5      (       ac  [        R
                  R                  U R                  R                  R                  5      (       d!  U R                  R                  R                  nOSnUc  [        5       nO$[        U[        5      (       a  U1nO[        U5      n[        U R                  R                  S5      (       a  UR                  S5        S[        R                  ;   a  UR                  S5        UR                  U R                  5        [        R                   " S5      n[#        UUU R$                  UU['        5       (       a+  [(        R*                  b  [(        R*                  R,                  OS[/        5       SUS	S
S9nUR1                  [        R
                  R3                  U R4                  R6                  S5      5        g)a  
Creates a draft of a model card using the information available to the `Trainer`.

Args:
    model_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the model.
    dataset_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the dataset used for training.
    tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
        Tags to be associated with the model card.
N_name_or_pathunsloth_versionunslothJOB_IDhf_jobsa          @article{jung2024binary,
            title        = {{Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF}},
            author       = {Tengyang Xie and Dylan J. Foster and Akshay Krishnamurthy and Corby Rosset and Ahmed Awadallah and Alexander Rakhlin},
            year         = 2024,
            eprint       = {arXiv:2405.21046}
        }XPOzcExploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHFz
2405.21046)
base_modelr  hub_model_idr  r  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zerohasattrr+   r   ospathisdirr  setr[   straddenvironupdate
_tag_namestextwrapdedentr   r  r   wandbrunurlr   savejoinr/   
output_dir)rT   r  r  r  r  citation
model_cards          rV   create_model_cardXPOTrainer.create_model_card$  sn   " ))++4::$$o66rww}}TZZM^M^MlMl?m?m**88JJ <5Dc""6Dt9D4::$$&788HHYrzz!HHYDOO$ ?? $  )!!**%'9';';		@Ueiimm[_.0%}!

 	TYY%9%9;GHrX   )rP   r-   rQ   )NNNNNNNNNNNNNr*   NNr   )NNN)0__name__
__module____qualname____firstlineno____doc__r/  r   r   nnModuler   r   r"   r   r   r   dictr+  r   r	   r
   r   r\   r   r   tuplery   r   	Optimizerlr_schedulerLambdaLRTensorrO   propertyrI   ro   r   r   r   r   r   r   r   intr  r:  __static_attributes____classcell__)rU   s   @rV   r&   r&   B   s
   .` J 487;,0-1$(,0CGEI mq&*FJ59VbhlDH)E7_bii/0E7 "))34E7 ryy)	E7
 )*E7 y!E7  )E7  g&> ?@E7 uWd3<.@%@ABE7 #)+=?UWeef
E7 $,E2I4PgKh2h,i#jE7 d^E7 "(N+;T+A"BCE7  D12!E7" %++//1I1I1R1RRS#E7$ (0%,,9UW\WcWc9c0d'e%E7( u_bii%?@A)E7* 
+E7 E7N  (8$6($!vFr@&,h Q-h rvWEYYWE(,S%c8I2J-J(KWEaijmanWE	WEv %)&*,0	@ISM@I sm@I CcD()	@I @IrX   r&   )9r'  r0  typingr   r   r   r   r   ry   torch.nnrA  torch.nn.functional
functionalr   datasetsr   r   transformersr	   r
   r   r   r   r   r   r   transformers.trainer_utilsr   transformers.training_argsr   transformers.utilsr   
data_utilsr   r   models.utilsr   judgesr   online_dpo_trainerr   utilsr   r   r   r   r   r    r!   
xpo_configr"   apexr#   r2  peftr$   r&    rX   rV   <module>r_     s    
  1 1      -	 	 	 6 5 0 E 6 % 0   "   bI! bIrX   