
    hG              	          S SK r S SKJr  S SKJrJr  S SKJr  S SKJ	r	J
r
JrJr  S SKrS SKrS SKJr  S SKJrJr  S SKJr  S SKJr  S S	KJrJrJrJrJrJrJ r J!r!  S S
K"J#r#  S SK$J%r%  S SK&J'r'  S SK(J)r)J*r*  SSK+J,r,  SSK-J.r.  SSK/J0r0  SSK1J2r2J3r3J4r4J5r5J6r6J7r7J8r8J9r9  \)" 5       (       a  S SK:J;r;  \!" 5       (       a  S SK<r<\Rz                  " \>5      r?S\@\A\B\	   4   SSS\@\A\B\	   4   4S jrC " S S\ 5      rDg)    N)defaultdict)FrozenInstanceErrorreplace)Path)AnyCallableOptionalUnion)PartialStatelogging)gather_object)Dataset)BaseImageProcessorDataCollatorFeatureExtractionMixinPreTrainedModelPreTrainedTokenizerBaseProcessorMixinTraineris_wandb_available)TrainerCallback)nested_detach)EvalPrediction)is_peft_availableis_rich_available   )maybe_apply_chat_template)prepare_peft_model   )RewardConfig)RewardDataCollatorWithPaddingcompute_accuracydecode_and_strip_paddingdisable_dropout_in_modelgenerate_model_cardget_comet_experiment_urllog_table_to_comet_experimentprint_rich_table)	PeftModelbatch	tokenizerr   returnc                 "   / / / / S.n[        U S   U S   5       Hq  u  p4U" U5      nU" U5      nUS   R                  US   5        US   R                  US   5        US   R                  US   5        US	   R                  US   5        Ms     U$ )
z1Tokenize a batch from a reward modelling dataset.)input_ids_chosenattention_mask_choseninput_ids_rejectedattention_mask_rejectedchosenrejectedr.   	input_idsr/   attention_maskr0   r1   )zipappend)r*   r+   new_examplesr2   r3   tokenized_chosentokenized_rejecteds          T/home/james-whalen/.local/lib/python3.13/site-packages/trl/trainer/reward_trainer.py	_tokenizer<   C   s     !# #%	L  hz1BC$V,&x0'(//0@0MN,-445EFV5WX)*112D[2QR./667IJZ7[\ D     c                     ^  \ rS rSrSrSS/r            S"S\\\\	R                  4      S\\   S\\   S	\\   S
\\\\\\4   4      S\\\\\\4      S\\/ \4      S\\\/\4      S\\\      S\\R4                  R6                  \R4                  R8                  R:                  4   S\\\R<                  \R<                  /\R<                  4      S\\   4U 4S jjjr  S#S\\\	R                  4   S\\\\R<                  \ 4   4   S\\R<                  \\R<                  \\\R<                  4   4   4   4S jjr! S$S\\\	R                  4   S\\\\R<                  \ 4   4   S\"S\\\      S\\\R<                     \\R<                     \\R<                     4   4
S jjr#U 4S jr$S\%4S jr&U 4S jr'   S%S\\   S\\   S\\\\   S4   4S  jjr(S!r)U =r*$ )&RewardTrainerV   a
  
Trainer for custom reward.

Args:
    model ([`~transformers.PreTrainedModel`] or `torch.nn.Module`, *optional*):
        Model to be trained, preferably an [`~transformers.AutoModelForSequenceClassification`].
    args ([`RewardConfig`], *optional*):
        Training arguments.
    data_collator ([`~transformers.DataCollator`], *optional*):
        The data collator to use for training. If None is specified, the default data collator
        [`~trainer.utils.RewardDataCollatorWithPadding`] will be used which will pad the sequences to the maximum
        length of the sequences in the batch, given a dataset of paired sequences.
    train_dataset ([`~datasets.Dataset`], *optional*):
        The dataset to use for training.
    eval_dataset ([`~datasets.Dataset`], *optional*):
        The dataset to use for evaluation.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
        Processing class used to process the data. If provided, will be used to automatically process the inputs
        for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
        reuse the fine-tuned model.
    model_init (`Callable[[], transformers.PreTrainedModel]`, *optional*):
        The model initializer to use for training. If None is specified, the default model initializer will be
        used.
    compute_metrics (`Callable[[transformers.EvalPrediction], dict]`, *optional*, defaults to [`~trainer.utils.compute_accuracy`]):
        Function to compute metrics at evaluation. Must take in an [`~transformers.EvalPrediction`] and return a
        dictionary string to float.
    callbacks (`list` of [`~transformers.TrainerCallback`], *optional*):
        Callbacks to use during training.
    optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`):
        Tuple containing the optimizer and the learning rate scheduler to use for training.
    preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*):
        Function to preprocess the logits before computing the metrics. Must take in the `logits` and `labels` and
        return the logits to be used for metrics computation.
    peft_config (`dict`, *optional*):
        PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the `model` will be
        wrapped with the specified PEFT adapter.
trlzreward-trainerNmodelargsdata_collatortrain_dataseteval_datasetprocessing_class
model_initcompute_metrics	callbacks
optimizerspreprocess_logits_for_metricspeft_configc                 h  >^ Uc$  [        5       (       a!  [        U[        5      (       a  [        XU5      nUR                  (       a  [        U5        Uc  [        nUc[  Uc  [        S5      eUR                  m[        U5      nUR                  (       a   SUl
        [        R                  S5        SU l        OSU l        SUR                   S'   SUR"                  ;  a  [%        5       R'                  5          SU0nUR)                  [*        SU0S	9nUR)                  [,        SUUR.                  S
9nUR1                  U4S jUR.                  S9nUbT  UR)                  [*        SU0S	9nUR)                  [,        USUR.                  S9nUR1                  U4S jUR.                  S9nS S S 5        [2        TU ]i  UUUUUUUUU	U
US9  [7        U R8                  S5      (       a&  U R8                  R;                  U R<                  5        g g ! [         a    [        USS9n GNf = f! , (       d  f       N= f)NzYA processing_class must be specified when using the default RewardDataCollatorWithPaddingF)remove_unused_columnszWhen using RewardDataCollatorWithPadding, you should set `remove_unused_columns=False` in your RewardConfig we have set it for you, but you should do it yourself in the future.Testimate_tokensr.   r+   )	fn_kwargs)batchedrQ   num_procc                 V   > [        U S   5      T:*  =(       a    [        U S   5      T:*  $ Nr.   r0   lenx
max_lengths    r;   <lambda>(RewardTrainer.__init__.<locals>.<lambda>   s.    c!$6"78JFu3qQeOfKgkuKuur=   )rS   )rQ   rR   rS   c                 V   > [        U S   5      T:*  =(       a    [        U S   5      T:*  $ rU   rV   rX   s    r;   r[   r\      s4    #a(:&;"<
"J #G"6 78JF#Gr=   )rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   add_model_tags)r   
isinstancer)   r   disable_dropoutr$   r"   
ValueErrorrZ   r!   rO   r   r   loggerwarninguse_reward_data_collatorwarnings_issuedcolumn_namesr   main_process_firstmapr   r<   dataset_num_procfiltersuper__init__hasattrrB   r^   
_tag_names)selfrB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rQ   rZ   	__class__s                 @r;   rl   RewardTrainer.__init__   se   ( "'8':':z%QZ?[?[&u4@E $U+".O ' o  J9:JKM))F16D. \
 -1D),1D) 48/0]%?%??224(*:;	 - 1 12KXceuWv 1 w - 1 1 '!22	 !2 ! !. 4 4u!22 !5 !  +#/#3#31kK[=\ $4 $L $0#3#3!"+ $!%!6!6	 $4 $L $0#6#6G!%!6!6 $7 $L; 5F 	''%-!+!*G 	 	
 4::/00JJ%%doo6 1Q + F"4uEDF, 54s   H 7B0H#H H #
H1inputsr,   c                    U" US   US   SS9S   nU" US   US   SS9S   nSU;   a7  [         R                  R                  XV-
  US   -
  5      R                  5       * nO0[         R                  R                  XV-
  5      R                  5       * nU R                  R
                  b4  XpR                  R
                  [        R                  " XV-   S	-  5      -  -  nU(       a  UUUS
.4$ U$ )Nr.   r/   T)r4   r5   return_dictlogitsr0   r1   marginr   )rewards_chosenrewards_rejected)nn
functional
logsigmoidmeanrC   center_rewards_coefficienttorch)ro   rB   rr   return_outputsnum_items_in_batchrw   rx   losss           r;   compute_lossRewardTrainer.compute_loss   s    /0!"9:
 	
 !12!";<
 	 vMM,,^-NQWX`Qa-abggiiDMM,,^-NOTTVVD99//;II885::~GhmnFn;oooD"0$4   r=   prediction_loss_onlyignore_keysc                   ^ U R                  U5      nTc?  [        U R                  S5      (       a"  [        U R                  R                  S/ 5      mO/ m[
        R                  " 5          U R                  XSS9u  pVS S S 5        U(       a  WS S 4$ WR                  5       n[        U4S jWR                  5        5       5      n[        U5      n[
        R                  " U5      R                  SS9R                  SS9R                  n[
        R                   " UR"                  S   5      nU R                  U5      nXWU4$ ! , (       d  f       N= f)	Nconfigkeys_to_ignore_at_inferenceT)r   c              3   <   >#    U  H  u  pUT;  d  M  Uv   M     g 7fN ).0kvr   s      r;   	<genexpr>0RewardTrainer.prediction_step.<locals>.<genexpr>+  s     Q%8TQA[<Pqq%8s   	r   )dimr   )_prepare_inputsrm   rB   getattrr   r~   no_gradr   detachtupleitemsr   stackr|   softmaxTzerosshape)	ro   rB   rr   r   r   r   logits_dictru   labelss	       `    r;   prediction_stepRewardTrainer.prediction_step  s    %%f-tzz8,,%djj&7&79VXZ[ ]]_ $ 1 1%PT 1 UD   $%%{{}Q[%6%6%8QQv& V$))a)088Q8?AAV\\!_-%%f-V##! _s   *E
Ec                 j   > UR                  SS5      nU R                  U5        [        TU ]  " U0 UD6$ )Nnum_print_samples   )popvisualize_samplesrk   evaluate)ro   rC   kwargsr   rp   s       r;   r   RewardTrainer.evaluate6  s9    "JJ':A>01w000r=   r   c                     U R                  5       n[        [        5      n[        U5       GH  u  pEU R	                  U R
                  USS9u  pFn[        US   U R                  5      n[        US   U R                  5      nUS   R                  [        U5      5        US   R                  [        U5      5        US   R                  [        UR                  5        V	V
s/ s H  o V
s/ s H  n
[        U
S5      PM     sn
PM!     sn
n	5      5        US	:  d  M  [        US   5      U:  d  GM    O   [        R                  " U5      nU R                  R                   S	:X  a  [#        5       (       a  [%        US
U 5        SU R&                  R(                  ;   a3  S	S
KnUR,                  b"  UR.                  " SUR0                  " US905        SU R&                  R(                  ;   a  [3        SUS9  g
g
g
s  sn
f s  sn
n	f )z
Visualize the reward model logits prediction

Args:
    num_print_samples (`int`, defaults to `4`):
        The number of samples to print. Set to `-1` to print all samples.
F)r   r.   r0   chosen_textrejected_textru   r   r   Nwandbcompletions)	dataframecomet_mlzcompletions.csv)nametable)get_eval_dataloaderr   list	enumerater   rB   r#   rG   extendr   tolistroundrW   pd	DataFrameacceleratorprocess_indexr   r(   rC   	report_tor   runlogTabler'   )ro   r   eval_dataloaderr   _rr   ru   r   r   item
inner_itemdfr   s                r;   r   RewardTrainer.visualize_samples;  s    224D!"?3IA//

FY^/_LAq26:L3MtOdOdeK4V<P5QSWShShiM- ''k(BC/"))-*FG(O""Y_YfYfYhiYhQUtLtj! 4tLYhij !A%#eM.B*CGX*X 4 \\% ))Q. "" $6%6!78$))---99(II}ekkB.GHITYY000-* 1 /  Mis   !	G:*G5 G:5G:c                   > U R                   R                  c*  [        U R                   R                  5      R                  nO(U R                   R                  R                  S5      S   nU R                  US9  [        TU ]!  X5        g )N/)
model_name)	rC   hub_model_idr   
output_dirr   splitcreate_model_cardrk   _save_checkpoint)ro   rB   trialr   rp   s       r;   r   RewardTrainer._save_checkpointa  sj    99!!)dii22388J//55c:2>J*5 .r=   r   dataset_nametagsc                    U R                  5       (       d  g[        U R                  R                  S5      (       ac  [        R
                  R                  U R                  R                  R                  5      (       d!  U R                  R                  R                  nOSnUc  [        5       nO$[        U[        5      (       a  U1nO[        U5      n[        U R                  R                  S5      (       a  UR                  S5        S[        R                  ;   a  UR                  S5        UR                  U R                  5        [        UUU R                   UU[#        5       (       a+  [$        R&                  b  [$        R&                  R(                  OS[+        5       SS9nUR-                  [        R
                  R/                  U R0                  R2                  S	5      5        g)
a  
Creates a draft of a model card using the information available to the `Trainer`.

Args:
    model_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the model.
    dataset_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the dataset used for training.
    tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
        Tags to be associated with the model card.
N_name_or_pathunsloth_versionunslothJOB_IDhf_jobsReward)
base_modelr   r   r   r   	wandb_url	comet_urltrainer_namez	README.md)is_world_process_zerorm   rB   r   ospathisdirr   setr_   straddenvironupdatern   r%   r   r   r   r   urlr&   savejoinrC   r   )ro   r   r   r   r   
model_cards         r;   r   RewardTrainer.create_model_cardi  sN   " ))++4::$$o66rww}}TZZM^M^MlMl?m?m**88JJ <5Dc""6Dt9D4::$$&788HHYrzz!HHYDOO$(!!**%'9';';		@Ueiimm[_.0!	

 	TYY%9%9;GHr=   )rd   )NNNNNNNNN)NNNN)FNr   )NNN)+__name__
__module____qualname____firstlineno____doc__rn   r	   r
   r   ry   Moduler    r   r   dictr   r   r   r   r   r   r   r   r   r   r~   optim	Optimizerlr_schedulerLambdaLRTensorrl   r   r   boolr   r   intr   r   r   __static_attributes____classcell__)rp   s   @r;   r?   r?   V   s   $L )*J >B'+04+/EI >BFJ59W
 im&*%t7oryy89:t7 |$t7  -	t7
  (t7 uWd3<.@%@ABt7 #)+=?UWeef
t7 Xb/&9:;t7 "(N+;T+A"BCt7 D12t7 %++//1I1I1R1RRSt7" (0%,,9UW\WcWc9c0d'e#t7$ d^%t7 t7t _bii/0 S%c 1223 
u||U5<<c5<<6G1H#HII	JL ,0$_bii/0$ S%c 1223$ #	$
 d3i($ 
x%x'=x?UU	V$@1
#3 #L/ %)&*,0	4ISM4I sm4I CcD()	4I 4Ir=   r?   )Er   collectionsr   dataclassesr   r   pathlibr   typingr   r   r	   r
   pandasr   r~   torch.nnry   
accelerater   r   accelerate.utilsr   datasetsr   transformersr   r   r   r   r   r   r   r   transformers.trainer_callbackr   transformers.trainer_pt_utilsr   transformers.trainer_utilsr   transformers.utilsr   r   
data_utilsr   modelsr   reward_configr    utilsr!   r"   r#   r$   r%   r&   r'   r(   peftr)   r   
get_loggerr   rb   r   r   r   r<   r?   r   r=   r;   <module>r     s    
 # 4  1 1    , * 	 	 	 : 7 5 C 2 ' '	 	 	  
		H	%T#tCy.) 6O TXY\^bcf^gYgTh &GIG GIr=   