
    hMU                        S SK r S SKrS SKrS SKJrJrJrJr  S SKrS SK	J
r
  S SKJ
s  Jr  S SKJr  S SKJrJrJrJrJrJrJrJrJr  S SKJr  S SKJr  S SKJrJ r   SS	K!J"r"  SS
K#J$r$  SSK%J&r&  SSK'J(r(  SSK)J*r*J+r+J,r,J-r-J.r.  \ " 5       (       a  S SK/J0r0  \" 5       (       a  S SK1r1\" 5       (       a  S SK2J3r3   " S S\(5      r4g)    N)AnyCallableOptionalUnion)Dataset)	AutoModelForCausalLMBaseImageProcessorDataCollatorFeatureExtractionMixinGenerationConfigPreTrainedModelPreTrainedTokenizerBaseProcessorMixinis_wandb_available)TrainerCallback)EvalPrediction)is_liger_kernel_availableis_peft_available   )prepare_deepspeed)unwrap_model_for_generation   )	GKDConfig)
SFTTrainer)DataCollatorForChatMLdisable_dropout_in_modelempty_cachegenerate_model_cardget_comet_experiment_url)
PeftConfig)LigerFusedLinearJSDLossc                     ^  \ rS rSrSrSS/r             S!S\\\\	R                  \4      S\\\	R                  \4   S\\   S	\\   S
\\   S\\\\\\4   4      S\\\\\\4      S\\\/\4      S\\\      S\\R4                  R6                  \R4                  R8                  R:                  4   S\\\R<                  \R<                  /\R<                  4      S\S   S\\   4U 4S jjjr\  S"S j5       r!S#S jr"\ S$S j5       r# S$S\	R                  S\\\\R<                  \$4   4   S\\%   S\R<                  4U 4S jjjr&   S%S\\   S\\   S\\\\   S4   4S jjr'S r(U =r)$ )&
GKDTrainer>   a	  Trainer for Generalized Knowledge Distillation (GKD) of language models.

For details on GKD, see the paper: [On-Policy Distillation of Language Models: Learning from Self-Generated
Mistakes](https://huggingface.co/papers/2306.13649).

Args:
    model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
        Model to be trained, or the string identifier of the model to be instantiated from a pretrained model.
    teacher_model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
        Teacher model for knowledge distillation, or the string identifier of the model to be instantiated from a
        pretrained model.
    args ([`GKDConfig`], *optional*):
        Training arguments.
    data_collator ([`~transformers.DataCollator`], *optional*):
        Data collator to batch samples from the dataset. It defaults to a [`DataCollatorForChatML`] using the
        `processing_class`.
    train_dataset ([`~datasets.Dataset`], *optional*):
        Dataset for training.
    eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*):
        Dataset for evaluation.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
       Class to process the data.
    compute_metrics (`Callable`, *optional*):
        Function to compute metrics at evaluation. Must take in an [`~transformers.EvalPrediction`] and return a
        dictionary string to float.
    callbacks (`list` of [`~transformers.TrainerCallback`], *optional*):
        Callbacks to use during training.
    optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`):
        Tuple containing the optimizer and the learning rate scheduler to use for training.
    preprocess_logits_for_metrics (`Callable`, *optional*):
        Function to preprocess the logits before computing the metrics. Must take in the `logits` and `labels` and
        return the logits to be used for metrics computation.
    peft_config ([`~peft.config.PeftConfig`], *optional*):
        PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the `model` will be
        wrapped with the specified PEFT adapter.
    formatting_func (`Callable`, *optional*):
        Function to format the dataset. Must take in an example and return an example.
trlgkdNmodelteacher_modelargsdata_collatortrain_dataseteval_datasetprocessing_classcompute_metrics	callbacks
optimizerspreprocess_logits_for_metricspeft_configr    formatting_funcc                 P  > SUl         Uc  [        XsR                  S9nUR                  c
  SS0Ul        OSUR                  S'   SU l        UR
                  (       a,  [        UR                  SUR                  SS9U l	        SU l        [        TU ]-  UUUUUUUU	U
UUUS9  UR                  c  0 nOP[        U[        5      (       d  [        S5      eUR                  nUS	   S
;   a  US	   O[!        ["        US	   5      US	'   [        U[        5      (       a  [$        R&                  " U40 UD6nUR(                  (       a  [+        U R,                  5        U R.                  (       a  [1        X R2                  5      U l        OU R2                  R7                  USS9U l        UR8                  U l        UR                  U l        UR                  U l        UR:                  U l        [=        UR>                  UR                  SSUR@                  (       a  SOSU RB                  RD                  S9U l#        [I        U R,                  RF                  S5      (       aR  U R,                  RF                  RJ                  b0  U R,                  RF                  RJ                  U RF                  l%        g g g )NF)	tokenizer
max_lengthskip_prepare_datasetT)betaignore_indextemperaturecompiled)r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   zfYou passed teacher_model_init_kwargs to the GKDConfig, but your teacher_model is already instantiated.dtype)autoN)evaluation_moder   )max_new_tokensr;   	do_sampletop_k	use_cachepad_token_ideos_token_id)&remove_unused_columnsr   r6   dataset_kwargsuse_liger_gkd_lossuse_liger_kernelr!   r9   r;   liger_jsd_losssuper__init__teacher_model_init_kwargs
isinstancestr
ValueErrorgetattrtorchr   from_pretraineddisable_dropoutr   r'   is_deepspeed_enabledr   acceleratorr(   prepare_modellmbdaseq_kdr   r@   gradient_checkpointingr-   rD   generation_confighasattrrE   )selfr'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   rM   	__class__s                  Q/home/james-whalen/.local/lib/python3.13/site-packages/trl/trainer/gkd_trainer.pyrL   GKDTrainer.__init__h   sx   & &+" 1<LYhYhiM &#94"@D:>D 67 #(  "9YY! ,,	#D '+D#''%-+!*G#+ 	 	
 ))1(*%M3//x  )-(F(F% -W5G *'2U$=g$FG &g. mS))0@@lRklM $TZZ0$$!2=BRBR!SD!%!1!1!?!?_c!?!dDZZ
II	++kk!1..((#::e..;;"
 DJJ00.AA

,,99E26**2N2N2[2[D""/ F B    c           	      p   X-  n X-  n[         R                  " U SS9n[         R                  " USS9nUS:X  a  [         R                  " XgSSS9nOUS:X  a  [         R                  " XvSSS9nO[        R                  " X6R
                  S9n[        R                  " [        R                  " U[        R                  " SU-
  5      -   U[        R                  " U5      -   /5      SS9n	[         R                  " XSSS9n
[         R                  " XSSS9nX:-  SU-
  U-  -   nUb	  US	:g  nX   nUS
:X  aX  Ub!  UR                  5       WR                  5       -  $ UR                  5       UR                  S5      UR                  S5      -  -  $ US:X  a  UR                  5       $ US:X  a  UR                  5       $ U$ )a  
Compute the generalized Jensen-Shannon Divergence loss for knowledge distillation using F.kl_div. See Eq. (1)
of https://huggingface.co/papers/2306.13649 for the definition.

Args:
    student_logits:
        Tensor of shape (batch_size, sequence_length, vocab_size)
    teacher_logits:
        Tensor of shape (batch_size, sequence_length, vocab_size)
    labels:
        Tensor of shape (batch_size, sequence_length) with -100 for padding tokens to ignore when computing
        loss
    beta:
        Interpolation coefficient between 0 and 1 (default: 0.5)
    temperature:
        Softmax temperature (default: 1.0)
    reduction:
        Specifies the reduction to apply to the output (default: 'batchmean')

Returns:
    loss: Scalar tensor with the generalized JSD loss
)dimr   noneT)	reduction
log_targetr   )r=   r8   	batchmeansummean)Flog_softmaxkl_divrR   tensorr=   	logsumexpstacklogri   sizerj   )student_logitsteacher_logitslabelsr9   r;   rf   student_log_probsteacher_log_probsjsdmixture_log_probs
kl_teacher
kl_studentmasks                r_   generalized_jsd_lossGKDTrainer.generalized_jsd_loss   s   8 (5'5 MM.bAMM.bA19((,6^bcCQY((,6^bcC <<,C,CDD %.1t81DDFWZ_ZcZcdhZiFijk! "3RXeijJ"3RXeijJ #q4x:&==C T>D)C #-3-?3779txxz)lSWWYRURZRZ[\R]`c`h`hij`kRkEll%779& 88:Jra   c                 ~   U R                   (       Ga%  U R                  R                  U5      n[        US5      (       a"  UR	                  5       b  UR	                  5       nO[        U[        USS5      U5      nU" US   US   SSS9nU R                  R                  5         U R                  R                  U R                  5      n[        US5      (       a"  UR	                  5       b  UR	                  5       n	O[        U[        USS5      U5      n	[        R                  " 5          U	" US   US   SSS9n
S S S 5        UR                  S S 2S S	24   R                  5       nW
R                  S S 2S S	24   R                  5       nUS
   S:g  n[        R                  " XS   [        R                  " US   S5      5      nUS S 2SS 24   R                  5       nUR                  5       nUR                  5       nU R                  UUR                   UUR                   U[        USS 5      [        USS 5      S9nOU" US   US   S9nU R                  R                  5         [        R                  " 5          U R                  US   US   S9n
S S S 5        US   R"                  S   nUR$                  S S 2US-
  S	2S S 24   nW
R$                  S S 2US-
  S	2S S 24   nUS
   S S 2US 24   nU R'                  UUUU R(                  S9n[+        5         U(       a  UU4$ U$ ! , (       d  f       GN= f! , (       d  f       N= f)Nget_decoderbase_model_prefixr'   	input_idsattention_maskTF)r   r   output_hidden_statesrC   rc   ru   r8   r   bias)student_inputstudent_weightteacher_inputteacher_weighttrue_labelsstudent_biasteacher_bias)r   r   prompts)rs   rt   ru   r9   )rH   rV   unwrap_modelr\   r   rQ   r(   evalrR   no_gradlast_hidden_state
contiguouswhere	full_likeget_output_embeddingsrJ   weightshapelogitsr}   r9   r   )r]   r'   inputsreturn_outputsnum_items_in_batchunwrapped_studentbase_studentstudent_outputsunwrapped_teacherbase_teacherteacher_outputsstudent_hiddenteacher_hiddenlabels_maskmasked_input_idsr   student_headteacher_headlossprompt_lengthsshifted_student_logitsshifted_teacher_logitsshifted_labelss                          r_   compute_lossGKDTrainer.compute_loss  sn   """ $ 0 0 = =e D(-88=N=Z=Z=\=h0<<>&%w/@BUW^'_ar  + -%&67%)	O ##% $ 0 0 = =d>P>P Q(-88=N=Z=Z=\=h0<<>&%w/@BUW^'_ar  ".$[1#)*:#;)-#	# ! ->>q#2#vFQQSN,>>q#2#vFQQSN !*d2K${{K0%//&BUW[2\  +1ab51<<>K -BBDL,BBDL &&,+22,+22'$\64@$\64@ ' D $ -%&67O ##%"&"4"4$[1#)*:#; #5 # ! $I.44Q7N%4%;%;A~PQ?QTV?VXY<Y%Z"%4%;%;A~PQ?QTV?VXY<Y%Z"#H-a.@AN ,,55%YY	 - D 	 +9o&BdBE !T !s   'L4L.
L+.
L<c                     U R                  US   UR                  SS 5      USS9nUR                  n[        R                  " U5      nUR                  5       nUb  SXwU:H  '   SXeU:H  '   XVU4$ )Nr   prompt_attention_maskT)r   r   r[   return_dict_in_generater8   r   )generateget	sequencesrR   	ones_likeclone)r'   r   r[   rD   generated_outputsgenerated_tokensnew_attention_mask
new_labelss           r_   generate_on_policy_outputs%GKDTrainer.generate_on_policy_outputsw  s     "NNY'!::&=tD/$(	 + 
 -66"__-=>%++-
 #59J\12CD<?@Z??ra   r   r   returnc                 n  > U R                   (       al  [        U R                  U R                  5       nU R	                  XBU R
                  U R                  R                  5      u  pVnSSS5        WUS'   WUS'   WUS'   [        R                  " 5       U R                  ::  aa  [        XR                  5       nU R	                  XBU R
                  U R                  R                  5      u  pVnSSS5        WUS'   WUS'   WUS'   [        T	U ]-  XU5      nU$ ! , (       d  f       N= f! , (       d  f       N@= f)a9  
Perform a training step for the Generalized Knowledge Distillation (GKD) model.

This method implements the on-policy learning approach described in the GKD paper. With probability
`self.lmbda`, it generates new responses using the student model, which are then used for training instead of
the original inputs.
Nr   r   ru   )rY   r   r(   rV   r   r[   r-   rD   randomrX   rK   training_step)
r]   r'   r   r   unwrapped_modelnew_input_idsr   r   r   r^   s
            r_   r   GKDTrainer.training_step  s    ;;,T-?-?AQAQRVe@D@_@_#T-C-CTEZEZEgEgA=: S #0F;'9F#$)F8==?djj(,U4D4DE@D@_@_#T-C-CTEZEZEgEgA=: F #0F;'9F#$)F8w$U4FG# SR FEs   5D75D&
D#&
D4
model_namedataset_nametagsc                    U R                  5       (       d  g[        U R                  R                  S5      (       ac  [        R
                  R                  U R                  R                  R                  5      (       d!  U R                  R                  R                  nOSnUc  [        5       nO$[        U[        5      (       a  U1nO[        U5      n[        U R                  R                  S5      (       a  UR                  S5        S[        R                  ;   a  UR                  S5        UR                  U R                  5        [        R                   " S5      n[#        UUU R$                  UU['        5       (       a+  [(        R*                  b  [(        R*                  R,                  OS[/        5       SUS	S
S9nUR1                  [        R
                  R3                  U R4                  R6                  S5      5        g)a  
Creates a draft of a model card using the information available to the `Trainer`.

Args:
    model_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the model.
    dataset_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the dataset used for training.
    tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
        Tags to be associated with the model card.
N_name_or_pathunsloth_versionunslothJOB_IDhf_jobsan          @inproceedings{agarwal2024on-policy,
            title        = {{On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes}},
            author       = {Rishabh Agarwal and Nino Vieillard and Yongchao Zhou and Piotr Stanczyk and Sabela Ramos Garea and Matthieu Geist and Olivier Bachem},
            year         = 2024,
            booktitle    = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
            publisher    = {OpenReview.net},
            url          = {https://openreview.net/forum?id=3zKtaqxLhW},
        }GKDzPOn-Policy Distillation of Language Models: Learning from Self-Generated Mistakesz
2306.13649)
base_modelr   hub_model_idr   r   	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zeror\   r'   configospathisdirr   setrN   rO   addenvironupdate
_tag_namestextwrapdedentr   r   r   wandbrunurlr   savejoinr)   
output_dir)r]   r   r   r   r   citation
model_cards          r_   create_model_cardGKDTrainer.create_model_card  sn   " ))++4::$$o66rww}}TZZM^M^MlMl?m?m**88JJ <5Dc""6Dt9D4::$$&788HHYrzz!HHYDOO$ ?? $  )!!**%'9';';		@Ueiimm[_.0%j!

 	TYY%9%9;GHra   )r9   r[   rJ   rX   rY   r(   r;   rH   )NNNNNNNNN)NNNNN)Ng      ?g      ?rh   )FN)N)NNN)*__name__
__module____qualname____firstlineno____doc__r   r   r   r   nnModulerO   r   r
   r   dictr   r	   r   r   r   r   listr   tuplerR   optim	Optimizerlr_schedulerLambdaLRTensorrL   staticmethodr}   r   r   r   intr   r   __static_attributes____classcell__)r^   s   @r_   r#   r#   >   s   %N J CG@D$(04+/EI FJ59Vbhl.2.2!h\oryy#=>?h\ _bii<=h\ y!	h\
  -h\  (h\ uWd3<.@%@ABh\ #)+=?UWeef
h\ "(N+;T+A"BCh\ D12h\ %++//1I1I1R1RRSh\ (0%,,9UW\WcWc9c0d'eh\ l+h\  "(+!h\ h\T ZeD DL\C| @ @. rvYY(,S%c8I2J-J(Kaijman	 @ %)&*,0	BISMBI smBI CcD()	BI BIra   r#   )5r   r   r   typingr   r   r   r   rR   torch.nnr   torch.nn.functional
functionalrk   datasetsr   transformersr   r	   r
   r   r   r   r   r   r   transformers.trainer_callbackr   transformers.trainer_utilsr   transformers.utilsr   r   modelsr   models.utilsr   
gkd_configr   sft_trainerr   utilsr   r   r   r   r   peftr    r   liger_kernel.chunked_lossr!   r#    ra   r_   <module>r     s    
   1 1     
 
 
 : 5 K & 6 ! #  ApI pIra   