
    f:i              
          S r SSKJr  SSKrSSKJr  SSKJr  SSKJrJ	r	J
r
JrJrJrJrJr  SSKJrJrJrJrJrJrJrJrJrJrJrJrJrJ
r
JrJrJrJrJrJ r JrJ!r!J"r"J#r#J$r$J%r%JrJ&r&J'r'J(r(J)r)JrJ*r*  SSK&r&SSK7  SSK+J,r,J-r-  SS	K.J/r/  SSKrSSK0r1SS
K2J3r3  SSKJr  SSK4J5r5J6r7  SSK8J9r9  SSK:r:SSK;J<r<  S r= SSSSSS.r>\R~                  " SS\>S9S 5       r@S\R                  S\AS\AS\R                  4S jrBS\R                  S\R                  S\AS\AS\R                  4
S jrCS\R                  S\AS\R                  4S jrD\, " S  S!\5      5       rE  " S" S#\5      rF " S$ S%\F5      rGg)&z;
2025.10.10
2025.10.9
4.56.2
0.23.0
__UNSLOTH_VERSIONING__
    )TensorN)
functional)AnyListOptionalTupleUnionDictSetCallable)!r   AutoModelForCausalLMBaseImageProcessorr   DataCollatorDataCollatorForChatMLDatasetEvalPredictionFFeatureExtractionMixin	GKDConfig
GKDTrainerGenerationConfigr   
PeftConfigPreTrainedModelPreTrainedTokenizerBaseProcessorMixin
SFTTrainerTrainerCallbackr	   disable_dropout_in_modelempty_cachegenerate_model_cardget_comet_experiment_urlis_wandb_availablennosprepare_deepspeedrandomtextwraptorchunwrap_model_for_generation)*)	dataclassfield)Version)nullcontext)DataCollatorForSeq2SeqDataCollatorForLanguageModeling)ParallelMode)
MethodTypec                 F   ^  [         R                  " T 5      U 4S j5       nU$ )Nc                 8  > [        U S5      (       a5  [        U R                  S5      (       a  U R                  R                  5         T" U /UQ70 UD6n[        U S5      (       a5  [        U R                  S5      (       a  U R                  R                  5         U$ )Nmodelfor_trainingfor_inference)hasattrr5   r6   r7   )selfargskwargsoutputfs       >/home/james-whalen/unsloth_compiled_cache/UnslothGKDTrainer.pywrapper*prepare_for_training_mode.<locals>.wrapper0   sx     4!!gdjj.&I&IJJ##%4)$)&)4!!gdjj/&J&JJJ$$&    )	functoolswraps)r=   r?   s   ` r>   prepare_for_training_moderD   /   s%    __Q  NrA   TF)epilogue_fusionmax_autotuneshape_paddingztrace.enabledztriton.cudagraphs)dynamic	fullgraphoptionsc                 d   [         R                  " U R                  SU R                  S   5      SSS9n[         R                  " UR                  S5      SSS9n/ n[	        X#5       H  u  pVUR                  [         R                  5      n[         R                  " USUR                  S5      S9R                  S5      n[         R                  " USS9nXx-
  n	UR                  U	5        M      [         R                  " U5      nUR                  U R                  S   U R                  S   45      nU$ )N   r   )chunksdim)rO   indexrO      )r(   chunkreshapeshapeziptofloat32gather	unsqueezesqueeze	logsumexpappendconcat)
logitsrP   chunked_logitschunked_indexall_per_token_logpschunk_logitschunk_indexselected_logitslogsumexp_valuesper_token_logpss
             r>   chunked_selective_log_softmaxrh   E   s    [[FLL4D!EPQYZ[N[[r!2QaHM%(%G!#u}}5,,|2{G\G\]_G`aiijlm ??<rB)<""?3 &H 	,,':;-55v||AUV6XYrA   	input_idslogits_to_keeppad_token_idreturnc                 ~    XR                   S   :  a  [        S5      eU SS2SU* 24   nX2:H  nUR                  SS9nU$ )zr
Given prompt tensor, it returns all the left padded tokens in that sequence. so [pad, pad, pad, cat] = 3 tokens 
rR   z8logits_to_keep must be smaller than the sequence length.NrQ   )rU   
ValueErrorsum)ri   rj   rk   prompt_sectionpadding_maskpad_token_countss         r>   calculate_pad_tokens_in_promptrs   W   sX     ++STTq"2N?"223N"2L#''A'.rA   completion_input_idsleft_pad_tokens_per_promptmax_left_padc                     U R                   u  pEU R                  nX!-
  n[        R                  " XVS9R	                  S5      nXR	                  S5      :  n	X:g  n
X-  nU$ )a)  
Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad]

Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens
and pad are pad tokens, this function would make a completion mask that would 0 out the pad
and p tokens. so in this example [0,0,0,1,1,1,0,0,0]
)devicer   rR   )rU   rx   r(   arangerZ   )rt   ru   rv   rk   
batch_sizecompletion_lenrx   num_tokens_to_maskindices
shift_masknon_padding_mask
final_masks               r>    create_completion_attention_maskr   j   si     "6!;!;J!((F%Bll>9CCAFG88;;J,<.JrA   tensorpad_idc                 l    X:g  n[         R                  " USSSS9n[         R                  " U SU5      nU$ )zD
Moves all padding tokens in each sequence of a batch to the right.
rR   T)rO   
descendingstable)r(   argsortrY   )r   r   masksorted_indicespacked_tensors        r>   left_pack_paddingr      s8     D]]4Q4MNLLN;MrA   c                     ^  \ rS rSr% Sr\" SSS0S9r\\   \	S'   \" SSS	0S9r
\\   \	S
'   \" SSS0S9r\\   \	S'                                                                                                                                                                SU 4S jjrSrU =r$ )UnslothGKDConfig   a  
    
Configuration class for [`GKDTrainer`].

This class includes only the parameters that are specific to GKD training. For a full list of training arguments,
please refer to the [`~transformers.TrainingArguments`] and [`SFTConfig`] documentation.

Args:
    temperature (`float`, *optional*, defaults to `0.9`):
        Temperature for sampling. The higher the temperature, the more random the completions.
    lmbda (`float`, *optional*, defaults to `0.5`):
        Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy
        student-generated outputs).
    beta (`float`, *optional*, defaults to `0.5`):
        Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence loss. When
        beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL Divergence.
    max_new_tokens (`int`, *optional*, defaults to `128`):
        Maximum number of tokens to generate per completion.
    teacher_model_name_or_path (`str` or `None`, *optional*, defaults to `None`):
        Model name or path of the teacher model. If `None`, the teacher model will be the same as the model being
        trained.
    teacher_model_init_kwargs (`dict[str, Any]]` or `None`, *optional*, defaults to `None`):
        Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model
        from a string.
    disable_dropout (`bool`, *optional*, defaults to `True`):
        Whether to disable dropout in the model.
    seq_kd (`bool`, *optional*, defaults to `False`):
        Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised FT on
        teacher-generated output).

    NhelpzvLLM SamplingParams)defaultmetadatavllm_sampling_paramsrL   z8Chunk size to reduce memory usage. -1 is most efficient.unsloth_num_chunksz'Maximum sequence length to truncate to.max_seq_lengthc                 p  > US:  a  [        SU S35        US:  a  [        SU S35        Uc  U#S:X  a
  U$S:X  a  SnS	n#Wc$  S
SKJn  [        [	        U" 5       S-   S5      S5      n[
        R                  R                  SS5      S:X  a  S
SKJ	n  U(       a  Wc  S
SKJ
n  UnWS
::  a  [        S5      eWS:  a  [        S5      e[        TU ]4  " S0 SU_SU_SU_SU_SU_SU_SU_SU_SU	_S U
_S!U_S"U_S#U_S$U_S%U_S&U_S'U_S(U_S)U_S*U_S+U_S,U_S-U_S.U_S/U_S0U_S1U_S2U_S3U_S4U_S5U_S6U _S7U!_S8U"_S9U#_S:U$_S;U%_S<U&_S=U'_S>U(_S?U)_S@U*_SAU+_SBU,_SCU-_SDU._SEU/_SFU0_SGU1_SHU2_SIU3_SJU4_SKU5_SLU6_SMU7_SNU8_SOU9_SPU:_SQU;_SRU<_SSU=_STU>_SUU?_SVW@_SWWA_SXWB_SYWC_SZWD_S[WE_S\WF_S]WG_S^WH_S_WI_S`WJ_SaWK_SbWL_ScWM_SdWN_SeWO_SfWP_SgWQ_ShWR_SiWS_SjWT_SkWU_SlWV_SmWW_SnWX_SoWY_SpWZ_SqW[_SrW\_SsW]_StW^_SuW__SvW`_SwWa_SxWb_SyWc_SzWd_S{We_S|Wf_S}Wg_S~Wh_SWi_SWj_SWk_SWl_SWm_SWn_SWo_SWp_SWq_SWr_SWs_SWt_SWu_SWv_SWw_SWx_SWy_SWz_SW{_SW|_SW}_SW~_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_WD6  WU l        WU l        WU l        g )NgHz>z Unsloth: Your learning rate of `zi` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!rR   za` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!steps  unsloth_training_checkpointsnor   )	cpu_countrM      @   UNSLOTH_ENABLE_FLEX_ATTENTION01)HAS_FLEX_ATTENTION)FLEX_ATTENTION_BLOCK_SIZEzUUnsloth: Please set a positive non-zero temperature since your results will be wrong.
   zgUnsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.
output_diroverwrite_output_dirdo_traindo_eval
do_predicteval_strategyprediction_loss_onlyper_device_train_batch_sizeper_device_eval_batch_sizeper_gpu_train_batch_sizeper_gpu_eval_batch_sizegradient_accumulation_stepseval_accumulation_steps
eval_delaytorch_empty_cache_stepslearning_rateweight_decay
adam_beta1
adam_beta2adam_epsilonmax_grad_normnum_train_epochs	max_stepslr_scheduler_typewarmup_ratiowarmup_steps	log_levellog_level_replicalog_on_each_nodelogging_dirlogging_strategylogging_first_steplogging_stepslogging_nan_inf_filtersave_strategy
save_stepssave_total_limitsave_safetensorssave_on_each_nodesave_only_model'restore_callback_states_from_checkpointno_cudause_cpuuse_mps_deviceseed	data_seedjit_mode_evaluse_ipexbf16fp16fp16_opt_levelhalf_precision_backendbf16_full_evalfp16_full_evaltf32
local_rankddp_backendtpu_num_corestpu_metrics_debugdebugdataloader_drop_last
eval_stepsdataloader_num_workersdataloader_prefetch_factor
past_indexrun_namedisable_tqdmremove_unused_columnslabel_namesload_best_model_at_endmetric_for_best_modelgreater_is_betterignore_data_skipfsdpfsdp_min_num_paramsfsdp_config"fsdp_transformer_layer_cls_to_wrapaccelerator_configparallelism_config	deepspeedlabel_smoothing_factoroptim
optim_args	adafactorgroup_by_lengthlength_column_name	report_toddp_find_unused_parametersddp_bucket_cap_mbddp_broadcast_buffersdataloader_pin_memorydataloader_persistent_workersskip_memory_metricsuse_legacy_prediction_looppush_to_hubresume_from_checkpointhub_model_idhub_strategy	hub_tokenhub_private_repohub_always_pushhub_revisiongradient_checkpointinggradient_checkpointing_kwargsinclude_inputs_for_metricseval_do_concat_batchesfp16_backendpush_to_hub_model_idpush_to_hub_organizationpush_to_hub_tokenmp_parametersauto_find_batch_sizefull_determinismtorchdynamo	ray_scopeddp_timeouttorch_compiletorch_compile_backendtorch_compile_modeinclude_tokens_per_secondinclude_num_input_tokens_seenneftune_noise_alphaoptim_target_modulesbatch_eval_metricseval_on_startuse_liger_kernelliger_kernel_configeval_use_gather_objectaverage_tokens_across_devicesmodel_init_kwargschat_template_pathdataset_text_fielddataset_kwargsdataset_num_proc	eos_token	pad_token
max_lengthpackingpacking_strategypadding_freepad_to_multiple_ofeval_packingcompletion_only_lossassistant_only_loss	loss_typeactivation_offloadingtemperaturelmbdabetamax_new_tokensteacher_model_name_or_pathteacher_model_init_kwargsdisable_dropoutseq_kd )printmultiprocessingr   minmaxr$   environgetunsloth_zoo.flex_attentionr   r   	MathErrorsuper__init__r   r   r   )r9   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r	  r
  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r!  r"  r#  r$  r%  r&  r'  r(  r)  r*  r+  r,  r-  r.  r/  r0  r1  r2  r3  r4  r5  r6  r7  r8  r9  r:  r;  r<  r   r   r   r;   r   r   r   	__class__s                                                                                                                                                                     r>   rG  UnslothGKDConfig.__init__   s   B 4)I-  YB  (C  "D1e&F}o  Vw  %x  y-7":zS?P7J M#1"3y{1}a#8"=::>>93?3FE!&8&@P%>"!sttB  F  G  G 	 Z	&#Z	&#7Z	&  Z	& 	Z	&
 $Z	& *Z	& $8Z	& +FZ	& *DZ	& (@Z	& '>Z	& +FZ	& '>Z	& $Z	& '>Z	&  *!Z	&" (#Z	&$ $%Z	&& $'Z	&( ()Z	&* *+Z	&,  0-Z	&. "/Z	&0 !21Z	&2 (3Z	&4 (5Z	&6 "7Z	&8 !29Z	&:  0;Z	&< &=Z	&>  0?Z	&@ "4AZ	&B *CZ	&D &<EZ	&F *GZ	&H $IZ	&J  0KZ	&L  0MZ	&N !2OZ	&P .QZ	&R 7^SZ	&T UZ	&V WZ	&X ,YZ	&Z [Z	&\ "]Z	&^ *_Z	&`  aZ	&b cZ	&d eZ	&f ,gZ	&h &<iZ	&j ,kZ	&l ,mZ	&n oZ	&p $qZ	&r &sZ	&t *uZ	&v !2wZ	&x yZ	&z $8{Z	&| $}Z	&~ &<Z	&@ *DAZ	&B $CZ	&D  EZ	&F (GZ	&H %:IZ	&J &KZ	&L &<MZ	&N %:OZ	&P !2QZ	&R  0SZ	&T UZ	&V #6WZ	&X &YZ	&Z 2T[Z	&\ "4]Z	&^ "4_Z	&` "aZ	&b &<cZ	&d eZ	&f $gZ	&h "iZ	&j .kZ	&l "4mZ	&n "oZ	&p *DqZ	&r !2sZ	&t %:uZ	&v %:wZ	&x -JyZ	&z #6{Z	&| *D}Z	&~ &Z	&@ &<AZ	&B (CZ	&D (EZ	&F "GZ	&H  0IZ	&J .KZ	&L (MZ	&N &<OZ	&P -JQZ	&R *DSZ	&T &<UZ	&V (WZ	&X $8YZ	&Z (@[Z	&\ !2]Z	&^ *_Z	&` $8aZ	&b  0cZ	&d &eZ	&f "gZ	&h &iZ	&j *kZ	&l %:mZ	&n "4oZ	&p )BqZ	&r -JsZ	&t #6uZ	&v $8wZ	&x "4yZ	&z *{Z	&|  0}Z	&~ #6Z	&@ &<AZ	&B -JCZ	&D !2EZ	&F "4GZ	&H "4IZ	&J ,KZ	&L  0MZ	&N "OZ	&P "QZ	&R $SZ	&T UZ	&V  0WZ	&X (YZ	&Z "4[Z	&\ (]Z	&^ $8_Z	&` #6aZ	&b "cZ	&d %:eZ	&f &gZ	&h iZ	&j kZ	&l ,mZ	&n *DoZ	&p )BqZ	&r .sZ	&t fuZ	&v %9!"4,rA   )r   r   r   )NNFFFr   FrM   rM   NNr   r   r      g-C6
?g{Gz??g+?g:0yE>      ?g      @rL   linear皙?r   passivewarningTNr   FrR   Fr   r   NTFFFFFFO  rQ  FFFFO1autoFFNrL   NNF FNr   NrL   NNTNFNNFrT  r   NNNNN        
adamw_8bitNFFlengthNNNNTFTFFNN
every_saveNNFNTNFTrS  NNNrT  FFNlasti  FNNFFNNFFFNFTNNtextNNNNi   FbfdFNNNFnllFrK        ?r]     NNTFNrL   N)__name__
__module____qualname____firstlineno____doc__r,   r   r   r   __annotations__r   intr   rG  __static_attributes____classcell__rH  s   @r>   r   r      sY   > +012+(3-  */VW*#  &+EF&NXc]  #$&'%&#'"&&'"#"%$%""!&!27!'!$!"%) $!& $  -1!!!$%%)  $ $(-"%*!%#!%(,%*!%##' $  $!$)(-"#" "!&(, !# !## %%)$(#}R- R-rA   r   c                     ^  \ rS rSrSrSS/r             S!S\\\\	R                  \4      S\\\	R                  \4   S\\   S	\\   S
\\   S\\\\\\4   4      S\\\\\\4      S\\\/\4      S\\\      S\\R4                  R6                  \R4                  R8                  R:                  4   S\\\R<                  \R<                  /\R<                  4      S\S   S\\   4U 4S jjjr\  S"S j5       r!S#S jr"\ S$S j5       r# S$S\	R                  S\\\\R<                  \$4   4   S\\%   S\R<                  4U 4S jjjr&   S%S\\   S\\   S\\\\   S4   4S jjr'S r(U =r)$ )&_UnslothGKDTraineri  a	  Trainer for Generalized Knowledge Distillation (GKD) of language models.

For details on GKD, see the paper: [On-Policy Distillation of Language Models: Learning from Self-Generated
Mistakes](https://huggingface.co/papers/2306.13649).

Args:
    model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
        Model to be trained, or the string identifier of the model to be instantiated from a pretrained model.
    teacher_model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
        Teacher model for knowledge distillation, or the string identifier of the model to be instantiated from a
        pretrained model.
    args ([`GKDConfig`], *optional*):
        Training arguments.
    data_collator ([`~transformers.DataCollator`], *optional*):
        Data collator to batch samples from the dataset. It defaults to a [`DataCollatorForChatML`] using the
        `processing_class`.
    train_dataset ([`~datasets.Dataset`], *optional*):
        Dataset for training.
    eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*):
        Dataset for evaluation.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
       Class to process the data.
    compute_metrics (`Callable`, *optional*):
        Function to compute metrics at evaluation. Must take in an [`~transformers.EvalPrediction`] and return a
        dictionary string to float.
    callbacks (`list` of [`~transformers.TrainerCallback`], *optional*):
        Callbacks to use during training.
    optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`):
        Tuple containing the optimizer and the learning rate scheduler to use for training.
    preprocess_logits_for_metrics (`Callable`, *optional*):
        Function to preprocess the logits before computing the metrics. Must take in the `logits` and `labels` and
        return the logits to be used for metrics computation.
    peft_config ([`~peft.config.PeftConfig`], *optional*):
        PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the `model` will be
        wrapped with the specified PEFT adapter.
    formatting_func (`Callable`, *optional*):
        Function to format the dataset. Must take in an example and return an example.
trlgkdNr5   teacher_modelr:   data_collatortrain_dataseteval_datasetprocessing_classcompute_metrics	callbacks
optimizerspreprocess_logits_for_metricspeft_configr   formatting_funcc                 P  > SUl         Uc  [        XsR                  S9nUR                  c
  SS0Ul        OSUR                  S'   SU l        UR
                  (       a,  [        UR                  SUR                  SS9U l	        SU l        [        TU ]-  UUUUUUUU	U
UUUS9  UR                  c  0 nOP[        U[        5      (       d  [        S5      eUR                  nUS	   S
;   a  US	   O[!        ["        US	   5      US	'   [        U[        5      (       a  [$        R&                  " U40 UD6nUR(                  (       a  [+        U R,                  5        U R.                  (       a  [1        X R2                  5      U l        OU R2                  R7                  USS9U l        UR8                  U l        UR                  U l        UR                  U l        UR:                  U l        [=        UR>                  UR                  SSUR@                  (       a  SOSU RB                  RD                  S9U l#        [I        U R,                  RF                  S5      (       aR  U R,                  RF                  RJ                  b0  U R,                  RF                  RJ                  U RF                  l%        g g g )NF)	tokenizerr+  skip_prepare_datasetT)r7  ignore_indexr5  compiled)r:   rn  ro  rp  rq  rr  rs  rt  ru  rv  rw  zfYou passed teacher_model_init_kwargs to the GKDConfig, but your teacher_model is already instantiated.dtype)rS  N)evaluation_moder   )r8  r5  	do_sampletop_k	use_cacherk   eos_token_id)&r   r   r+  r'  use_liger_gkd_lossr   LigerFusedLinearJSDLossr7  r5  liger_jsd_lossrF  rG  r:  
isinstancestrrn   getattrr(   r   from_pretrainedr;  r   r5   is_deepspeed_enabledr%   acceleratorrm  prepare_modelr6  r<  r   r8  r	  rq  rk   generation_configr8   r  )r9   r5   rm  r:   rn  ro  rp  rq  rr  rs  rt  ru  rv  rw  r:  rH  s                  r>   rG  _UnslothGKDTrainer.__init__:  sx   & &+" 1<LYhYhiM &#94"@D:>D 67 #(  "9YY! ,,	#D '+D#''%-+!*G#+ 	 	
 ))1(*%M3//x  )-(F(F% -W5G *'2U$=g$FG &g. mS))0@@lRklM $TZZ0$$!2=BRBR!SD!%!1!1!?!?_c!?!dDZZ
II	++kk!1..((#::e..;;"
 DJJ00.AA

,,99E26**2N2N2[2[D""/ F BrA   c           	      p   X-  n X-  n[         R                  " U SS9n[         R                  " USS9nUS:X  a  [         R                  " XgSSS9nOUS:X  a  [         R                  " XvSSS9nO[        R                  " X6R
                  S9n[        R                  " [        R                  " U[        R                  " SU-
  5      -   U[        R                  " U5      -   /5      SS9n	[         R                  " XSSS9n
[         R                  " XSSS9nX:-  SU-
  U-  -   nUb	  US	:g  nX   nUS
:X  aX  Ub!  UR                  5       WR                  5       -  $ UR                  5       UR                  S5      UR                  S5      -  -  $ US:X  a  UR                  5       $ US:X  a  UR                  5       $ U$ )a  
Compute the generalized Jensen-Shannon Divergence loss for knowledge distillation using F.kl_div. See Eq. (1)
of https://huggingface.co/papers/2306.13649 for the definition.

Args:
    student_logits:
        Tensor of shape (batch_size, sequence_length, vocab_size)
    teacher_logits:
        Tensor of shape (batch_size, sequence_length, vocab_size)
    labels:
        Tensor of shape (batch_size, sequence_length) with -100 for padding tokens to ignore when computing
        loss
    beta:
        Interpolation coefficient between 0 and 1 (default: 0.5)
    temperature:
        Softmax temperature (default: 1.0)
    reduction:
        Specifies the reduction to apply to the output (default: 'batchmean')

Returns:
    loss: Scalar tensor with the generalized JSD loss
rL   rQ   r   noneT)	reduction
log_targetrR   )r~  r{  	batchmeanro   mean)r   log_softmaxkl_divr(   r   r~  r\   stacklogro   sizer  )student_logitsteacher_logitslabelsr7  r5  r  student_log_probsteacher_log_probsjsdmixture_log_probs
kl_teacher
kl_studentr   s                r>   generalized_jsd_loss'_UnslothGKDTrainer.generalized_jsd_loss  s   8 (5'5 MM.bAMM.bA19((,6^bcCQY((,6^bcC <<,C,CDD %.1t81DDFWZ_ZcZcdhZiFijk! "3RXeijJ"3RXeijJ #q4x:&==C T>D)C #-3-?3779txxz)lSWWYRURZRZ[\R]`c`h`hij`kRkEll%779& 88:JrA   c                 ~   U R                   (       Ga%  U R                  R                  U5      n[        US5      (       a"  UR	                  5       b  UR	                  5       nO[        U[        USS5      U5      nU" US   US   SSS9nU R                  R                  5         U R                  R                  U R                  5      n[        US5      (       a"  UR	                  5       b  UR	                  5       n	O[        U[        USS5      U5      n	[        R                  " 5          U	" US   US   SSS9n
S S S 5        UR                  S S 2S S	24   R                  5       nW
R                  S S 2S S	24   R                  5       nUS
   S:g  n[        R                  " XS   [        R                  " US   S5      5      nUS S 2SS 24   R                  5       nUR                  5       nUR                  5       nU R                  UUR                   UUR                   U[        USS 5      [        USS 5      S9nOU" US   US   S9nU R                  R                  5         [        R                  " 5          U R                  US   US   S9n
S S S 5        US   R"                  S   nUR$                  S S 2US-
  S	2S S 24   nW
R$                  S S 2US-
  S	2S S 24   nUS
   S S 2US 24   nU R'                  UUUU R(                  S9n[+        5         U(       a  UU4$ U$ ! , (       d  f       GN= f! , (       d  f       N= f)Nget_decoderbase_model_prefixr5   ri   attention_maskTF)ri   r  output_hidden_statesr  rL   r  r{  rR   bias)student_inputstudent_weightteacher_inputteacher_weighttrue_labelsstudent_biasteacher_bias)ri   r  prompts)r  r  r  r7  )r  r  unwrap_modelr8   r  r  rm  evalr(   no_gradlast_hidden_state
contiguouswhere	full_likeget_output_embeddingsr  weightrU   r_   r  r7  r   )r9   r5   inputsreturn_outputsnum_items_in_batchunwrapped_studentbase_studentstudent_outputsunwrapped_teacherbase_teacherteacher_outputsstudent_hiddenteacher_hiddenlabels_maskmasked_input_idsr  student_headteacher_headlossprompt_lengthsshifted_student_logitsshifted_teacher_logitsshifted_labelss                          r>   compute_loss_UnslothGKDTrainer.compute_loss  sn   """ $ 0 0 = =e D(-88=N=Z=Z=\=h0<<>&%w/@BUW^'_ar  + -%&67%)	O ##% $ 0 0 = =d>P>P Q(-88=N=Z=Z=\=h0<<>&%w/@BUW^'_ar  ".$[1#)*:#;)-#	# ! ->>q#2#vFQQSN,>>q#2#vFQQSN !*d2K${{K0%//&BUW[2\  +1ab51<<>K -BBDL,BBDL &&,+22,+22'$\64@$\64@ ' D $ -%&67O ##%"&"4"4$[1#)*:#; #5 # ! $I.44Q7N%4%;%;A~PQ?QTV?VXY<Y%Z"%4%;%;A~PQ?QTV?VXY<Y%Z"#H-a.@AN ,,55%YY	 - D 	 +9o&BdBE !T !s   'L4L.
L+.
L<c                     U R                  US   UR                  SS 5      USS9nUR                  n[        R                  " U5      nUR                  5       nUb  SXwU:H  '   SXeU:H  '   XVU4$ )Nr  prompt_attention_maskT)ri   r  r  return_dict_in_generater{  r   )generaterC  	sequencesr(   	ones_likeclone)r5   r  r  rk   generated_outputsgenerated_tokensnew_attention_mask
new_labelss           r>   generate_on_policy_outputs-_UnslothGKDTrainer.generate_on_policy_outputsI  s     "NNY'!::&=tD/$(	 + 
 -66"__-=>%++-
 #59J\12CD<?@Z??rA   r  r  rl   c                 n  > U R                   (       al  [        U R                  U R                  5       nU R	                  XBU R
                  U R                  R                  5      u  pVnSSS5        WUS'   WUS'   WUS'   [        R                  " 5       U R                  ::  aa  [        XR                  5       nU R	                  XBU R
                  U R                  R                  5      u  pVnSSS5        WUS'   WUS'   WUS'   [        T	U ]-  XU5      nU$ ! , (       d  f       N= f! , (       d  f       N@= f)a9  
Perform a training step for the Generalized Knowledge Distillation (GKD) model.

This method implements the on-policy learning approach described in the GKD paper. With probability
`self.lmbda`, it generates new responses using the student model, which are then used for training instead of
the original inputs.
Nri   r  r  )r<  r)   rm  r  r  r  rq  rk   r&   r6  rF  training_step)
r9   r5   r  r  unwrapped_modelnew_input_idsr  r  r  rH  s
            r>   r   _UnslothGKDTrainer.training_step`  s    ;;,T-?-?AQAQRVe@D@_@_#T-C-CTEZEZEgEgA=: S #0F;'9F#$)F8==?djj(,U4D4DE@D@_@_#T-C-CTEZEZEgEgA=: F #0F;'9F#$)F8w$U4FG# SR FEs   5D75D&
D#&
D4
model_namedataset_nametagsc                    U R                  5       (       d  g[        U R                  R                  S5      (       ac  [        R
                  R                  U R                  R                  R                  5      (       d!  U R                  R                  R                  nOSnUc  [        5       nO$[        U[        5      (       a  U1nO[        U5      n[        U R                  R                  S5      (       a  UR                  S5        S[        R                  ;   a  UR                  S5        UR                  U R                  5        [        R                   " S5      n[#        UUU R$                  UU['        5       (       a+  [(        R*                  b  [(        R*                  R,                  OS[/        5       SUS	S
S9nUR1                  [        R
                  R3                  U R4                  R6                  S5      5        g)a  
Creates a draft of a model card using the information available to the `Trainer`.

Args:
    model_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the model.
    dataset_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the dataset used for training.
    tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
        Tags to be associated with the model card.
N_name_or_pathunsloth_versionunslothJOB_IDhf_jobsan          @inproceedings{agarwal2024on-policy,
            title        = {{On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes}},
            author       = {Rishabh Agarwal and Nino Vieillard and Yongchao Zhou and Piotr Stanczyk and Sabela Ramos Garea and Matthieu Geist and Olivier Bachem},
            year         = 2024,
            booktitle    = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
            publisher    = {OpenReview.net},
            url          = {https://openreview.net/forum?id=3zKtaqxLhW},
        }GKDzPOn-Policy Distillation of Language Models: Learning from Self-Generated Mistakesz
2306.13649)
base_modelr  r  r  r  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zeror8   r5   configr$   pathisdirr  setr  r  addrB  update
_tag_namesr'   dedentr    r  r"   wandbrunurlr!   savejoinr:   r   )r9   r  r  r  r  citation
model_cards          r>   create_model_card$_UnslothGKDTrainer.create_model_card~  sn   " ))++4::$$o66rww}}TZZM^M^MlMl?m?m**88JJ <5Dc""6Dt9D4::$$&788HHYrzz!HHYDOO$ ?? $  )!!**%'9';';		@Ueiimm[_.0%j!

 	TYY%9%9;GHrA   )r7  r  r  r6  r<  rm  r5  r  )NNNNNNNNN)NNNNN)Nr]  rL  r  )FN)N)NNN)*r_  r`  ra  rb  rc  r  r   r	   r   r#   Moduler  r   r   r   dictr   r   r   r   r   r   listr   tupler(   r   	Optimizerlr_schedulerLambdaLRr   rG  staticmethodr  r  r  r   re  r  r  rf  rg  rh  s   @r>   rj  rj    s   %N J CG@D$(04+/EI FJ59Vbhl.2.2!h\oryy#=>?h\ _bii<=h\ y!	h\
  -h\  (h\ uWd3<.@%@ABh\ #)+=?UWeef
h\ "(N+;T+A"BCh\ D12h\ %++//1I1I1R1RRSh\ (0%,,9UW\WcWc9c0d'eh\ l+h\  "(+!h\ h\T ZeD DL\C| @ @. rvYY(,S%c8I2J-J(Kaijman	 @ %)&*,0	BISMBI smBI CcD()	BI BIrA   rj  c                   H   ^  \ rS rSrSr            SU 4S jjrSrU =r$ )UnslothGKDTraineri  a	  
    Trainer for Generalized Knowledge Distillation (GKD) of language models.

For details on GKD, see the paper: [On-Policy Distillation of Language Models: Learning from Self-Generated
Mistakes](https://huggingface.co/papers/2306.13649).

Args:
    model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
        Model to be trained, or the string identifier of the model to be instantiated from a pretrained model.
    teacher_model ([`~transformers.PreTrainedModel`] or `torch.nn.Module` or `str`, *optional*):
        Teacher model for knowledge distillation, or the string identifier of the model to be instantiated from a
        pretrained model.
    args ([`GKDConfig`], *optional*):
        Training arguments.
    data_collator ([`~transformers.DataCollator`], *optional*):
        Data collator to batch samples from the dataset. It defaults to a [`DataCollatorForChatML`] using the
        `processing_class`.
    train_dataset ([`~datasets.Dataset`], *optional*):
        Dataset for training.
    eval_dataset ([`~datasets.Dataset`] or `dict` of [`~datasets.Dataset`], *optional*):
        Dataset for evaluation.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
       Class to process the data.
    compute_metrics (`Callable`, *optional*):
        Function to compute metrics at evaluation. Must take in an [`~transformers.EvalPrediction`] and return a
        dictionary string to float.
    callbacks (`list` of [`~transformers.TrainerCallback`], *optional*):
        Callbacks to use during training.
    optimizers (`tuple` of `torch.optim.Optimizer` and `torch.optim.lr_scheduler.LambdaLR`, *optional*, defaults to `(None, None)`):
        Tuple containing the optimizer and the learning rate scheduler to use for training.
    preprocess_logits_for_metrics (`Callable`, *optional*):
        Function to preprocess the logits before computing the metrics. Must take in the `logits` and `labels` and
        return the logits to be used for metrics computation.
    peft_config ([`~peft.config.PeftConfig`], *optional*):
        PEFT configuration to use PEFT for training. If `None`, PEFT is not used. If provided, the `model` will be
        wrapped with the specified PEFT adapter.
    formatting_func (`Callable`, *optional*):
        Function to format the dataset. Must take in an example and return an example.

    c                 2  > Uc
  [        5       n[        USS5      n[        U5      [        La  Sn[        USS5      n[        U5      [        La  SnSn[        R
                  R                  SS5      S:H  nU(       d1  [        R
                  R                  SS5      S:X  a  [        S5        S	n[        R
                  R                  S
S5      n[        UR                  SS 5      =(       d    [        UR                  SS 5      nUc  UR                  5       R                  nSSKJn  U" U5      nU[        R                  :H  nU(       d  U(       a  U(       a  [        S5      eU(       d  U(       d  U(       a  [        S5      eU(       a"  SUl        SUl        S[        R
                  S'   OCU(       d<  U(       d5  US:X  a/  UUl        U(       + Ul        U(       a  SOS[        R
                  S'   [        USS 5      b-  [        USS5      S:X  a  SUl        [        USS 5      c  SUl        [        USS 5      nUb/  US:  a)  SSKJn  [-        U5      [-        S5      ::  a  [        S5        [        USS5      S:w  aL  [        USS5      nUS:X  a!  UR.                  U:  a  UR.                  Ul        [        US S 5      c
  Ub  UUl        [        US!S5      n[        U5      [        La  Sn[        US"S5      n[        U5      [        La  SnUR                   (       a  U(       a  SUl        S	Ul        UR"                  (       a  U(       a  S	Ul        SUl        U(       a  SUl        SUl        Oc[        R
                  R                  S
S5      S#:X  a  S	Ul        SUl        O0U(       d)  U(       d"  UR"                  Ul        UR                   Ul        Sn[9        5       R                  S$S 5      b  S	n[9        5       R                  S%S 5      b  S	nU(       a  S[        R
                  S&'   S'[9        5       ;  a  [;        US'5      (       d  OD[        US'S 5      n[        US'S 5      nUc'  Ub$  UR<                  n[;        US'5      (       a  UUl        Ub!  [;        US(5      (       a  UR?                  5         S)[9        5       ;   a   [;        [@        S*5      (       a  S+[@        l!        S,[9        5       ;   aU  [;        US*5      (       a  S+Ul!        [;        US)5      (       a,  [;        UR@                  S*5      (       a  S+UR@                  l!        S,[9        5       ;   a  UO[@        nSS-K"J#n   [I        UU 5      (       dx  [I        U[J        5      (       a(  S.URL                  ;  a  [O        USS/[        US0S 5      S19nO[I        U[N        5      (       a%  S.URL                  ;   a  [K        U[        US0S 5      S29nOJ[;        US35      (       a  SUl(        [;        US45      (       a  S5Ul)        [;        US65      (       a	  S7S	0Ul*        [I        UU 5      (       dx  [;        US85      (       dg  [;        US)5      (       aV  [I        U[J        5      (       a   [K        UR@                  [        US0S 5      S29nO![O        UR@                  SS/[        US0S 5      S19n/ n!SS9K+J,n"  U"" S:U!5        [        US;S 5      [Z        R\                  :X  a(  UR^                  S:  a  [        US<S5      S:w  a  SUl0        S=[9        5       ;   a!  [;        US(5      (       a  UR?                  5         [b        T%U ]  " SDUUUUUUUUU	U
UUS>.UD6  S=[9        5       ;   a!  [;        US?5      (       a  URg                  5         [;        U S@5      (       a-  U Rh                  Rk                  5         [;        U S@5      (       a  U ?4[        USAS 5      b  U Rl                  UR                  5       l6         [;        U SB5      (       aV  U Rn                  Rp                  n#Un$[;        U$S=5      (       a&  U#U$l9        U$Rt                  n$[;        U$S=5      (       a  M&  U#U$l9         [;        U SC5      (       a.  [w        [y        U Rz                  R|                  5      U 5      U l>        g )ENr   Fr   UNSLOTH_ENABLE_FULL_FINETUNINGr   r   UNSLOTH_FORCE_FLOAT32zKUnsloth: Switching to float32 training since model cannot work with float16TUNSLOTH_MIXED_PRECISIONrX   r~  torch_dtyper   )
_get_dtypezuUnsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`zuUnsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`r   ACCELERATE_MIXED_PRECISIONrp  r   r   r   rN  r   rR   )__version__z4.45.2z**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!
`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`r      r   r   r   bfloat16rr  ru  UNSLOTH_RETURN_LOGITSr   r6   ry  padding_siderightrq  )UnslothVisionDataCollatorr  rU  r/  )mlmmlm_probabilityr/  )r/  r   r&  rT  r'  rz  pad)PatchRLStatisticsgkd_trainerparallel_mode_n_gpur5   )r5   rm  r:   rn  ro  rp  rq  rr  rs  ru  rv  rw  r7   neftune_hook_handler  r  trainr=  )?r   r  typeboolr$   rB  rC  r>  r  get_input_embeddingsr~  unsloth_zoo.utilsr  r(   float16	TypeErrorr   r   r   r   transformersr  r-   r   r   r   r   r   localsr8   r   r6   ry  r  unsloth_zoo.vision_utilsr  r  r/   column_names+TransformersDataCollatorForLanguageModelingr   r&  r'  unsloth_zoo.logging_utilsr#  r1   NOT_DISTRIBUTEDn_gpur&  rF  rG  r7   r'  remover  r  scaleraccelerator_scalerr5   r2   rD   rH  r(  )&r9   r5   rm  r:   rn  ro  rp  rq  rr  rs  ru  rv  rw  r;   use_bf16use_fp16force_float32full_finetuningmixed_precision_dtyper~  r  r-  ga_stepstransformers_versioneval_bszr   r   _output_logitsmodel_max_seq_lengthargs_max_seq_lengthr   _UnslothGKDTrainer__tokenizerr  other_metricsr#  r8  current_modelrH  s&                                        r>   rG  UnslothGKDTrainer.__init__  s     < 0 24/>%%x4/>%%x**..)I3OSVVBJJNN3JC$PTW$W_` M "

/H) Tgt4bm]a8b=%"<"<">"D"D%05!5==('hy  JA  @B  :Bg(9  NE  DF  >FDIDI7;BJJ3481F)1SDI#DIAHvfBJJ344.:wt_^b?cgk?k!(Dt\408C$/4!>EHqLH+,0AA @ A4$/47t%A1EH1}!A!AH!Lpt  qQ  qQdNmt6=E(J^  @H`d`| '7?t+e^ '7?t+e^99u)<\`dFY99t)<[`TEX"'D"'DZZ^^5yAZO"&D"'D"&))D"&))D8<<)40<tn8<<7>J]aN25BJJ./68+GDBR4S4S#*52BD#I #*42BD#I"*/C/O!&!5!54!122.D4G!?!? &("wy.'I'Idk9Ka)'88Za:J:W'55'BRB\B\^l:m:m  Zao  pJ  pJ  pW*<*H&iF-)BCC-)?@@XUbUoUoEo K&))07KT)R	! M+VWW\dhu  iC  iC  ]C 6)07KT)R!
 t455TYt7Qt122bD4Kt-..G]_cFd0C-)BCC;..7;3T3Tm-CDD$:#---4T;OQU-V%M
 %P#--#*--4T;OQU-V	%M ?-7 4$/<3O3OOTXT^T^abTbtXq)Q.fh75.#A#A  	8)))'/-!,I%-	8 17	8 fh75/#B#B!4.//$$++-t2339Q4.5A?C?W?WE&&(<4''%%,,F!M-11390 - 3 3 -11 06M,4!!#$=dnn>R>R$SUYZDJrA   )r(  )NNNNNNNNNNNN)r_  r`  ra  rb  rc  rG  rf  rg  rh  s   @r>   r  r    s;    'T (,g grA   r  )Hrc  r(   r   torch.nnr#   r   r   typingr   r   r   r   r	   r
   r   r   trl.trainer.gkd_trainerr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r$   r%   r&   r'   r)   dataclassesr+   r,   packaging.versionr-   numpynp
contextlibr.   r/  r/   r0   r3  transformers.training_argsr1   rB   typesr2   rD   torch_compile_optionscompilerh   re  rs   r   r   r   rj  r  r=  rA   r>   <module>rU     s  0    $ I I I O  O  O  O  O  O  O  O  O 
  ( %   " $  3      4;PR S"||  \\	&,, %  	
 \\6ell C ELL  ~-y ~- ~-~ pI pIbP* Pd rA   