
    Y:i?3             
          S r SSKJr  SSKrSSKJr  SSKJr  SSKJrJ	r	J
r
JrJrJrJrJr  SSKJrJrJrJrJrJrJrJrJrJrJ
r
JrJrJrJrJrJrJrJ r J!r!J"r"JrJ#r#J$r$J%r%J&r&J'r'J(r(J)r)J*r*J+r+J,r,J-r-J.r.JrJ/r/J0r0J1r1J2r2J3r3JrJ4r4JrJrJrJrJrJ
r
JrJ/r/J0r0J1r1J4r4J
r
JrJ r J-r-J/r/JrJ/r/  SSK/r/SSK7  SSK5J%r%J6r6  SS	K7J8r8  SSKrSSK9r:SS
K$J;r;  SSKJr  SSK4J<r<Jr=  SSK>J?r?  SSK@r@SSKAJBrB  S rC SSSSSS.rD\R                  " SS\DS9S 5       rFS\R                  S\GS\GS\R                  4S jrHS\R                  S\R                  S\GS\GS\R                  4
S jrIS\R                  S\GS\R                  4S jrJ\% " S  S!\5      5       rK  " S" S#\ 5      rL " S$ S%\L5      rM \N" \-S&5      (       a3  SSK.r. " S' S(\.R                  5      rP \-R                  " \P" S)5      5        gg)*z;
2025.10.10
2025.10.9
4.56.2
0.23.0
__UNSLOTH_VERSIONING__
    )TensorN)
functional)AnyListOptionalTupleUnionDictSetCallable)<r   
AutoConfigAutoProcessorr   DataCollatorDataCollatorForLanguageModeling%DataCollatorForVisionLanguageModelingDatasetEvalPredictionIterableDatasetr   Path
PeftConfigPreTrainedModelPreTrainedTokenizerBaseProcessorMixin	SFTConfig
SFTTrainerTrainerTrainerCallbackTrainingArgumentsr	   clone_chat_template
contextlib	dataclassdefaultdictdft_lossgenerate_model_cardget_act_offloading_ctx_managerget_comet_experiment_urlis_conversationalis_wandb_availableloggerloggingnnospack_datasetpadprepare_peft_modelselective_log_softmaxtorchtransformersr   r   r   r   r   r   r	   r,   r-   r.   r2   r   r   r   r)   r,   r1   r,   )*)r!   field)Version)nullcontextDataCollatorForSeq2Seqr   )ParallelMode)
MethodTypec                 F   ^  [         R                  " T 5      U 4S j5       nU$ )Nc                 8  > [        U S5      (       a5  [        U R                  S5      (       a  U R                  R                  5         T" U /UQ70 UD6n[        U S5      (       a5  [        U R                  S5      (       a  U R                  R                  5         U$ )Nmodelfor_trainingfor_inference)hasattrr=   r>   r?   )selfargskwargsoutputfs       H/home/james-whalen/llama.cpp/unsloth_compiled_cache/UnslothSFTTrainer.pywrapper*prepare_for_training_mode.<locals>.wrapper0   sx     4!!gdjj.&I&IJJ##%4)$)&)4!!gdjj/&J&JJJ$$&    )	functoolswraps)rE   rG   s   ` rF   prepare_for_training_moderL   /   s%    __Q  NrI   TF)epilogue_fusionmax_autotuneshape_paddingztrace.enabledztriton.cudagraphs)dynamic	fullgraphoptionsc                 d   [         R                  " U R                  SU R                  S   5      SSS9n[         R                  " UR                  S5      SSS9n/ n[	        X#5       H  u  pVUR                  [         R                  5      n[         R                  " USUR                  S5      S9R                  S5      n[         R                  " USS9nXx-
  n	UR                  U	5        M      [         R                  " U5      nUR                  U R                  S   U R                  S   45      nU$ )N   r   )chunksdim)rW   indexrW      )r1   chunkreshapeshapeziptofloat32gather	unsqueezesqueeze	logsumexpappendconcat)
logitsrX   chunked_logitschunked_indexall_per_token_logpschunk_logitschunk_indexselected_logitslogsumexp_valuesper_token_logpss
             rF   chunked_selective_log_softmaxrp   E   s    [[FLL4D!EPQYZ[N[[r!2QaHM%(%G!#u}}5,,|2{G\G\]_G`aiijlm ??<rB)<""?3 &H 	,,':;-55v||AUV6XYrI   	input_idslogits_to_keeppad_token_idreturnc                 ~    XR                   S   :  a  [        S5      eU SS2SU* 24   nX2:H  nUR                  SS9nU$ )zr
Given prompt tensor, it returns all the left padded tokens in that sequence. so [pad, pad, pad, cat] = 3 tokens 
rZ   z8logits_to_keep must be smaller than the sequence length.NrY   )r]   
ValueErrorsum)rq   rr   rs   prompt_sectionpadding_maskpad_token_countss         rF   calculate_pad_tokens_in_promptr{   W   sX     ++STTq"2N?"223N"2L#''A'.rI   completion_input_idsleft_pad_tokens_per_promptmax_left_padc                     U R                   u  pEU R                  nX!-
  n[        R                  " XVS9R	                  S5      nXR	                  S5      :  n	X:g  n
X-  nU$ )a)  
Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad]

Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens
and pad are pad tokens, this function would make a completion mask that would 0 out the pad
and p tokens. so in this example [0,0,0,1,1,1,0,0,0]
)devicer   rZ   )r]   r   r1   arangerb   )r|   r}   r~   rs   
batch_sizecompletion_lenr   num_tokens_to_maskindices
shift_masknon_padding_mask
final_masks               rF    create_completion_attention_maskr   j   si     "6!;!;J!((F%Bll>9CCAFG88;;J,<.JrI   tensorpad_idc                 l    X:g  n[         R                  " USSSS9n[         R                  " U SU5      nU$ )zD
Moves all padding tokens in each sequence of a batch to the right.
rZ   T)rW   
descendingstable)r1   argsortra   )r   r   masksorted_indicespacked_tensors        rF   left_pack_paddingr      s8     D]]4Q4MNLLN;MrI   c                     ^  \ rS rSr% Sr\" SSS0S9r\\   \	S'   \" SSS	0S9r
\\   \	S
'   \" SSS0S9r\\   \	S'                                                                                                                                                        SU 4S jjrSrU =r$ )UnslothSFTConfig   a  
    
Configuration class for the [`SFTTrainer`].

This class includes only the parameters that are specific to SFT training. For a full list of training arguments,
please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may
differ from those in [`~transformers.TrainingArguments`].

Using [`~transformers.HfArgumentParser`] we can turn this class into
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
command line.

Parameters:
    > Parameters that control the model

    model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
        Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model`
        argument of the [`SFTTrainer`] is provided as a string. If you're training a MoE architecture and want to
        include the load balancing/auxilliary loss as a part of the final loss, remember to set
        `output_router_logits=True` in this dictionary.
    chat_template_path (`str` or `None`, *optional*, defaults to `None`):
        If specified, sets the model's chat template. This can either be the path to a tokenizer (local directory
        or Hugging Face Hub model) or a direct path to a Jinja template file. When using a Jinja file, you must
        ensure that any special tokens referenced in the template are added to the tokenizer and that the model's
        embedding layer is resized accordingly.

    > Parameters that control the data preprocessing

    dataset_text_field (`str`, *optional*, defaults to `"text"`):
        Name of the column that contains text data in the dataset.
    dataset_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
        Dictionary of optional keyword arguments for the dataset preparation. The only supported key is
        `skip_prepare_dataset`. When the model is a VLM, `skip_prepare_dataset` is automatically treated as `True`
        regardless of the provided value, since preprocessing is done on the fly.
    dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
        Number of processes to use for processing the dataset.
    eos_token (`str` or `None`, *optional*, defaults to `None`):
        Token used to indicate the end of a turn or sequence. If `None`, it defaults to
        `processing_class.eos_token`.
    pad_token (`int` or `None`, *optional*, defaults to `None`):
        Token used for padding. If `None`, it defaults to `processing_class.pad_token`, or if that is also `None`,
        it falls back to `processing_class.eos_token`.
    max_length (`int` or `None`, *optional*, defaults to `1024`):
        Maximum length of the tokenized sequence. Sequences longer than `max_length` are truncated from the right.
        If `None`, no truncation is applied. When packing is enabled, this value sets the sequence length.
    packing (`bool`, *optional*, defaults to `False`):
        Whether to group multiple sequences into fixed-length blocks to improve computational efficiency and reduce
        padding. Uses `max_length` to define sequence length.
    packing_strategy (`str`, *optional*, defaults to `"bfd"`):
        Strategy for packing sequences. Can be either `"bfd"` (best-fit decreasing, default), or `"wrapped"`.
    padding_free (`bool`, *optional*, defaults to `False`):
        Whether to perform forward passes without padding by flattening all sequences in the batch into a single
        continuous sequence. This reduces memory usage by eliminating padding overhead. Currently, this is only
        supported with the FlashAttention 2 or 3, which can efficiently handle the flattened batch structure. When
        packing is enabled with strategy `"bfd"`, padding-free is enabled, regardless of the value of this
        parameter.
    pad_to_multiple_of (`int` or `None`, *optional*, defaults to `None`):
        If set, the sequences will be padded to a multiple of this value.
    eval_packing (`bool` or `None`, *optional*, defaults to `None`):
        Whether to pack the eval dataset. If `None`, uses the same value as `packing`.

    > Parameters that control the training

    completion_only_loss (`bool` or `None`, *optional*, defaults to `None`):
        Whether to compute loss only on the completion part of the sequence. If set to `True`, loss is computed
        only on the completion, which is supported only for [prompt-completion](#prompt-completion) datasets. If
        `False`, loss is computed on the entire sequence. If `None` (default), the behavior depends on the dataset:
        loss is computed on the completion for [prompt-completion](#prompt-completion) datasets, and on the full
        sequence for [language modeling](#language-modeling) datasets.
    assistant_only_loss (`bool`, *optional*, defaults to `False`):
        Whether to compute loss only on the assistant part of the sequence. If set to `True`, loss is computed only
        on the assistant responses, which is supported only for [conversational](#conversational) datasets. If
        `False`, loss is computed on the entire sequence.
    loss_type (`str`, *optional*, defaults to `"nll"`):
        Type of loss to use. Possible values are `"nll"` (negative log-likelihood, default) and `"dft"` (Dynamic
        Fine-Tuning, as described in [this paper](https://huggingface.co/papers/2508.05629)).
    activation_offloading (`bool`, *optional*, defaults to `False`):
        Whether to offload the activations to the CPU.

    NhelpzvLLM SamplingParams)defaultmetadatavllm_sampling_paramsrT   z8Chunk size to reduce memory usage. -1 is most efficient.unsloth_num_chunksz'Maximum sequence length to truncate to.max_seq_lengthc                   > US:  a  [        SU S35        US:  a  [        SU S35        Uc  U#S:X  a
  U$S:X  a  SnS	n#Wc$  S
SKJn  [        [	        U" 5       S-   S5      S5      n[
        R                  R                  SS5      S:X  a  S
SKJ	n  U(       a  Wc  S
SKJ
n  Un[        TU ]0  " S0 SU_SU_SU_SU_SU_SU_SU_SU_SU	_SU
_SU_SU_S U_S!U_S"U_S#U_S$U_S%U_S&U_S'U_S(U_S)U_S*U_S+U_S,U_S-U_S.U_S/U_S0U_S1U_S2U_S3U _S4U!_S5U"_S6U#_S7U$_S8U%_S9U&_S:U'_S;U(_S<U)_S=U*_S>U+_S?U,_S@U-_SAU._SBU/_SCU0_SDU1_SEU2_SFU3_SGU4_SHU5_SIU6_SJU7_SKU8_SLU9_SMU:_SNU;_SOU<_SPU=_SQU>_SRU?_SSW@_STWA_SUWB_SVWC_SWWD_SXWE_SYWF_SZWG_S[WH_S\WI_S]WJ_S^WK_S_WL_S`WM_SaWN_SbWO_ScWP_SdWQ_SeWR_SfWS_SgWT_ShWU_SiWV_SjWW_SkWX_SlWY_SmWZ_SnW[_SoW\_SpW]_SqW^_SrW__SsW`_StWa_SuWb_SvWc_SwWd_SxWe_SyWf_SzWg_S{Wh_S|Wi_S}Wj_S~Wk_SWl_SWm_SWn_SWo_SWp_SWq_SWr_SWs_SWt_SWu_SWv_SWw_SWx_SWy_SWz_SW{_SW|_SW}_SW~_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_SW_WD6  WU l        WU l        WU l        g )NgHz>z Unsloth: Your learning rate of `zi` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!rZ   za` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!steps  unsloth_training_checkpointsnor   	cpu_countrU      @   UNSLOTH_ENABLE_FLEX_ATTENTION01)HAS_FLEX_ATTENTION)FLEX_ATTENTION_BLOCK_SIZE
output_diroverwrite_output_dirdo_traindo_eval
do_predicteval_strategyprediction_loss_onlyper_device_train_batch_sizeper_device_eval_batch_sizeper_gpu_train_batch_sizeper_gpu_eval_batch_sizegradient_accumulation_stepseval_accumulation_steps
eval_delaytorch_empty_cache_stepslearning_rateweight_decay
adam_beta1
adam_beta2adam_epsilonmax_grad_normnum_train_epochs	max_stepslr_scheduler_typewarmup_ratiowarmup_steps	log_levellog_level_replicalog_on_each_nodelogging_dirlogging_strategylogging_first_steplogging_stepslogging_nan_inf_filtersave_strategy
save_stepssave_total_limitsave_safetensorssave_on_each_nodesave_only_model'restore_callback_states_from_checkpointno_cudause_cpuuse_mps_deviceseed	data_seedjit_mode_evaluse_ipexbf16fp16fp16_opt_levelhalf_precision_backendbf16_full_evalfp16_full_evaltf32
local_rankddp_backendtpu_num_corestpu_metrics_debugdebugdataloader_drop_last
eval_stepsdataloader_num_workersdataloader_prefetch_factor
past_indexrun_namedisable_tqdmremove_unused_columnslabel_namesload_best_model_at_endmetric_for_best_modelgreater_is_betterignore_data_skipfsdpfsdp_min_num_paramsfsdp_config"fsdp_transformer_layer_cls_to_wrapaccelerator_configparallelism_config	deepspeedlabel_smoothing_factoroptim
optim_args	adafactorgroup_by_lengthlength_column_name	report_toddp_find_unused_parametersddp_bucket_cap_mbddp_broadcast_buffersdataloader_pin_memorydataloader_persistent_workersskip_memory_metricsuse_legacy_prediction_looppush_to_hubresume_from_checkpointhub_model_idhub_strategy	hub_tokenhub_private_repohub_always_pushhub_revisiongradient_checkpointinggradient_checkpointing_kwargsinclude_inputs_for_metricseval_do_concat_batchesfp16_backendpush_to_hub_model_idpush_to_hub_organizationpush_to_hub_tokenmp_parametersauto_find_batch_sizefull_determinismtorchdynamo	ray_scopeddp_timeouttorch_compiletorch_compile_backendtorch_compile_modeinclude_tokens_per_secondinclude_num_input_tokens_seenneftune_noise_alphaoptim_target_modulesbatch_eval_metricseval_on_startuse_liger_kernelliger_kernel_configeval_use_gather_objectaverage_tokens_across_devicesmodel_init_kwargschat_template_pathdataset_text_fielddataset_kwargsdataset_num_proc	eos_token	pad_token
max_lengthpackingpacking_strategypadding_freepad_to_multiple_ofeval_packingcompletion_only_lossassistant_only_loss	loss_typeactivation_offloading )printmultiprocessingr   minmaxr,   environgetunsloth_zoo.flex_attentionr   r   super__init__r   r   r   )rA   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r	  r
  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r!  r"  r#  r$  r%  r&  r'  r(  r)  r*  r+  r,  r-  r.  r/  r0  r1  r2  r3  r4  r5  r6  r7  r8  r9  r:  r;  r<  r   r   r   rC   r   r   r   	__class__s                                                                                                                                                             rF   rF  UnslothSFTConfig.__init__   s   r 4)I-  YB  (C  "D1e&F}o  Vw  %x  y-7":zS?P7J M#1"3y{1}a#8"=::>>93?3FE!&8&@P%>" 	 R	D#R	D#7R	D  R	D 	R	D
 $R	D *R	D $8R	D +FR	D *DR	D (@R	D '>R	D +FR	D '>R	D $R	D '>R	D  *!R	D" (#R	D$ $%R	D& $'R	D( ()R	D* *+R	D,  0-R	D. "/R	D0 !21R	D2 (3R	D4 (5R	D6 "7R	D8 !29R	D:  0;R	D< &=R	D>  0?R	D@ "4AR	DB *CR	DD &<ER	DF *GR	DH $IR	DJ  0KR	DL  0MR	DN !2OR	DP .QR	DR 7^SR	DT UR	DV WR	DX ,YR	DZ [R	D\ "]R	D^ *_R	D`  aR	Db cR	Dd eR	Df ,gR	Dh &<iR	Dj ,kR	Dl ,mR	Dn oR	Dp $qR	Dr &sR	Dt *uR	Dv !2wR	Dx yR	Dz $8{R	D| $}R	D~ &<R	D@ *DAR	DB $CR	DD  ER	DF (GR	DH %:IR	DJ &KR	DL &<MR	DN %:OR	DP !2QR	DR  0SR	DT UR	DV #6WR	DX &YR	DZ 2T[R	D\ "4]R	D^ "4_R	D` "aR	Db &<cR	Dd eR	Df $gR	Dh "iR	Dj .kR	Dl "4mR	Dn "oR	Dp *DqR	Dr !2sR	Dt %:uR	Dv %:wR	Dx -JyR	Dz #6{R	D| *D}R	D~ &R	D@ &<AR	DB (CR	DD (ER	DF "GR	DH  0IR	DJ .KR	DL (MR	DN &<OR	DP -JQR	DR *DSR	DT &<UR	DV (WR	DX $8YR	DZ (@[R	D\ !2]R	D^ *_R	D` $8aR	Db  0cR	Dd &eR	Df "gR	Dh &iR	Dj *kR	Dl %:mR	Dn "4oR	Dp )BqR	Dr -JsR	Dt #6uR	Dv $8wR	Dx "4yR	Dz *{R	D|  0}R	D~ #6R	D@ &<AR	DB -JCR	DD !2ER	DF "4GR	DH "4IR	DJ ,KR	DL  0MR	DN "OR	DP "QR	DR $SR	DT UR	DV  0WR	DX (YR	DZ "4[R	D\ (]R	D^ $8_R	D` #6aR	Db "cR	Dd %:FeR	Df %9!"4,rI   )r   r   r   )NNFFFr   FrU   rU   NNr   r   r      g-C6
?g{Gz?g?g+?g:0yE>g      ?g      @rT   linear皙?r   passivewarningTNr   FrZ   Fr   r   NTFFFFFFO  rN  FFFFO1autoFFNrT   NNF FNr   NrT   NNTNFNNFrQ  r   NNNNN        
adamw_8bitNFFlengthNNNNTFTFFNN
every_saveNNFNTNFTrP  NNNrQ  FFNlasti  FNNFFNNFFFNFTNNtextNNNN   FbfdFNNNFnllFNrT   N)__name__
__module____qualname____firstlineno____doc__r4   r   r   r   __annotations__r   intr   rF  __static_attributes____classcell__rG  s   @rF   r   r      sC   O` +012+(3-  */VW*#  &+EF&NXc]  #$&'%&#'"&&'"#"%$%""!&!27!'!$!"%) $!& $  -1!!!$%%)  $ $(-"%*!%#!%(,%*!%##' $  $!$)(-"#" "!&(, !# !## %#m}- }-rI   r   c                    ,  ^  \ rS rSrSrSS/r             S&S\\\R                  \
4   S\\\\4      S\\   S	\\\\4      S
\\\\\\4   4      S\\\\4      S\\   S\\\/\4      S\\\      S\\\R4                  R6                     \\R4                  R8                  R:                     4   S\\\\R4                  R6                     \\\4   4      S\\\R@                  \R@                  /\R@                  4      S\S   S\\\/\4      4U 4S jjjr!S\\\4   S\"S\\\/\4      S\S\\\4   4
S jr#S r$S'U 4S jjr%U 4S jr&S(S\\\'4   S\\'   SS4U 4S  jjjr(U 4S! jr)   S)S"\\   S\\   S#\\\\   S4   4S$ jjr*S%r+U =r,$ )*_UnslothSFTTraineri,  aZ  
Trainer for Supervised Fine-Tuning (SFT) method.

This class is a wrapper around the [`~transformers.Trainer`] class and inherits all of its attributes and methods.

Example:

```python
from datasets import load_dataset
from trl import SFTTrainer

dataset = load_dataset("roneneldan/TinyStories", split="train[:1%]")

trainer = SFTTrainer(model="Qwen/Qwen2-0.5B-Instruct", train_dataset=dataset)
trainer.train()
```

Args:
    model (`Union[str, PreTrainedModel]`):
        Model to be trained. Can be either:

        - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
          path to a *directory* containing model weights saved using
          [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
          using `<ModelArchitecture>.from_pretrained` (where `<ModelArchitecture>` is derived from the model
          config) with the keyword arguments in `args.model_init_kwargs`.
        - A [`~transformers.PreTrainedModel`] object.
        If you're training a model with an MoE architecture and want to include the load balancing/auxilliary loss
        as a part of the final loss, remember to set the `output_router_logits` config of the model to `True`.
    args ([`SFTConfig`], *optional*, defaults to `None`):
        Configuration for this trainer. If `None`, a default configuration is used.
    data_collator ([`~transformers.DataCollator`] or `None`, *optional*):
        Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`.
        Will default to [`~trainer.sft_trainer.DataCollatorForLanguageModeling`] if the model is a language model
        and [`~trainer.sft_trainer.DataCollatorForVisionLanguageModeling`] if the model is a vision-language model.
    train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
        Dataset to use for training. SFT supports both [language modeling](#language-modeling) type and
        [prompt-completion](#prompt-completion) type. The format of the samples can be either:

        - [Standard](dataset_formats#standard): Each sample contains plain text.
        - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
          and content).

        The trainer also supports processed datasets (tokenized) as long as they contain an `input_ids` field.
    eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
        Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`] or `None`, *optional*, defaults to `None`):
        Processing class used to process the data. If `None`, the processing class is loaded from the model's name
        with [`~transformers.AutoProcessor.from_pretrained`]. A padding token, `tokenizer.pad_token`, must be set.
        If the processing class has not set a padding token, `tokenizer.eos_token` will be used as the default.
    compute_loss_func (`Callable` or `None`, *optional*, defaults to `None`):
        A function that accepts the raw model outputs, labels, and the number of items in the entire accumulated
        batch (batch_size * gradient_accumulation_steps) and returns the loss. For example, see the default [loss
        function](https://github.com/huggingface/transformers/blob/052e652d6d53c2b26ffde87e039b723949a53493/src/transformers/trainer.py#L3618)
        used by [`Trainer`].
    compute_metrics (`Callable[[EvalPrediction], dict]` or `None`, *optional*, defaults to `None`):
        The function that will be used to compute metrics at evaluation. Must take a
        [`~transformers.EvalPrediction`] and return a dictionary string to metric values. When passing
        [`SFTConfig`] with `batch_eval_metrics` set to `True`, your `compute_metrics` function must take a boolean
        `compute_result` argument. This will be triggered after the last eval batch to signal that the function
        needs to calculate and return the global summary statistics rather than accumulating the batch-level
        statistics.
    callbacks (list of [`~transformers.TrainerCallback`] or `None`, *optional*, defaults to `None`):
        List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
        in [here](https://huggingface.co/docs/transformers/main_classes/callback).

        If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
        method.
    optimizers (`tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]]`, *optional*, defaults to `(None, None)`):
        A tuple containing the optimizer and the scheduler to use. Will default to an instance of `AdamW` on your
        model and a scheduler given by [`~transformers.get_linear_schedule_with_warmup`] controlled by `args`.
    optimizer_cls_and_kwargs (`tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*, defaults to `None`):
        A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args` in
        `args`. Incompatible with the `optimizers` argument.

        Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before
        initializing the Trainer.
    preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*, defaults to `None`):
        A function that preprocess the logits right before caching them at each evaluation step. Must take two
        tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
        by this function will be reflected in the predictions received by `compute_metrics`.

        Note that the labels (second parameter) will be `None` if the dataset does not have them.
    peft_config ([`~peft.PeftConfig`] or `None`, *optional*, defaults to `None`):
        PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
    formatting_func (`Callable` or `None`, *optional*, defaults to `None`):
        Formatting function applied to the dataset before tokenization. Applying the formatting function explicitly
        converts the dataset into a [language modeling](#language-modeling) type.
trlsftNr=   rB   data_collatortrain_dataseteval_datasetprocessing_classcompute_loss_funccompute_metrics	callbacks
optimizersoptimizer_cls_and_kwargspreprocess_logits_for_metricspeft_configr   formatting_funcc                   > UcP  [        U[        5      (       a  UOUR                  R                  nUR	                  S5      S   n[        U S35      nOe[        U[        5      (       aP  [        U[
        5      (       d;  UR                  5       nUR                  US'   UR                  S5        [        S90 UD6nUR                  =(       d    0 n[        U[        5      (       a  UnUR                  S5      n[        U[        R                  5      (       d	  US:X  d  Uc  O@[        U[        5      (       a  US;   a  [        [        U5      nUUS'   O[        S	U S
35      e[         R"                  " U5      n[        [$        UR&                  S   5      nUR"                  " U40 UD6nO9UR                  R                  nUR                  b  [(        R*                  " S5        Uc  [,        R"                  " U5      n[        U[.        5      (       a  UR0                  nSU l        O*[        U[4        5      (       a
  UnSU l        O[7        S5      eUR8                  bM  UR8                  nUR;                  U5      nUc&  [        SU SUR<                  R>                   S35      eUUl         URB                  b  [D        RF                  RI                  URB                  5      (       aU  URB                  RK                  S5      (       a5  [M        URB                  SS9 nURO                  5       Ul(        S S S 5        / nO[S        XURB                  5      u  pnO/ nU R2                  (       a  URT                  (       a  [        S5      eU R2                  (       a  URV                  (       a  [        S5      eU R2                  (       a  URX                  (       a  [        S5      e SU l1         URV                  =(       d"    URT                  =(       a    URj                  S:H  U l+        UR                  Rl                  S;   nU RV                  (       a  Ub  [        S5      eURT                  (       a&  URj                  S:X  a  [(        R*                  " S5        U(       d  [(        R*                  " S5        URn                  S:X  a'  URT                  (       d  [(        R*                  " S5        [q        [s        U5      5      nURt                  c  S U;   =(       a    S!U;   U l:        OURt                  U l:        Uc  U R2                  (       d  URv                  =(       d    URv                  =(       d    UR8                  nUR;                  U5      nUc&  [        S"U SUR<                  R>                   S#35      e[y        UU Rt                  U RV                  UURz                  S$9nOIUcF  U R2                  (       a5  [}        UUR~                  U Rt                  URz                  UR                  S%9nURT                  (       a-  URj                  S:X  a  U(       d  [(        R*                  " S&5        URX                  (       a  [        U5      (       d  [        S'5      eUR                  S L=(       a    UR                  R                  S(S5      =(       d    U R2                  n U (       d  U Rt                  (       a  U(       a  [        S)5      eU R                  XFX"RT                  US*5      nUb  UR                  c  URT                  OUR                  n![        U[        5      (       a:  UR                  5        V"V#s0 s H  u  n"n#U"U R                  U#XbU!UU"5      _M     nn"n#OU R                  XVUU!US+5      nUR                  S,:X  a  O>UR                  S-:X  a  Ub  [        S.5      e[        nO[        S/UR                   S035      e[        [        5      [        [        5      S1.U lK        SU lL        [        T$U G]9  UUUUUUUUU	U
UUS29  U R                  R                  (       a  [        U R                  S39U lS        O[        R                  " 5       U lS        [        U R                  S45      (       a%  U R                  R                  U R                  5        [        UR                  S5S5      U lY        [        UR                  S6S75      U lZ        U R                  (       a(  U R                  S7:X  a  [(        R*                  " S85        g g g ! , (       d  f       GN= fs  sn#n"f ):N/rT   z-SFTr  r  dtyperP  )bfloat16float16r`   zInvalid `dtype` passed to `SFTConfig`. Expected either 'auto' or a string representing a valid `torch.dtype` (e.g., 'float32'), but got .r   zYou passed `model_init_kwargs` to the `SFTConfig`, but your model is already instantiated. The `model_init_kwargs` will be ignored.FzWThe `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`zThe specified `eos_token` ('zC') is not found in the vocabulary of the given `processing_class` (zX). Ensure that the `eos_token` exists in the vocabulary before using it as an EOS token.)z.jinjaz.j2zutf-8)encodingzaPacking is not supported for vision-language models. Please set `packing=False` in the SFTConfig.zzPadding-free training is yet not supported for vision-language models. Please set `padding_free=False` in the `SFTConfig`.zAssistant-only loss is not yet supported for vision-language models. Please set `assistant_only_loss=False` in the `SFTConfig`.rY  )flash_attention_2flash_attention_3z"kernels-community/vllm-flash-attn3zHPassing a custom data collator is not supported when using padding-free.wrappedzYou are passing `padding_free=True` with the 'wrapped' packing strategy, which is not recommended. Please refer to the documentation to understand why this is not recommended.a  Padding-free training is enabled, but the attention implementation is not set to 'flash_attention_2'. Padding-free training flattens batches into a single sequence, and 'flash_attention_2' is the only known attention mechanism that reliably supports this. Using other implementations may lead to unexpected behavior. To ensure compatibility, set `attn_implementation='flash_attention_2'` in the model configuration, or verify that your attention mechanism can handle flattened sequences.rZ   zYou are using a per_device_train_batch_size of 1 with padding-free training. Using a batch size of 1 anihilate the benefits of padding-free training. Please consider increasing the batch size to at least 2.prompt
completionzThe specified `pad_token` ('z[). Ensure that the `pad_token` exists in the vocabulary before using it as a padding token.)rs   r9  r6  return_position_idsr7  )	processorr3  r9  r7  r.  a$  You are using packing, but the attention implementation is not set to 'flash_attention_2' or 'kernels-community/vllm-flash-attn3'. Packing flattens batches into a single sequence, and Flash Attention is the only known attention mechanisms that reliably support this. Using other implementations may lead to cross-contamination between batches. To avoid this, either disable packing by setting `packing=False`, or set `attn_implementation='flash_attention_2'` or `attn_implementation='kernels-community/vllm-flash-attn3'` in the model configuration.zYou set `assistant_only_loss=True`, but the dataset is not conversational. This option is only supported for conversational datasets.skip_prepare_datasetaE  A formatting function was provided while `completion_only_loss=True`, which is incompatible. Using a formatter converts the dataset to a language modeling type, conflicting with completion-only loss. To resolve this, apply your formatting function before passing the dataset, or disable `completion_only_loss` in `SFTConfig`.trainevalrZ  dftzYou passed a `compute_loss_func` together with `loss_type='dft'` to the `SFTTrainer`. When using `loss_type='dft'`, the loss function is internally set to the DFT loss, so passing a `compute_loss_func` is not allowed.zInvalid `loss_type` z. passed. Supported values are 'nll' and 'dft'.)r  r  )r=   rB   ri  rj  rk  rl  rm  rn  ro  rp  rq  rr  )r=   add_model_tagsoutput_router_logitsrouter_aux_loss_coefrR  a-  You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to `0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary loss.r=  )[
isinstancestrconfig_name_or_pathsplitr   r   to_dictr  popr,  rC  r1   rw  getattrrv   r   from_pretrainedr2   architecturesr)   rM  r   r   	tokenizer_is_vlmr   	TypeErrorr1  convert_tokens_to_idsrG  r[  eos_token_idr-  r,   pathisfileendswithopenreadchat_templater   r4  r6  r:  trainable_token_indicesextendmodules_to_savere   num_virtual_tokensr/   active_adapterrs  r5  _attn_implementationr   nextiterr9  r2  r   r7  r   r3  r.  r'   r/  _prepare_datasetr8  dictitemsr;  r#   r"   list_metrics_total_train_tokensrE  rF  rB   r<  r%   r=    maybe_activation_offload_contextr    r6   r@   r  
_tag_namesaux_loss_enabledaux_loss_coef)%rA   r=   rB   ri  rj  rk  rl  rm  rn  ro  rp  rq  rr  rs  rt  
model_name	dict_argsr,  model_idrw  r  architecturer  r1  r  chat_template_fileadded_tokenspeft_model_configuse_flash_attentiondataset_sampler2  rs   r  r4  keydatasetrG  s%                                       rF   rF  _UnslothSFTTrainer.__init__  sJ   $ <",UC"8"8ell>X>XJ#))#.r2J
|401D/00D)9T9TI%)^^Ik"MM-.)y)D !228beS!!H%))'2E%--&EME3''E5W,Wu--2!'* HHMwaQ   //9F"<1E1Ea1HIL 00O=NOE||11H%%1? #,<<XF &77(22I DL(*ABB(I DLuvv>>%I$::9EL# 29+ >++;+E+E+N+N*O PII 
 &2I""".ww~~d55664;R;R;[;[\m;n;n$11GDHZ5G5L5L5N$2 E!8KT-D-D95 L <<DLLs  <<D--;  <<D44B  4 #$ !--b$,,2a4CXCX\aCa#ll?? D
 

 ( !kll|| 5 5 Bp 'J //14T\\% d=12$$,(0N(B(e|WeGeD%(,(A(AD%  T)*=*=TATATI$::9EL# 29+ >++;+E+E+N+N*O PLL 
 <)%)%>%>!..$7#'#:#:M "t||A*??%)%>%>#'#:#:#'#:#:M <<D11U:CVNNi ##,=n,M,M9  t+f0C0C0G0GH^`e0fvjnjvjv 	 $((_ Q  !11||_V]M '*.*;*;*C$,,IZIZlD11 -9,>,>,@$,@LC T227<LT[]lnqrr,@ ! $L
 $(#8#8$gX^$L
 >>U"^^u$ , : 
 !)3DNN3CCqrss #.d"3[=NO#$  	''%-/+!%=*G 	 	
  99**4RY]YcYc4dD14>4J4J4LD1 4::/00JJ%%doo6 '6Le T$U\\3I3O  T%7%73%>NN &? C EDJ$s   d2##e2
er  r4  dataset_namert   c           	        ^^^^^^^  [        U[        5      (       a  U$  0 n[        U[        5      n[        US5      n	UmU	(       a  UR                  m[        USS5      mTS:X  a  [        USS5      mTS:X  a  [        U SS5      mTS:X  a  [        U SS5      mTS:X  a  [        S5      e[        USS5      mTS:g  mS	mS
n
[        [        [        U5      5      R                  5       5      nS/nSU;   a  UR                  S5        SSKJnJn  SU;   aR  U	(       a*  [        TS5      (       d  [        SUR                   S35      eU" T5      U l        UR                  S5        S	n
O\SU;   a@  U	(       a*  [        TS5      (       d  [        SUR                   S35      eU" TS	S9U l        S	n
OTU;  a  S
mTc  [        S5      e U
(       Ga  T(       a@  T" [        [        U5      5      5      n[        U["        5      (       d  [%        S5      eUS   nO[        [        U5      5      T   S   n[        USS5      nUS:X  a  U	(       a  [        TSS5      nUc  SnS
m[        USS 5      n[        TSS 5      nU=(       d    UnUb)  UR'                  U5      (       d  UU;   a  S	m[)        S5         UUUUUUU4S jn [        U[*        5      (       d0  [        USS 5      nUc  SSKJn  [1        U" 5       S-   S5      nUUS'   OUR2                  R4                  US'   U(       a	  S T S!3US"'   UR6                  " U4S#S
0UD6nU	(       a  [        US5      (       d  U" TS	S9nUU l          U(       aP   [8          TS:X  a  [%        S%5      eU(       a	  S&U S'3US"'   [9        UR;                  U5      T[        US(S)5      U5      n U$ !    GN= f!   [)        S$5        Us $ = f)*Nr  r3  r   r   max_seqz1Unsloth: max_seq_length is 0! Please specify one!r.  rW  FTrq   attention_maskr7   labelsr.   z	Unsloth: z does not have .pad!)mlmz-Unsloth: You must specify a `formatting_func`zIUnsloth: The `formatting_func` should return a list of processed strings.r  rQ  	bos_tokenzHUnsloth: We found double BOS tokens - we shall remove one automatically.c                 <   > T" T(       d  U T   OT" U 5      TTSTS9$ )NF)
truncationr3  return_token_type_idsadd_special_tokensr=  )exampler  r.  do_formatting_funcdo_truncationrt  r   r  s    rF   	_tokenize6_UnslothSFTTrainer._prepare_dataset.<locals>._tokenize   s/     7IG./_fOg!.!/,1); rI   r0  r   rU   r   num_procr   zUnsloth: Tokenizing ["z"]descbatchedzPUnsloth: Hugging Face's packing is currently buggy - we're disabling it for now!z:When packing is enabled, `max_seq_length` can't be `None`.zUnsloth: Packing z datasetr5  rY  )r  ConstantLengthDatasetr   r@   r  r  RuntimeErrorsetr  r  keysre   r2   r8   r   rG  ri  r  rv   
startswithr>  r   r?  r   rA  _ex_iterabler   mapr-   select_columns)rA   r  rl  rB   r4  rt  r  
map_kwargsuse_descis_vlmdo_tokenizecolumn_namesused_column_namesr8   r   	test_textr  bos_token_1bos_token_2r  r  r0  r   ri  r  r.  r  r  r   r  s        `                  @@@@@@rF   r  #_UnslothSFTTrainer._prepare_dataset  s   	'#899'>9 
gw/);7$	/999 !|Q7Q?OQR1SQ?OQR1SQy!1LQl3f&g g$T+?H&!+" 4W.3356(M|+$$%56 	Y|#gi77"Y/?/I/I.JJ^#_``!7	!BD$$X.KL(gi77"Y/?/I/I.JJ^#_``!@RW!XDK|3!%&"#RSS!+Dg,?@	!)T22$c  &aL	 g/0BCAF	 $$4orJM"v '	?B G$ " "&!"2KFK!)[$?K#2{I$''	22i=6P).&de  g77#*41CT#J #+9'*9;q=!'<$)9
:&+2+?+?+J+J
<(0FGYFZZ\.]F+kk)JtJzJG g&6>> ?	QV W%2"
 " !]^^0A,x.XF+"&&'890%8	G 	A	`his   N> ,O >OOc                 h    U R                   c%  U R                  (       a
  / SQU l         g / SQU l         g g )N)messagesr  r  images)rq   r  seq_lengthscompletion_maskassistant_masks)_signature_columnsr  )rA   s    rF    _set_signature_columns_if_needed3_UnslothSFTTrainer._set_signature_columns_if_neededQ  s,    
 ""*||*X'*v'	 +rI   c                 (   > [         TU ]  UUUUS9nU$ )N)return_outputsnum_items_in_batch)rE  compute_loss)rA   r=   inputsr  r  outputsrG  s         rF   r  _UnslothSFTTrainer.compute_loss\  s*    '&+!3	 ' 
 rI   c                 r   > U R                      [        TU ]  " U0 UD6sS S S 5        $ ! , (       d  f       g = fN)r  rE  training_step)rA   rB   rC   rG  s      rF   r   _UnslothSFTTrainer.training_stepf  s*    227($9&9 322s   (
6logs
start_timec           	        > U R                   R                  (       a  SOSnU R                  U   R                  5        VVs0 s H  u  pEU[	        U5      [        U5      -  _M     nnnUS:X  a(  UR                  5        VVs0 s H  u  pESU 3U_M     nnnUR                  U5        [        TU ]!  X5        U R                  U   R                  5         g s  snnf s  snnf )Nr  r  eval_)
r=   trainingr  r  rw   lenupdaterE  logclear)rA   r  r  moder  valmetricsrG  s          rF   r  _UnslothSFTTrainer.logj  s    **--w6<@MM$<O<U<U<WX<W3C3s8++<WX 6>:A--/J/hcse}c)/GJGD%d!!# Y
 Ks   $CCc                   > U R                   R                  c*  [        U R                   R                  5      R                  nO(U R                   R                  R                  S5      S   nU R                  US9  [        TU ]!  X5        g )Nrv  rT   )r  )	rB   r  r   r   namer  create_model_cardrE  _save_checkpoint)rA   r=   trialr  rG  s       rF   r  #_UnslothSFTTrainer._save_checkpointx  sj    99!!)dii22388J//55c:2>J*5 .rI   r  tagsc                    U R                  5       (       d  g[        U R                  R                  S5      (       ac  [        R
                  R                  U R                  R                  R                  5      (       d!  U R                  R                  R                  nOSnUc  [        5       nO$[        U[        5      (       a  U1nO[        U5      n[        U R                  R                  S5      (       a  UR                  S5        S[        R                  ;   a  UR                  S5        UR                  U R                  5        [        UUU R                   U[#        U5      [%        5       (       a+  [&        R(                  b  [&        R(                  R*                  OS[-        5       SS9nUR/                  [        R
                  R1                  U R2                  R4                  S	5      5        g)
a  
Creates a draft of a model card using the information available to the `Trainer`.

Args:
    model_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the model.
    dataset_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the dataset used for training.
    tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
        Tags to be associated with the model card.
Nr  unsloth_versionunslothJOB_IDhf_jobsSFT)
base_modelr  r  r  r	  	wandb_url	comet_urltrainer_namez	README.md)is_world_process_zeror@   r=   r  r,   r  isdirr  r  r  r  addrB  r  r  r$   r  r  r(   wandbrunurlr&   savejoinrB   r   )rA   r  r  r	  r  
model_cards         rF   r  $_UnslothSFTTrainer.create_model_card  sR   " ))++4::$$o66rww}}TZZM^M^MlMl?m?m**88JJ <5Dc""6Dt9D4::$$&788HHYrzz!HHYDOO$(!!**%d'9';';		@Ueiimm[_.0	

 	TYY%9%9;GHrI   )r  r  r  r  r  r  r9  ri  r  r  r6  )NNNNNNNN)NNNNNN)FNr  )NNN)-r[  r\  r]  r^  r_  r  r	   r  r+   Moduler   r   r   r   r   r   r   r  r   r   r   r   r  r   tupler1   r   	Optimizerlr_schedulerLambdaLRtyper   r   rF  boolr  r  r  r  floatr  r  r  rb  rc  rd  s   @rF   rf  rf  ,  s   Xt J
 ?C04CGEIUY04FJ59jvaehl.2;?xS"))_45x uY(99:;x  -	x
  g&> ?@x uWd3<.@%@ABx #5)@.)P#QRx $H-x "(N+;T+A"BCx D12x (5;;#8#898EKKD\D\DeDe;ffgx #+5ekk6K6K1LdSVX[S[n1\+]"^x (0%,,9UW\WcWc9c0d'ex l+x "(D63;"78x xt	Lw/0L
 L "(D63;"78L L 
w'	(L\	w:$S%Z( $huo $QU $ $/ %)&*,0	4ISM4I sm4I CcD()	4I 4IrI   rf  c                   H   ^  \ rS rSrSr            SU 4S jjrSrU =r$ )UnslothSFTTraineri  ad  
    
Trainer for Supervised Fine-Tuning (SFT) method.

This class is a wrapper around the [`~transformers.Trainer`] class and inherits all of its attributes and methods.

Example:

```python
from datasets import load_dataset
from trl import SFTTrainer

dataset = load_dataset("roneneldan/TinyStories", split="train[:1%]")

trainer = SFTTrainer(model="Qwen/Qwen2-0.5B-Instruct", train_dataset=dataset)
trainer.train()
```

Args:
    model (`Union[str, PreTrainedModel]`):
        Model to be trained. Can be either:

        - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
          path to a *directory* containing model weights saved using
          [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
          using `<ModelArchitecture>.from_pretrained` (where `<ModelArchitecture>` is derived from the model
          config) with the keyword arguments in `args.model_init_kwargs`.
        - A [`~transformers.PreTrainedModel`] object.
        If you're training a model with an MoE architecture and want to include the load balancing/auxilliary loss
        as a part of the final loss, remember to set the `output_router_logits` config of the model to `True`.
    args ([`SFTConfig`], *optional*, defaults to `None`):
        Configuration for this trainer. If `None`, a default configuration is used.
    data_collator ([`~transformers.DataCollator`] or `None`, *optional*):
        Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`.
        Will default to [`~trainer.sft_trainer.DataCollatorForLanguageModeling`] if the model is a language model
        and [`~trainer.sft_trainer.DataCollatorForVisionLanguageModeling`] if the model is a vision-language model.
    train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
        Dataset to use for training. SFT supports both [language modeling](#language-modeling) type and
        [prompt-completion](#prompt-completion) type. The format of the samples can be either:

        - [Standard](dataset_formats#standard): Each sample contains plain text.
        - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
          and content).

        The trainer also supports processed datasets (tokenized) as long as they contain an `input_ids` field.
    eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
        Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`] or `None`, *optional*, defaults to `None`):
        Processing class used to process the data. If `None`, the processing class is loaded from the model's name
        with [`~transformers.AutoProcessor.from_pretrained`]. A padding token, `tokenizer.pad_token`, must be set.
        If the processing class has not set a padding token, `tokenizer.eos_token` will be used as the default.
    compute_loss_func (`Callable` or `None`, *optional*, defaults to `None`):
        A function that accepts the raw model outputs, labels, and the number of items in the entire accumulated
        batch (batch_size * gradient_accumulation_steps) and returns the loss. For example, see the default [loss
        function](https://github.com/huggingface/transformers/blob/052e652d6d53c2b26ffde87e039b723949a53493/src/transformers/trainer.py#L3618)
        used by [`Trainer`].
    compute_metrics (`Callable[[EvalPrediction], dict]` or `None`, *optional*, defaults to `None`):
        The function that will be used to compute metrics at evaluation. Must take a
        [`~transformers.EvalPrediction`] and return a dictionary string to metric values. When passing
        [`SFTConfig`] with `batch_eval_metrics` set to `True`, your `compute_metrics` function must take a boolean
        `compute_result` argument. This will be triggered after the last eval batch to signal that the function
        needs to calculate and return the global summary statistics rather than accumulating the batch-level
        statistics.
    callbacks (list of [`~transformers.TrainerCallback`] or `None`, *optional*, defaults to `None`):
        List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
        in [here](https://huggingface.co/docs/transformers/main_classes/callback).

        If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
        method.
    optimizers (`tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]]`, *optional*, defaults to `(None, None)`):
        A tuple containing the optimizer and the scheduler to use. Will default to an instance of `AdamW` on your
        model and a scheduler given by [`~transformers.get_linear_schedule_with_warmup`] controlled by `args`.
    optimizer_cls_and_kwargs (`tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*, defaults to `None`):
        A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args` in
        `args`. Incompatible with the `optimizers` argument.

        Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before
        initializing the Trainer.
    preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*, defaults to `None`):
        A function that preprocess the logits right before caching them at each evaluation step. Must take two
        tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
        by this function will be reflected in the predictions received by `compute_metrics`.

        Note that the labels (second parameter) will be `None` if the dataset does not have them.
    peft_config ([`~peft.PeftConfig`] or `None`, *optional*, defaults to `None`):
        PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
    formatting_func (`Callable` or `None`, *optional*, defaults to `None`):
        Formatting function applied to the dataset before tokenization. Applying the formatting function explicitly
        converts the dataset into a [language modeling](#language-modeling) type.

    c                   > Uc
  [        5       n[        USS5      n[        U5      [        La  Sn[        USS5      n[        U5      [        La  SnSn[        R
                  R                  SS5      S:H  nU(       d1  [        R
                  R                  SS5      S:X  a  [        S5        S	n[        R
                  R                  S
S5      n[        UR                  SS 5      =(       d    [        UR                  SS 5      nUc  UR                  5       R                  nSSKJn  U" U5      nU[        R                  :H  nU(       d  U(       a  U(       a  [        S5      eU(       d  U(       d  U(       a  [        S5      eU(       a"  SUl        SUl        S[        R
                  S'   OCU(       d<  U(       d5  US:X  a/  UUl        U(       + Ul        U(       a  SOS[        R
                  S'   [        USS 5      b-  [        USS5      S:X  a  SUl        [        USS 5      c  SUl        [        USS 5      nUb/  US:  a)  SSKJn  [-        U5      [-        S5      ::  a  [        S5        [        USS5      S:w  aL  [        USS5      nUS:X  a!  UR.                  U:  a  UR.                  Ul        [        US S 5      c
  Ub  UUl        [        US!S5      n[        U5      [        La  Sn[        US"S5      n[        U5      [        La  SnUR                   (       a  U(       a  SUl        S	Ul        UR"                  (       a  U(       a  S	Ul        SUl        U(       a  SUl        SUl        Oc[        R
                  R                  S
S5      S#:X  a  S	Ul        SUl        O0U(       d)  U(       d"  UR"                  Ul        UR                   Ul        Sn[9        5       R                  S$S 5      b  S	n[9        5       R                  S%S 5      b  S	nU(       a  S[        R
                  S&'   S'[9        5       ;  a  [;        US'5      (       d  OD[        US'S 5      n[        US'S 5      nUc'  Ub$  UR<                  n[;        US'5      (       a  UUl        S([9        5       ;  a  [;        US(5      (       d  O[;        US'5      (       aL  UR<                  b?  UR<                  S:  a/  [;        US(5      (       a  UR<                  Ul        UR>                  n O~[        US'S 5      n!U!c  [        US(S 5      n!U!b  U!Ul        UR>                  n OJ[;        US(5      (       a'  UR>                  b  UR>                  n [A        US'U 5        O[        S)5        S*Ul        Ub!  [;        US+5      (       a  URC                  5         S,[9        5       ;   a  [;        W"S-5      (       a  S.U"l"        S/[9        5       ;   aU  [;        US-5      (       a  S.Ul"        [;        US,5      (       a,  [;        URF                  S-5      (       a  S.URF                  l"        S/[9        5       ;   a  UOW"n#SS0K$J%n$  [M        UU$5      (       dx  [M        U[N        5      (       a(  S1URP                  ;  a  [S        U#SS2[        US3S 5      S49nO[M        U[R        5      (       a%  S1URP                  ;   a  [O        U#[        US3S 5      S59nOJ[;        US65      (       a  SUl*        [;        US75      (       a  S8Ul+        [;        US95      (       a	  S:S	0Ul,        [M        UU$5      (       dx  [;        U#S;5      (       dg  [;        U#S,5      (       aV  [M        U[N        5      (       a   [O        U#RF                  [        US3S 5      S59nO![S        U#RF                  SS2[        US3S 5      S49n/ n%SS<K-J.n&  U&" S=U%5        [        R
                  R                  S>S85      R_                  S?5      n'SS@K0J1n(  SSAK2J3n)  S,[9        5       ;  a  Un"U(" UW"UU'SBSC9  U)" UU"U5        [        USDS 5      [h        Rj                  :X  a(  URl                  S:  a  [        USES5      S:w  a  SUl7        SF[9        5       ;   a!  [;        US+5      (       a  URC                  5         [p        T,U ]  " SMUUUUUUUUU	U
UUUSG.UD6  SF[9        5       ;   a!  [;        USH5      (       a  URu                  5         [;        U SI5      (       a-  U Rv                  Ry                  5         [;        U SI5      (       a  U ?;[        USJS 5      b  U Rz                  UR                  5       l=         [;        U SK5      (       aV  U R|                  R~                  n*Un+[;        U+SF5      (       a&  U*U+l@        U+R                  n+[;        U+SF5      (       a  M&  U*U+l@         [;        U SL5      (       a.  [        [        U R                  R                  5      U 5      U lE        g )NNr   Fr   UNSLOTH_ENABLE_FULL_FINETUNINGr   r   UNSLOTH_FORCE_FLOAT32zKUnsloth: Switching to float32 training since model cannot work with float16TUNSLOTH_MIXED_PRECISIONr`   rw  torch_dtyper   )
_get_dtypezuUnsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`zuUnsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`r   ACCELERATE_MIXED_PRECISIONrk  r   r   r   rK  r   rZ   )__version__z4.45.2z**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!
`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`r      r   r   r   rx  rn  rr  UNSLOTH_RETURN_LOGITSr   r3  zgUnsloth: We did not find `max_seq_length` or `max_length` in the model or args. We will set it to 1024.rX  r>   r  padding_siderightrl  )UnslothVisionDataCollatorr  rR  r7  )r  mlm_probabilityr7  )r7  r   r.  rQ  r/  r  r.   )PatchRLStatisticssft_trainerUNSLOTH_IGNORED_TOKENIZER_NAMES
)fix_untrained_tokens)fix_zero_training_lossgؗҜ<)epsparallel_mode_n_gpur=   )r=   rB   ri  rj  rk  rl  rm  rn  ro  rq  rr  rs  rt  r?   neftune_hook_handler$  acceleratorr  r=  )Fr   r  r#  r$  r,   rB  rC  r>  r  get_input_embeddingsrw  unsloth_zoo.utilsr-  r1   ry  r  r   r   r   r   r2   r/  r5   r   r   r   r   r   localsr@   r   r3  setattrr>   r2  r  unsloth_zoo.vision_utilsr4  r  r8   r  +TransformersDataCollatorForLanguageModelingr   r.  r/  unsloth_zoo.logging_utilsr6  r  unsloth_zoo.tokenizer_utilsr:  unsloth_zoo.training_utilsr;  r9   NOT_DISTRIBUTEDn_gpur>  rE  rF  r?   r?  remover$  r@  scaleraccelerator_scalerr=   r:   rL   rG  r  )-rA   r=   rB   ri  rj  rk  rl  rm  rn  ro  rq  rr  rs  rt  rC   use_bf16use_fp16force_float32full_finetuningmixed_precision_dtyperw  r-  ry  ga_stepstransformers_versioneval_bszr   r   _output_logitsmodel_max_seq_lengthargs_max_seq_lengthr   r3  model_max_lengthr  _UnslothSFTTrainer__tokenizerr4  other_metricsr6  IGNORED_TOKENIZER_NAMESr:  r;  rM  current_modelrG  s-                                               rF   rF  UnslothSFTTrainer.__init__  s   " < 0 24/>%%x4/>%%x**..)I3OSVVBJJNN3JC$PTW$W_` M "

/H) Tgt4bm]a8b=%"<"<">"D"D%05!5==('hy  JA  @B  :Bg(9  NE  DF  >FDIDI7;BJJ3481F)1SDI#DIAHvfBJJ344.:wt_^b?cgk?k!(Dt\408C$/4!>EHqLH+,0AA @ A4$/47t%A1EH1}!A!AH!Lpt  qQ  qQdNmt6=E(J^  @H`d`| '7?t+e^ '7?t+e^99u)<\`dFY99t)<[`TEX"'D"'DZZ^^5yAZO"&D"'D"&))D"&))D8<<)40<tn8<<7>J]aN25BJJ./68+GDBR4S4S#*52BD#I #*42BD#I"*/C/O!&!5!54!122.D4Gvx'l0K0Kt-..43F3F3RW[WjWjmnWn4..&*&9&9DO!%J#*52BD#I #+|]a@b-=#/&6DO!%JT<00T__5P!%JE#3Z@  D  E&*DO!?!? &("wy.'I'Idk9Ka)'88Za:J:W'55'BRB\B\^l:m:m  Zao  pJ  pJ  pW*<*H&iF-)BCC-)?@@XUbUoUoEo K&))07KT)R	! M+VWW\dhu  iC  iC  ]C 6)07KT)R!
 t455TYt7Qt122bD4Kt-..G]_cFd0C-)BCC;..7;3T3Tm-CDD$:#---4T;OQU-V%M
 %P#--#*--4T;OQU-V	%M ?-7"$**..1RTV"W"]"]^b"cDFfh&4D	UI}>U]bcui? 4$/<3O3OOTXT^T^abTbtXq)Q.fh75.#A#A  	8))'/ 1-!'?,I%-	8 17	8 fh75/#B#B!4.//$$++-t2339Q4.5A?C?W?WE&&(<4''%%,,F!M-11390 - 3 3 -11 06M,4!!#$=dnn>R>R$SUYZDJrI   )r  )NNNNNNNNNNNN)r[  r\  r]  r^  r_  rF  rb  rc  rd  s   @rF   r'  r'    s<    Z|  #'(,C CrI   r'  	addFilterc                        \ rS rSrS rS rSrg)HideLoggingMessagei  c                     Xl         g r  rW  )rA   rW  s     rF   rF  HideLoggingMessage.__init__  s    d)rI   c                 <    U R                   UR                  5       ;  $ r  )rW  
getMessage)rA   xs     rF   filterHideLoggingMessage.filter  s    alln)DErI   rd  N)r[  r\  r]  r^  rF  ri  rb  r=  rI   rF   rb  rb    s    2ErI   rb  z`use_cache=True`)Rr_  r1   r   torch.nnr+   r   Ftypingr   r   r   r   r	   r
   r   r   trl.trainer.sft_trainerr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r,   r-   r.   r/   r0   r2   dataclassesr4   packaging.versionr5   numpynpr6   r8   rF  transformers.training_argsr9   rJ   typesr:   rL   torch_compile_optionscompilerp   ra  r{   r   r   r   rf  r'  r@   Filterrb  r`  r=  rI   rF   <module>rx     s  0    $ I I I Y  Y  Y  Y  Y  Y  Y  Y  Y  Y  Y  Y  Y  Y  Y  Y 
  ( %   " $  3      4;PR S"||  \\	&,, %  	
 \\6ell C ELL  Z-y Z- Z-v H
I H
IR_* _B	  6;FW^^ F 	
'(:;<  rI   