
    Y:iS              
       ~   S r SSKJr  SSKrSSKJr  SSKJr  SSKJrJ	r	J
r
JrJrJrJrJr  SSKJrJrJrJrJrJrJrJrJrJrJrJrJrJ
r
JrJrJrJ r J!r!J"r"J#r#J$r$JrJ%r%J&r&J'r'J(r(J)r)J*r*J+r+JrJ,r,J
r
JrJ r J#r#J'r'J)r)J+r+Jr  SSK+r+SSK7  SSK-J.r.J/r/  SS	K0J1r1  SSKrSSK2r3SS
K4J5r5  SSKJr  SSK6JrJr7  SSK8J9r9  SSK:r:SSK;J<r<  S r= SSSSSS.r>\R~                  " SS\>S9S 5       r@S\R                  S\AS\AS\R                  4S jrBS\R                  S\R                  S\AS\AS\R                  4
S jrCS\R                  S\AS\R                  4S jrD\. " S  S!\5      5       rE  " S" S#\#5      rF " S$ S%\F5      rG \H" \)S&5      (       a3  SSK*r* " S' S(\*R                  5      rJ \)R                  " \J" S)5      5        gg)*z;
2025.10.10
2025.10.9
4.56.2
0.23.0
__UNSLOTH_VERSIONING__
    )TensorN)
functional)AnyListOptionalTupleUnionDictSetCallable)(AutoModelForCausalLMAutoTokenizerBaseImageProcessorr   DataCollatorDataCollatorForLanguageModelingDataCollatorForSeq2Seq
DataLoaderDatasetEvalLoopOutputFeatureExtractionMixinIterativeSFTConfigIterativeSFTTrainerr   PPODecoratorsPath	PeftModelPreTrainedModelPreTrainedTokenizerBaseProcessorMixinTrainerTrainingArgumentsr	   generate_model_cardget_comet_experiment_urlis_peft_availableis_wandb_availableloggerloggingostorchwarningsr   r   r   r   r#   r%   r'   r(   )*)	dataclassfield)Version)nullcontext)r   r   )ParallelMode)
MethodTypec                 F   ^  [         R                  " T 5      U 4S j5       nU$ )Nc                 8  > [        U S5      (       a5  [        U R                  S5      (       a  U R                  R                  5         T" U /UQ70 UD6n[        U S5      (       a5  [        U R                  S5      (       a  U R                  R                  5         U$ )Nmodelfor_trainingfor_inference)hasattrr3   r4   r5   )selfargskwargsoutputfs       Q/home/james-whalen/llama.cpp/unsloth_compiled_cache/UnslothIterativeSFTTrainer.pywrapper*prepare_for_training_mode.<locals>.wrapper0   sx     4!!gdjj.&I&IJJ##%4)$)&)4!!gdjj/&J&JJJ$$&    )	functoolswraps)r;   r=   s   ` r<   prepare_for_training_moderB   /   s%    __Q  Nr?   TF)epilogue_fusionmax_autotuneshape_paddingztrace.enabledztriton.cudagraphs)dynamic	fullgraphoptionsc                 d   [         R                  " U R                  SU R                  S   5      SSS9n[         R                  " UR                  S5      SSS9n/ n[	        X#5       H  u  pVUR                  [         R                  5      n[         R                  " USUR                  S5      S9R                  S5      n[         R                  " USS9nXx-
  n	UR                  U	5        M      [         R                  " U5      nUR                  U R                  S   U R                  S   45      nU$ )N   r   )chunksdim)rM   indexrM      )r(   chunkreshapeshapeziptofloat32gather	unsqueezesqueeze	logsumexpappendconcat)
logitsrN   chunked_logitschunked_indexall_per_token_logpschunk_logitschunk_indexselected_logitslogsumexp_valuesper_token_logpss
             r<   chunked_selective_log_softmaxrf   E   s    [[FLL4D!EPQYZ[N[[r!2QaHM%(%G!#u}}5,,|2{G\G\]_G`aiijlm ??<rB)<""?3 &H 	,,':;-55v||AUV6XYr?   	input_idslogits_to_keeppad_token_idreturnc                 ~    XR                   S   :  a  [        S5      eU SS2SU* 24   nX2:H  nUR                  SS9nU$ )zr
Given prompt tensor, it returns all the left padded tokens in that sequence. so [pad, pad, pad, cat] = 3 tokens 
rP   z8logits_to_keep must be smaller than the sequence length.NrO   )rS   
ValueErrorsum)rg   rh   ri   prompt_sectionpadding_maskpad_token_countss         r<   calculate_pad_tokens_in_promptrq   W   sX     ++STTq"2N?"223N"2L#''A'.r?   completion_input_idsleft_pad_tokens_per_promptmax_left_padc                     U R                   u  pEU R                  nX!-
  n[        R                  " XVS9R	                  S5      nXR	                  S5      :  n	X:g  n
X-  nU$ )a)  
Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad]

Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens
and pad are pad tokens, this function would make a completion mask that would 0 out the pad
and p tokens. so in this example [0,0,0,1,1,1,0,0,0]
)devicer   rP   )rS   rv   r(   arangerX   )rr   rs   rt   ri   
batch_sizecompletion_lenrv   num_tokens_to_maskindices
shift_masknon_padding_mask
final_masks               r<    create_completion_attention_maskr   j   si     "6!;!;J!((F%Bll>9CCAFG88;;J,<.Jr?   tensorpad_idc                 l    X:g  n[         R                  " USSSS9n[         R                  " U SU5      nU$ )zD
Moves all padding tokens in each sequence of a batch to the right.
rP   T)rM   
descendingstable)r(   argsortrW   )r   r   masksorted_indicespacked_tensors        r<   left_pack_paddingr      s8     D]]4Q4MNLLN;Mr?   c                     ^  \ rS rSr% Sr\" SSS0S9r\\   \	S'   \" SSS	0S9r
\\   \	S
'   \" SSS0S9r\\   \	S'                                                                                                                                           SU 4S jjrSrU =r$ )UnslothIterativeSFTConfig   a  
    
Configuration class for the [`IterativeSFTTrainer`].

<Tip warning={true}>

The [`IterativeSFTTrainer`] is deprecated and will be removed in version 0.24.0. Please use the [`SFTTrainer`].

</Tip>

This class includes only the parameters that are specific to Iterative SFT training. For a full list of training
arguments, please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this
class may differ from those in [`~transformers.TrainingArguments`].

Using [`~transformers.HfArgumentParser`] we can turn this class into
[argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
command line.

Parameters:
    > Parameters that control the model

    model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
        Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model`
        argument of the [`IterativeSFTTrainer`] is provided as a string.

    > Parameters that control the data preprocessing

    max_length (`int` or `None`, *optional*, defaults to `None`):
        Maximum length of the tokenized sequence. Sequences longer than `max_length` are truncated.
    truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
        The truncation mode to use, either `"keep_end"` or `"keep_start"`.
    optimize_device_cache (`bool`, *optional*, defaults to `False`):
        Whether to optimize accelerator cache for slightly more memory-efficient training.

    NhelpzvLLM SamplingParams)defaultmetadatavllm_sampling_paramsrJ   z8Chunk size to reduce memory usage. -1 is most efficient.unsloth_num_chunksz'Maximum sequence length to truncate to.max_seq_lengthc                   > US:  a  [        SU S35        US:  a  [        SU S35        Uc  U#S:X  a
  U$S:X  a  SnS	n#[        TU ]  " S0 S
U_SU_SU_SU_SU_SU_SU_SU_SU	_SU
_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_S U_S!U_S"U_S#U_S$U_S%U_S&U_S'U_S(U_S)U _S*U!_S+U"_S,U#_S-U$_S.U%_S/U&_S0U'_S1U(_S2U)_S3U*_S4U+_S5U,_S6U-_S7U._S8U/_S9U0_S:U1_S;U2_S<U3_S=U4_S>U5_S?U6_S@U7_SAU8_SBU9_SCU:_SDU;_SEU<_SFU=_SGU>_SHU?_SIW@_SJWA_SKWB_SLWC_SMWD_SNWE_SOWF_SPWG_SQWH_SRWI_SSWJ_STWK_SUWL_SVWM_SWWN_SXWO_SYWP_SZWQ_S[WR_S\WS_S]WT_S^WU_S_WV_S`WW_SaWX_SbWY_ScWZ_SdW[_SeW\_SfW]_SgW^_ShW__SiW`_SjWa_SkWb_SlWc_SmWd_SnWe_SoWf_SpWg_SqWh_SrWi_SsWj_StWk_SuWl_SvWm_SwWn_SxWo_SyWp_SzWq_S{Wr_S|Ws_S}Wt_S~Wu_SWv_SWw_SWx_SWy_SWz_SW{_SW|_SW}_SW~_SW_SW_SW_SW_SW_SW_SW_WD6  WU l        WU l        WU l        g )NgHz>z Unsloth: Your learning rate of `zi` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!rP   za` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!steps  unsloth_training_checkpointsno
output_diroverwrite_output_dirdo_traindo_eval
do_predicteval_strategyprediction_loss_onlyper_device_train_batch_sizeper_device_eval_batch_sizeper_gpu_train_batch_sizeper_gpu_eval_batch_sizegradient_accumulation_stepseval_accumulation_steps
eval_delaytorch_empty_cache_stepslearning_rateweight_decay
adam_beta1
adam_beta2adam_epsilonmax_grad_normnum_train_epochs	max_stepslr_scheduler_typewarmup_ratiowarmup_steps	log_levellog_level_replicalog_on_each_nodelogging_dirlogging_strategylogging_first_steplogging_stepslogging_nan_inf_filtersave_strategy
save_stepssave_total_limitsave_safetensorssave_on_each_nodesave_only_model'restore_callback_states_from_checkpointno_cudause_cpuuse_mps_deviceseed	data_seedjit_mode_evaluse_ipexbf16fp16fp16_opt_levelhalf_precision_backendbf16_full_evalfp16_full_evaltf32
local_rankddp_backendtpu_num_corestpu_metrics_debugdebugdataloader_drop_last
eval_stepsdataloader_num_workersdataloader_prefetch_factor
past_indexrun_namedisable_tqdmremove_unused_columnslabel_namesload_best_model_at_endmetric_for_best_modelgreater_is_betterignore_data_skipfsdpfsdp_min_num_paramsfsdp_config"fsdp_transformer_layer_cls_to_wrapaccelerator_configparallelism_config	deepspeedlabel_smoothing_factoroptim
optim_args	adafactorgroup_by_lengthlength_column_name	report_toddp_find_unused_parametersddp_bucket_cap_mbddp_broadcast_buffersdataloader_pin_memorydataloader_persistent_workersskip_memory_metricsuse_legacy_prediction_looppush_to_hubresume_from_checkpointhub_model_idhub_strategy	hub_tokenhub_private_repohub_always_pushhub_revisiongradient_checkpointinggradient_checkpointing_kwargsinclude_inputs_for_metricseval_do_concat_batchesfp16_backendpush_to_hub_model_idpush_to_hub_organizationpush_to_hub_tokenmp_parametersauto_find_batch_sizefull_determinismtorchdynamo	ray_scopeddp_timeouttorch_compiletorch_compile_backendtorch_compile_modeinclude_tokens_per_secondinclude_num_input_tokens_seenneftune_noise_alphaoptim_target_modulesbatch_eval_metricseval_on_startuse_liger_kernelliger_kernel_configeval_use_gather_objectaverage_tokens_across_devicesmodel_init_kwargs
max_lengthtruncation_modeoptimize_device_cache )printsuper__init__r   r   r   )r7   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r	  r
  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r   r9   	__class__s                                                                                                                                             r<   r   "UnslothIterativeSFTConfig.__init__   s   X 4)I-  YB  (C  "D1e&F}o  Vw  %x  y-7":zS?P7J M E	D#E	D#7E	D  E	D 	E	D
 $E	D *E	D $8E	D +FE	D *DE	D (@E	D '>E	D +FE	D '>E	D $E	D '>E	D  *!E	D" (#E	D$ $%E	D& $'E	D( ()E	D* *+E	D,  0-E	D. "/E	D0 !21E	D2 (3E	D4 (5E	D6 "7E	D8 !29E	D:  0;E	D< &=E	D>  0?E	D@ "4AE	DB *CE	DD &<EE	DF *GE	DH $IE	DJ  0KE	DL  0ME	DN !2OE	DP .QE	DR 7^SE	DT UE	DV WE	DX ,YE	DZ [E	D\ "]E	D^ *_E	D`  aE	Db cE	Dd eE	Df ,gE	Dh &<iE	Dj ,kE	Dl ,mE	Dn oE	Dp $qE	Dr &sE	Dt *uE	Dv !2wE	Dx yE	Dz $8{E	D| $}E	D~ &<E	D@ *DAE	DB $CE	DD  EE	DF (GE	DH %:IE	DJ &KE	DL &<ME	DN %:OE	DP !2QE	DR  0SE	DT UE	DV #6WE	DX &YE	DZ 2T[E	D\ "4]E	D^ "4_E	D` "aE	Db &<cE	Dd eE	Df $gE	Dh "iE	Dj .kE	Dl "4mE	Dn "oE	Dp *DqE	Dr !2sE	Dt %:uE	Dv %:wE	Dx -JyE	Dz #6{E	D| *D}E	D~ &E	D@ &<AE	DB (CE	DD (EE	DF "GE	DH  0IE	DJ .KE	DL (ME	DN &<OE	DP -JQE	DR *DSE	DT &<UE	DV (WE	DX $8YE	DZ (@[E	D\ !2]E	D^ *_E	D` $8aE	Db  0cE	Dd &eE	Df "gE	Dh &iE	Dj *kE	Dl %:mE	Dn "4oE	Dp )BqE	Dr -JsE	Dt #6uE	Dv $8wE	Dx "4yE	Dz *{E	D|  0}E	D~ #6E	D@ &<AE	DB -JCE	DD !2EE	DF $GE	DH .IE	DJ %:FKE	DL %9!"4,r?   )r   r   r   )NNFFFr   FrK   rK   NN   r#  r      g-C6
?g{Gz?g?g+?g:0yE>g      ?g      @rJ   linear皙?r   passivewarningTNr   FrP   Fr   r   NTFFFFFFO  r)  FFFFO1autoFFNrJ   NNF FNr   NrJ   NNTNFNNFr,  r   NNNNN        
adamw_8bitNFFlengthNNNNTFTFFNN
every_saveNNFNTNFTr+  NNNr,  FFNlasti  FNNFFNNFFFNFTNNkeep_endFNrJ   N)__name__
__module____qualname____firstlineno____doc__r,   r   r   r   __annotations__r   intr   r   __static_attributes____classcell__r!  s   @r<   r   r      s   "F +012+(3-  */VW*#  &+EF&NXc]  #$&'%&#'"&&'"#"%$%""!&!27!'!$!"%) $!& $  -1!!!$%%)  $ $(-"%*!%#!%(,%*!%##' $  $!$)(-"#" "!&(, $ %#SZ- Z-r?   r   c                     ^  \ rS rSrSrSS/r       S!S\\\4   S\	\\
\4      S\	\   S	\	\\\\\4   4      S
\	\\\\\4      S\\R*                  R,                  \R*                  R.                  R0                  4   S\	\\R4                  \R4                  /\R4                  4      S\	\\/\4      4U 4S jjjrS\S\
S\4S jrS\R4                  S\R4                  S\R4                  4S jr\S\ \RB                     S\ \RB                     S\ \RB                     S\ \   S\ \   4
S j5       r"\#RH                  " 5            S"S\	\ \RB                        S\	\ \RB                        S\	\ \RB                        S\	\ \      S\	\ \      4
S jj5       r%S r&U 4S jr'   S#S\	\   S\	\   S\\\ \   S4   4S jjr(S r)U =r*$ )$_UnslothIterativeSFTTraineri  a	  
The IterativeSFTTrainer can be used to finetune models with methods that requires some steps between optimization.

<Tip warning={true}>

The [`IterativeSFTTrainer`] is deprecated and will be removed in version 0.24.0. Please use the [`SFTTrainer`].

</Tip>

Args:
    model (`Union[str, PreTrainedModel]`):
        Model to be trained. Can be either:

        - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
          path to a *directory* containing model weights saved using
          [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
          using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
          `args.model_init_kwargs`.
        - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
    args ([`IterativeSFTConfig`], *optional*, defaults to `None`):
        Configuration for this trainer. If `None`, a default configuration is used.
    data_collator (`DataCollator`, *optional*):
        Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`.
        Will default to [`~transformers.default_data_collator`] if no `processing_class` is provided, an instance
        of [`~transformers.DataCollatorWithPadding`] otherwise if the processing_class is a feature extractor or
        tokenizer.
    eval_dataset (`datasets.Dataset`):
        The dataset to use for evaluation.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`):
        Processing class used to process the data. If `None`, the processing class is loaded from the model's name
        with [`~transformers.AutoTokenizer.from_pretrained`].
    optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
        The optimizer and scheduler to use for training.
    preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
        The function to use to preprocess the logits before computing the metrics.
    compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
        The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
        metric values.
trlziterative-sftNr3   r8   data_collatoreval_datasetprocessing_class
optimizerspreprocess_logits_for_metricscompute_metricsc	                 ~  > [         R                  " S[        5        [        U[        5      (       a  UOUR
                  R                  n	Uc#  U	R                  S5      S   n
[        U
 S35      nOe[        U[        5      (       aP  [        U[        5      (       d;  UR                  5       nUR                  US'   UR                  S5        [        S0 UD6nUc  [        R                  " U	5      nUR                  b+  [        U[        5      (       d  [         R"                  " S5        [        U[        5      (       a  U R%                  X5      n['        5       (       a  [        U[(        5      (       a  SU l        OS	U l        XPl        [/        UR
                  S
S	5      U l        Uc<  U R0                  (       a  [3        USSS9U l        O [7        U R,                  S	S9U l        OX0l        UR8                  U l        UR:                  U l        UR<                  U l        [>        TU ]  UUU R4                  UUUUUS9  [C        U RD                  S5      (       a%  U RD                  RG                  U RH                  5        U RK                  U RL                  RN                  5        U RP                  RS                  U RD                  U RT                  U RV                  5      u  U l"        U l*        U l+        U R:                  S:X  a  SOSU R,                  l,        [C        U S5      (       d  [[        S5      eU R<                  [\        l        g )NzkThe `IterativeSFTTrainer` is deprecated and will be removed in version 0.24.0. Please use the `SFTTrainer`./rJ   z-IterativeSFTr   r  zYou passed model_init_kwargs to the `IterativeSFTConfig`, but your model is already instantiated. The `model_init_kwargs` will be ignored.TFis_encoder_decoder   )label_pad_token_idpad_to_multiple_of)mlm)r3   r8   r@  rA  rB  rE  rC  rD  add_model_tagsr2  leftrightacceleratorzXYour `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`.r  )/r)   warnFutureWarning
isinstancestrconfig_name_or_pathsplitr   r    to_dictr   popr   from_pretrainedr  r%   r(  _create_model_from_pathr#   r   is_peft_modelrB  getattrrH  r   r@  r   r  r  r  r  r   r6   r3   rN  
_tag_namescreate_optimizer_and_schedulerr8   r   rQ  prepare	optimizerlr_schedulertruncation_sideAttributeErrorr   )r7   r3   r8   r@  rA  rB  rC  rD  rE  model_id
model_name	dict_argsr!  s               r<   r   $_UnslothIterativeSFTTrainer.__init__  s     		
 'uc2258R8R<!,R0J%M&BCD/00DJ\9]9]I%)^^Ik"MM-.%2	2D #,<<XF !!-j6L6LNN; eS!!00=E :eY#?#?!%D!&D 0")%,,8Le"T &&%;$RS&" &ETEZEZ`e%f"!.//#33%)%?%?",,%-+!*G 	 		
 4::/00JJ%%doo6++DII,?,?@ 9=8H8H8P8PJJ(9(99
5
DND$5 ;?:N:NR\:\bi-t]++ j  /3.H.H+r?   
model_pathrj   c                 Z    UR                   =(       d    0 n[        R                  " U40 UD6$ )z0Creates a model from a path or model identifier.)r  r   r[  )r7   rj  r8   r  s       r<   r\  3_UnslothIterativeSFTTrainer._create_model_from_pathk  s*     228b#33JTBSTTr?   rg   attention_masklabelsc                    Uc&  U Vs/ s H  n[         R                  " U5      PM     nnU R                  (       a  U R                  [	        XU5       VVVs/ s H  u  pEnXEUS.PM     snnn5      R                  U R                  R                  5      nUR                  SS 5        SUS   US   U R                  R                  :H  '   OTU R                  [	        X5       VVs/ s H	  u  pEXES.PM     snn5      R                  U R                  R                  5      nU R                  b  U R                  S:X  a3  UR                  5        VV	s0 s H  u  pXS U R                   _M     nnn	U$ U R                  S:X  a4  UR                  5        VV	s0 s H  u  pXU R                  * S  _M     nnn	U$ [        SU R                   35      eU$ s  snf s  snnnf s  snnf s  sn	nf s  sn	nf )	Nrg   rm  rn  decoder_input_idsrI  rn  )rg   rm  
keep_startr2  zUnknown truncation mode: )r(   	ones_likerH  r@  rT   rU   r3   rv   rZ  rB  ri   r  r  itemsrl   )
r7   rg   rm  rn  idsattlab
input_datakvs
             r<   prepare_model_inputs0_UnslothIterativeSFTTrainer.prepare_model_inputsp  s   !>GHiseooc2iNH""++ *-Y)O)O# #&L)O
 b""#  NN.5_cJx H!59N9N9[9[![\ ++KNyKijKixss:Kijb""# 
 ??&##|3BLBRBRBTUBT$!a#4T__!55BT
U  %%3CMCSCSCUVCU41aDOO#3#5!66CU
V  !#<T=Q=Q<R!STT9 I k VVs    GG"G
GG#textstexts_labelsc           
         UGc  Uc  [        SS/X/5       Hr  u  pV[        U[        5      (       d  [        U S[	        U5       35      e[        US   [
        R                  5      (       a  MW  [        SU S[	        US   5       35      e   GOF[        / SQXU/5       Hr  u  pV[        U[        5      (       d  [        U S[	        U5       35      e[        US   [
        R                  5      (       a  MW  [        SU S[	        US   5       35      e   O[        U[        5      (       d  [        S[	        U5       35      e[        US   [        5      (       d  [        S	[	        US   5       35      eUb^  [        U[        5      (       d  [        S
[	        U5       35      e[        US   [        5      (       d  [        S[	        US   5       35      eXX#U4$ )a  
Check if the input data is valid for training.

Args:
    input_ids (list[`torch.LongTensor`]):
        List of tensors containing the input_ids
    attention_mask (list[`torch.LongTensor`]):
        List of tensors containing the attention_mask
    labels (list[`torch.FloatTensor`]):
        List of tensors containing the labels
    texts (list[`str`]):
        List of string containing the text input.
    texts_labels (list[`str`]):
        List of string containing the text labels.

Returns:
    `tuple`: The input data.
rg   rn  z! must be a list of tensors - got r   zElements in z must be tensors - got rp  z''text' must be a list of strings - got z)Elements in 'text' must be strings - got z.'text_labels' must be a list of strings - got z0Elements in 'text_labels' must be strings - got )rT   rT  listrl   typer(   r   rU  )rg   rm  rn  r}  r~  nametensor_lists          r<   _step_safety_checker0_UnslothIterativeSFTTrainer._step_safety_checker  s   4 =%),k8-DyFY)Z%D%k488(D61RSWXcSdRe)fgg%k!nellCC(<v=TUYZefgZhUiTj)kll	 *[ *-=	[a?b*%D &k488(D61RSWXcSdRe)fgg%k!nellCC(<v=TUYZefgZhUiTj)kll* eT** #J4PU;-!XYYeAh,, #LTRWXYRZ^L\!]^^'!,55$'UVZ[gVhUi%jkk!,q/377$'WX\]ijk]lXmWn%opp&EEr?   c                   ^  T R                   R                  5         T R                  R                  S:X  aY  [        R
                  " S5      R                  T R                  R                  5      T l	        T R                  R                  T l
        Uc  Uc  [        S5      eUb  Ub  [        R                  " S5        Uc  Uc  T R                  (       a  [        S5      eUb  USS OSnUb  USS OSnUb  USS OSnUb  USS OSnUb  USS OSnT R                  XX4U5      u  pp4nUb&  T R!                  UT R"                  SSSS	9nUS
   US   p!Ub   T R!                  UT R"                  SSSS	9S
   nUc  UnT R%                  XU5      n['        UR)                  5       5      n0 nUR+                  U5        U 4S jn	[,        R.                  " U5      n
U
R1                  S5        [3        U
T R                  R4                  SU	S9n[7        U5       GH  u  pT R8                  R;                  T R                   5         U Vs0 s H  oX   _M	     nnT R=                  T R                   U5      nT R                  R>                  S:  a  URA                  5       nURC                  5       nT R8                  RE                  U5        T R8                  RF                  (       a_  T R                  RH                  bH  T R8                  RK                  T R                   RM                  5       T R                  RH                  5        T RN                  RQ                  5         T RN                  RS                  5         T RT                  b  T RT                  RQ                  5         T R                  =R                  S-  sl        T =R                  U-  sl	        T RW                  5         SSS5        GM     gs  snf ! , (       d  f       GM  = f)aP  
Run an optimisation step given a list of input_ids, attention_mask, and labels or a list of text and
text_labels.

Args:
    input_ids (list[`torch.LongTensor`]):
        List of tensors containing the input_ids (if not provided, text will be used)
    attention_mask (list[`torch.LongTensor`], , *optional*):
        List of tensors containing the attention_mask
    labels (list[`torch.FloatTensor`], *optional*):
        List of tensors containing the labels (if set to None, will default to input_ids)
    texts (list[`str`], *optional*):
        List of strings containing the text input (if not provided, input_ids will directly be used)
    texts_labels (list[`str`], *optional*):
        List of strings containing the text labels (if set to None, will default to text)

Returns:
    `dict[str, Any]`: A summary of the training statistics
r   r-  Nz@Step should include `input_ids` or `texts` as keyword arguments.ztBoth `input_ids` and `texts` argument are provided. `input_ids` will be ignored. Please provide only one of the two.zNo 'labels' or 'text_labels' are provided. When using an encoder-decoder architecture, 'labels' or 'text_labels' must be passed.Tpt)r  
truncationpaddingreturn_tensorsrg   rm  c                    > [        5       nU S    HY  nUS;   d  M  [        R                  " U  Vs/ s H  o3U   PM	     sn5      R                  TR                  R
                  5      X'   M[     U$ s  snf )Nr   rp  )dictr(   stackrU   r3   rv   )datareturn_dictkeydr7   s       r<   collator2_UnslothIterativeSFTTrainer.step.<locals>.collator  sf    &KAwCC',{{D3IDqcFD3I'J'M'MdjjN_N_'`K$   4Js   A0
r(   )rx   shuffle
collate_fnrP   ),r3   trainstateglobal_stepr(   r   rU   r8   rv   tr_loss_globalstep_last_loggedrl   r%   r(  rH  r  rB  r  r{  r  keysupdater   	from_dict
set_formatr   r   	enumeraterQ  
accumulatecompute_lossn_gpumeandetachbackwardsync_gradientsr   clip_grad_norm_
parametersrb  step	zero_gradrc  _maybe_log_save_evaluate)r7   rg   rm  rn  r}  r~  model_inputsmodel_inputs_names
batch_dictr  
batch_datastep_dataloader_batchry  losstr_loss_steps   `                r<   r   _UnslothIterativeSFTTrainer.step  s   8 	

::!!Q& <<,//		0@0@ADL+/::+A+AD(_``"u'8NN6
 >l2t7N7N S 
 %.$9IaLt	.<.H*d$0d!-a4*6*B|AAEAZAZvlB
>	6, 00$//dDae 1 L )5[(A<P`Ca~#**$//dDae + F >F00FS!,"3"3"56
,'	 &&z2
g&$yy<<	
 "/2HA!!,,TZZ85GH5G585GH((\B99??Q&99;D#{{}  ))$/##22tyy7N7N7Z$$44

--/		//
 ##%((*$$0%%**,

&&!+& ,--/7 98 3H 98s   9O+>O&FO+&O++
O;	c                 \   U R                   R                  bf  U R                  R                  U R                   R                  -  S:X  a5  U R                  R                  S:w  a  U R	                  U R
                  5        U R                   R                  Gb  U R                  R                  U R                   R                  -  S:X  a  U R                  R                  S:w  a  0 nU R                  U R                  5      R                  5       R                  5       nU =R                  U R                  -  sl        [        X R                  R                  U R                  -
  -  S5      US'   U R                  5       US'   U R                  R                  U l        U R                  U5        g g g g )Nr   rK   r  r   )r8   r   r  r  evaluaterA  r   _nested_gatherr  r  itemroundr  _get_learning_ratelog)r7   logstr_loss_scalars      r<   r  4_UnslothIterativeSFTTrainer._maybe_log_save_evaluateE  sD   99+zz%%		(<(<<AdjjF\F\`aFad//0 99"".zz%%		(?(??1DI_I_cdId)+!%!4!4T\\!B!G!G!I!N!N!P ,$^zz7M7MPTPlPl7l%mopqV(,(?(?(A_%/3zz/E/E, JeD /r?   c                   > U R                   R                  c*  [        U R                   R                  5      R                  nO(U R                   R                  R                  S5      S   nU R                  US9  [        TU ]!  X5        g )NrG  rJ   )rg  )	r8   r   r   r   r  rX  create_model_cardr  _save_checkpoint)r7   r3   trialrg  r!  s       r<   r  ,_UnslothIterativeSFTTrainer._save_checkpoint]  sj    99!!)dii22388J//55c:2>J*5 .r?   rg  dataset_nametagsc                    U R                  5       (       d  g[        U R                  R                  S5      (       ac  [        R
                  R                  U R                  R                  R                  5      (       d!  U R                  R                  R                  nOSnUc  [        5       nO$[        U[        5      (       a  U1nO[        U5      n[        U R                  R                  S5      (       a  UR                  S5        S[        R                  ;   a  UR                  S5        UR                  U R                  5        [        UUU R                   UU[#        5       (       a+  [$        R&                  b  [$        R&                  R(                  OS[+        5       SS9nUR-                  [        R
                  R/                  U R0                  R2                  S	5      5        g)
a  
Creates a draft of a model card using the information available to the `Trainer`.

Args:
    model_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the model.
    dataset_name (`str` or `None`, *optional*, defaults to `None`):
        Name of the dataset used for training.
    tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
        Tags to be associated with the model card.
NrW  unsloth_versionunslothJOB_IDhf_jobszIterative SFT)
base_modelrg  r   r  r  	wandb_url	comet_urltrainer_namez	README.md)is_world_process_zeror6   r3   rV  r'   pathisdirrW  setrT  rU  addenvironr  r_  r!   r   r$   wandbrunurlr"   savejoinr8   r   )r7   rg  r  r  r  
model_cards         r<   r  -_UnslothIterativeSFTTrainer.create_model_carde  sN   " ))++4::$$o66rww}}TZZM^M^MlMl?m?m**88JJ <5Dc""6Dt9D4::$$&788HHYrzz!HHYDOO$(!!**%'9';';		@Ueiimm[_.0(	

 	TYY%9%9;GHr?   )r  r@  rH  r]  rc  r  r3   r  rb  rB  r  r  )NNNN)NNNN)NNNNN)NNN)+r3  r4  r5  r6  r7  r_  r	   rU  r   r   r   r    r   r   r  r   r   r   r   tupler(   r   	Optimizerrc  LambdaLRr   r   r   r   r\  r{  staticmethodr  
LongTensorr  r   empty_device_cacher  r  r  r  r:  r;  r<  s   @r<   r>  r>    s   &P )J
 HL04EI W
 imFJbIS/)*bI u/1BBCDbI  -	bI
 uWd3<.@%@ABbI #)+=?UWeef
bI %++//1I1I1R1RRSbI (0%,,9UW\WcWc9c0d'ebI "(N+;T+A"BCbI bIHU# U=O UTc U
ell ELL bgbnbn @ 3F(()3FU--.3F U%%&3F Cy	3F
 3i3F 3Fj %%' 7;;?37%),0|0D!1!123|0 !e&6&6!78|0 e../0	|0
 S	"|0 tCy)|0 (|0|0/ %)&*,0	4ISM4I sm4I CcD()	4I 4Ir?   r>  c                   <   ^  \ rS rSrSr      SU 4S jjrSrU =r$ )UnslothIterativeSFTTraineri  a	  
    
The IterativeSFTTrainer can be used to finetune models with methods that requires some steps between optimization.

<Tip warning={true}>

The [`IterativeSFTTrainer`] is deprecated and will be removed in version 0.24.0. Please use the [`SFTTrainer`].

</Tip>

Args:
    model (`Union[str, PreTrainedModel]`):
        Model to be trained. Can be either:

        - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
          path to a *directory* containing model weights saved using
          [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
          using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
          `args.model_init_kwargs`.
        - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
    args ([`IterativeSFTConfig`], *optional*, defaults to `None`):
        Configuration for this trainer. If `None`, a default configuration is used.
    data_collator (`DataCollator`, *optional*):
        Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`.
        Will default to [`~transformers.default_data_collator`] if no `processing_class` is provided, an instance
        of [`~transformers.DataCollatorWithPadding`] otherwise if the processing_class is a feature extractor or
        tokenizer.
    eval_dataset (`datasets.Dataset`):
        The dataset to use for evaluation.
    processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`):
        Processing class used to process the data. If `None`, the processing class is loaded from the model's name
        with [`~transformers.AutoTokenizer.from_pretrained`].
    optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
        The optimizer and scheduler to use for training.
    preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
        The function to use to preprocess the logits before computing the metrics.
    compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
        The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
        metric values.

    c                 8  > Uc
  [        5       n[        USS5      n	[        U	5      [        La  Sn	[        USS5      n
[        U
5      [        La  Sn
Sn[        R
                  R                  SS5      S:H  nU(       d1  [        R
                  R                  SS5      S:X  a  [        S5        S	n[        R
                  R                  S
S5      n[        UR                  SS 5      =(       d    [        UR                  SS 5      nUc  UR                  5       R                  nSSKJn  U" U5      nU[        R                  :H  nU(       d  U(       a  U	(       a  [        S5      eU(       d  U(       d  U
(       a  [        S5      eU(       a"  SUl        SUl        S[        R
                  S'   OCU	(       d<  U
(       d5  US:X  a/  UUl        U(       + Ul        U(       a  SOS[        R
                  S'   [        USS 5      b-  [        USS5      S:X  a  SUl        [        USS 5      c  SUl        [        USS 5      nUb/  US:  a)  SSKJn  [-        U5      [-        S5      ::  a  [        S5        [        USS5      S:w  aL  [        USS5      nUS:X  a!  UR.                  U:  a  UR.                  Ul        [        US S 5      c
  Ub  UUl        [        US!S5      n[        U5      [        La  Sn[        US"S5      n[        U5      [        La  SnUR                   (       a  U(       a  SUl        S	Ul        UR"                  (       a  U(       a  S	Ul        SUl        U(       a  SUl        SUl        Oc[        R
                  R                  S
S5      S#:X  a  S	Ul        SUl        O0U(       d)  U(       d"  UR"                  Ul        UR                   Ul        Sn[9        5       R                  S$S 5      b  S	n[9        5       R                  S%S 5      b  S	nU(       a  S[        R
                  S&'   S'[9        5       ;  a  [;        US'5      (       d  OD[        US'S 5      n[        US'S 5      nUc'  Ub$  UR<                  n[;        US'5      (       a  UUl        Ub!  [;        US(5      (       a  UR?                  5         S)[9        5       ;   a   [;        [@        S*5      (       a  S+[@        l!        S,[9        5       ;   aU  [;        US*5      (       a  S+Ul!        [;        US)5      (       a,  [;        UR@                  S*5      (       a  S+UR@                  l!        / nSS-K"J#n  U" S.U5        [        US/S 5      [H        RJ                  :X  a(  URL                  S:  a  [        US0S5      S:w  a  SUl'        S1[9        5       ;   a!  [;        US(5      (       a  UR?                  5         [P        TU ]  " S8UUUUUUUS2.UD6  S1[9        5       ;   a!  [;        US35      (       a  URU                  5         [;        U S45      (       a-  U RV                  RY                  5         [;        U S45      (       a  U ?+[        US5S 5      b  U RZ                  UR                  5       l-         [;        U S65      (       aV  U R\                  R^                  nUn[;        US15      (       a&  UUl0        URb                  n[;        US15      (       a  M&  UUl0         [;        U S75      (       a.  [e        [g        U Rh                  Rj                  5      U 5      U l5        g )9Nr   Fr   UNSLOTH_ENABLE_FULL_FINETUNING01UNSLOTH_FORCE_FLOAT32zKUnsloth: Switching to float32 training since model cannot work with float16TUNSLOTH_MIXED_PRECISIONrV   dtypetorch_dtyper   )
_get_dtypezuUnsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`zuUnsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`r   ACCELERATE_MIXED_PRECISIONrA  r   r   r   r&  r   rP   )__version__z4.45.2z**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!
`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`r   rJ  r   r   r   bfloat16rE  rD  UNSLOTH_RETURN_LOGITSr   r4   	tokenizerpadding_siderP  rB  )PatchRLStatisticsiterative_sft_trainerparallel_mode_n_gpur3   )r3   r8   r@  rA  rB  rD  rE  r5   neftune_hook_handler  rQ  r  r  )6r   r^  r  boolr'   r  getr  rV  get_input_embeddingsr  unsloth_zoo.utilsr  r(   float16	TypeErrorr   r   r   r   transformersr  r-   r   r   r   r   r   localsr6   r   r4   r  r  unsloth_zoo.logging_utilsr  r/   NOT_DISTRIBUTEDr  r  r  r   r5   r  remover  rQ  scaleraccelerator_scalerr3   r0   rB   r!  r  )r7   r3   r8   r@  rA  rB  rD  rE  r9   use_bf16use_fp16force_float32full_finetuningmixed_precision_dtyper  r  r  ga_stepstransformers_versioneval_bszr   r   _output_logitsmodel_max_seq_lengthargs_max_seq_lengthr   other_metricsr  r  current_modelr!  s                                 r<   r   #UnslothIterativeSFTTrainer.__init__  sB    < 9 ;4/>%%x4/>%%x**..)I3OSVVBJJNN3JC$PTW$W_` M "

/H) Tgt4bm]a8b=%"<"<">"D"D%05!5==('hy  JA  @B  :Bg(9  NE  DF  >FDIDI7;BJJ3481F)1SDI#DIAHvfBJJ344.:wt_^b?cgk?k!(Dt\408C$/4!>EHqLH+,0AA @ A4$/47t%A1EH1}!A!AH!Lpt  qQ  qQdNmt6=E(J^  @H`d`| '7?t+e^ '7?t+e^99u)<\`dFY99t)<[`TEX"'D"'DZZ^^5yAZO"&D"'D"&))D"&))D8<<)40<tn8<<7>J]aN25BJJ./68+GDBR4S4S#*52BD#I #*42BD#I"*/C/O!&!5!54!122.D4G!?!? &("wy.'I'Idk9Ka)'88Za:J:W'55'BRB\B\^l:m:m  Zao  pJ  pJ  pW?1=A 4$/<3O3OOTXT^T^abTbtXq)Q.fh75.#A#A  	8)'/,I-	8 17	8 fh75/#B#B!4.//$$++-t2339Q4.5A?C?W?WE&&(<4''%%,,F!M-11390 - 3 3 -11 06M,4!!#$=dnn>R>R$SUYZDJr?   )r  )NNNNNN)r3  r4  r5  r6  r7  r   r:  r;  r<  s   @r<   r  r    s)    (X (,| |r?   r  	addFilterc                        \ rS rSrS rS rSrg)HideLoggingMessageiG  c                     Xl         g Ntext)r7   r  s     r<   r   HideLoggingMessage.__init__H  s    d)r?   c                 <    U R                   UR                  5       ;  $ r  )r  
getMessage)r7   xs     r<   filterHideLoggingMessage.filterI  s    alln)DEr?   r  N)r3  r4  r5  r6  r   r  r:  r  r?   r<   r  r  G  s    2Er?   r  z`use_cache=True`)Lr7  r(   r   torch.nnnnr   Ftypingr   r   r   r   r	   r
   r   r   !trl.trainer.iterative_sft_trainerr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r)   dataclassesr+   r,   packaging.versionr-   numpynp
contextlibr.   r  +TransformersDataCollatorForLanguageModelingtransformers.training_argsr/   r@   typesr0   rB   torch_compile_optionscompilerf   r9  rq   r   r   r   r>  r  r6   Filterr  r  r  r?   r<   <module>r1     s  0    $ I I I w	  w	  w	  w	  w	  w	  w	  w	  w	  w	  w	 
  ( %   " $  3      4;PR S"||  \\	&,, %  	
 \\6ell C ELL  J- 2 J- J-V
 }I' }I|f!< fP  6;FW^^ F 	
'(:;<  r?   