
    oiU                        S SK r S SKrS SKrS SKJr  S SKJr  S SKJr  S SK	J
r  S SKJr  S SKrS SKJrJrJrJrJr  SS	KJrJr  SS
KJr  S SKrS SKr/ SQr\ R:                  S 5       r \ R>                          SS\S\S\\    S\\    S\\    S\\    S\\    S\\    S\\    S\4S jj5       r! S r" S r# S r$ S SK%J&r&  \& " S S5      5       r' S r(g)     N)set_seed)get_scheduler)Trainer)seed_worker)tqdm)AnyOptionalListDictTuple   )
_get_dtypeVersion)dtype_from_config)fix_zero_training_lossunsloth_trainprepare_model_for_trainingc                    [        U[        R                  5      (       a  g[        U5      S:X  a  gUS   n[	        U5      [
        L a  SU;   a  SnSn[        U5       HG  u  pc[        [        US   5      5      n[        U5      S:X  a  US   S:X  a  US-  nOUS-  nUS:  d  MG    O    US:X  a  US:X  a  gXDU-   -  S:X  a  [        S5      eXDU-   -  S:  a  [        S	5        g!    M  = f)
z\
Sometimes the labels get masked by all -100s, causing the loss
to be 0. We check for this!
Nr   labelsr   d   ax  Unsloth: All labels in your dataset are -100. Training losses will be all 0.
For example, are you sure you used `train_on_responses_only` correctly?
Or did you mask our tokens incorrectly? Maybe this is intended?
Maybe you're using a Llama chat template on a non Llama model for example?If you used `train_on_responses_only`, confirm your user and assistant parts are correct!g?a  Unsloth: Nearly all labels in your dataset are -100. Training losses will be all 0.
For example, are you sure you used `train_on_responses_only` correctly?
Or did you mask our tokens incorrectly? Maybe this is intended?
Maybe you're using a Llama chat template on a non Llama model for example?If you used `train_on_responses_only`, confirm your user and assistant parts are correct!)
isinstancedatasetsIterableDatasetlentypedict	enumeratelistsetZeroDivisionErrorprint)model	tokenizertrain_datasetrowseen_bad	seen_goodicheck_tokenss           T/home/james-whalen/.local/lib/python3.13/site-packages/unsloth_zoo/training_utils.pyr   r   '   s    -!9!9:: 	
=Q 
CCyDX_ 	.FA#'CM(:#;L< A%,q/T*A8q=8q.)Cx / 	 q=Y!^VI-.!3#l  I-.#5l 	3 Hs   "C##C(r#   use_gradient_checkpointinguse_reentrantfull_finetuningtrain_layernormstrain_embeddingtrain_lm_headfloat32_mixed_precisionpatch_modules_to_savereturnc	           
         US;   d   e[        U5      [        L d   e[        U5      [        L d   e[        U5      [        L d   e[        U5      [        L d   e[        U5      [        L d   e[        U5      [        L d   e[        [        U R                  5      5      n	[
        R                  n
U	[
        R                  :X  aP  [
        R                  n
S[        R                  S'   U(       a%  Xl
        [
        R                  U R                  l        OU	[
        R                  :X  aW  U(       aP  [
        R                  n
S[        R                  S'   U(       a%  Xl
        [
        R                  U R                  l        O[U	[
        R                  :X  a$  [
        R                  n
S[        R                  S'   O#[
        R                  n
S[        R                  S'    U R                  5        GH  u  pSnSnU(       d  SU;   d  SU;   d  SU;   a  S	nS	nOGSnODU(       a  S
U;   d  SU;   a  S	nS	nU(       a  SU;   d  SU;   a  S	nSnU(       a  SU;   a  S	nSnOS	nSn U(       a  UR                  S	5        OUR                  S5        U(       a  UR                  SSS5      n[         R"                  " SU5      b2  [         R$                  " SSU5      n[         R"                  " SU5      b  M2  UR                  SSS5      nU(       a  [
        R                  OU
n	 ['        U S[)        U	5       S35         S
U;   d	  SU;   d  GMU  [        R                  R+                  SS5      S:X  d  GM|   UR                  SSS5      n[         R"                  " SU5      b2  [         R$                  " SSU5      n[         R"                  " SU5      b  M2  UR                  SSS5      n['        U S[)        [
        R                  5       S35        GM      U n[-        US5      (       aS  US:X  a  S	Ul        US	:X  a!  [-        US5      (       a  UR1                  5         UR2                  n[-        US5      (       a  MS   US:X  a  S	Ul        US	:X  a!  [-        US5      (       a  UR1                  5         [-        U S5      (       aH  US;   a  U R5                  5         O1U R7                  5        H  n[-        US 5      (       d  M  SUl        M     U(       aD  [-        U S!5      (       a  U R;                  5         O"S" nU R=                  5       R?                  U5         U(       Ga.   S#S$K J!n  U RE                  5        GH  u  nn[        U5      UL d  S%U;   a  [G        US&S 5      b  URH                  R                  S5        [G        US'S 5      b  URJ                  R7                  5        H  n[-        US(5      (       d  M  URL                  RN                  [
        R                  :X  aA  [Q        S)U S*35        URS                  [
        R                  5        UR                  S	5        M  [Q        S+U S,35        UR                  S	5        M       GM       U $ !   ['        SU S[)        U	5       S35         GNo= f!   ['        SU S[)        [
        R                  5       S35         GM  = f!   S n GN= f)-N)TFunslothfloat32UNSLOTH_MIXED_PRECISIONbfloat16Fz.lora_A.z.lora_B.z.lora_magnitude_vectorTznorm.
_layernormembed_tokens	embeddinglm_head
base_modelr#   r   z	\.(\d+)\.z[\1].z.weight z.to()zmodel.UNSLOTH_UPCAST_LAYERNORM01r6   gradient_checkpointing_enable_set_gradient_checkpointing)Tr6   gradient_checkpointingenable_input_require_gradsc                 &    UR                  S5        g )NT)requires_grad_)moduleinputoutputs      r+   make_inputs_require_grad<prepare_model_for_training.<locals>.make_inputs_require_grad   s    %%d+    r   )ModulesToSaveWrapperModulesToSaveoriginal_modulemodules_to_saveweightzUnsloth: Upcasting `zS` from float16 to float32 since it's in `modules_to_save`. Also allowing gradients.z!Unsloth: Allowing gradients for `z"` since it's in `modules_to_save`.)*r   boolr   r   configtorchr7   float16osenviron_unsloth_original_dtypetorch_dtyper9   named_parametersrI   replaceresearchsubexecstrgethasattr!_offloaded_gradient_checkpointingrD   r#   rE   modulesrF   rG   get_input_embeddingsregister_forward_hook
peft.utilsrP   named_modulesgetattrrR   rS   rT   dtyper"   to)r#   r,   r-   r.   r/   r0   r1   r2   r3   rm   mixed_precision_dtypenameparamupcastrequires_gradmrJ   rM   rP   saved_modules                       r+   r   r   [   s    &)BBCB$&'& D()( !T)*) D()($&'&'(D010(67E!MM %09

,- ,1)',}}ELL$	%..	 %< %09

,-,1)',}}ELL$	%..	  %0:

,- %09

,---/T!Z4%7;SW[;[ $ %W_8L $Nd$:kT>Q $)t"3 $ $  &  ' <<gq9D))L$/;vvlHd; ))L$/;<<	2q1D%+EMM1FE7vT#e*Q/0 	tO|t3Hbdg9hlo9o	?||L'1=iid3?66,$?D iid3?||Ir15vT#emm"4!5Q78g 0n 	 	A
!W

%226A/%-'!=\2]2]++-GG !W

 	!Y..2+!T)ga9X.Y.Y	'') u344%)::--/  --/6#;<<49F1 *
 5677,,.,&&(>>?WX 	(7 "//1LD&F|33$7N6#4d;G**99%@6#4d;G(.(>(>(F(F(H"<::+2288EMMI %(<TF  CV  'W  !X , > , ; ;D A %(I$Oq&r s , ; ;D A )I ! 2" 	L_7vdV4E
|156?vdV4EMM(:';1=>>R	(#' s+   Y>'AZ!;Z![ >Z!+[[c                 |   U R                   S:  a  [        S5      e U R                  nU R                  nX4-  nU R                  nUS:  a  XV-  n[
        R                  " Xq-  5      nORU R                  n[
        R                  " X-  5      n	[
        R                  " X-  5      n[
        R                  " U5      nXVU4$ )Nr   zOUnsloth currently does not support multi GPU setups - but we are working on it!r   )
world_sizeRuntimeErrorper_device_train_batch_sizegradient_accumulation_steps	max_stepsmathceilnum_train_epochs)
training_argsn_training_samplesr%   bszgatotal_train_batch_sizer{   total_samples_seenr~   steps_per_epochs
             r+   get_max_stepsr     s     !#lmm

3
3C

3
3B X''I1}3?99%7%LM(9999%7%PQIIo@A	99%56!.>>>rO   c                     SU l         [        U S5      (       a&  U R                  n SU l         [        U S5      (       a  M&  SU l         g )NTr#   trainingre   r#   r#   s    r+   set_trainingr     s>    EN
%
!
! %
!
! ENrO   c                     SU l         [        U S5      (       a&  U R                  n SU l         [        U S5      (       a  M&  SU l         g )NFr#   r   r   s    r+   unset_trainingr   &  s>    EN
%
!
! %
!
! ENrO   )	dataclassc                        \ rS rSr% \\S'   Srg)Trainer_Statsi1  metrics N)__name__
__module____qualname____firstlineno__r   __annotations____static_attributes__r   rO   r+   r   r   1  s    MrO   r   c                    [        U S5      (       d   e[        U S5      (       d   e[        U S5      (       d   e[        U S5      (       d   eU R                  nU R                  nU R                  n[	        U R
                  5      n[        U5        [        UR                  5        UR                  (       a  [        S5      e Uc  SSKJn  U" U R                  S	S
S9n [        R                  " U5      u  pg[!        [        R"                  " SU5      5      n/ / pSnUR%                  5        HR  u  pUR&                  (       d  M  X;   a  U	R)                  U5        OU
R)                  U5        XR+                  5       -  nMT      XR,                  S.U
SS./nUS   S   US   S   -   nU" U40 UD6n[/        X$U R
                  5      u  nnn[1        S/UR2                  UUR5                  U5      US.[7        US0 5      D6nUR8                  n[:        R<                  R>                  R@                  nURB                  nURD                  n[:        RF                  n[I        URJ                  5      nU[:        RL                  :X  as  Sn[:        RL                  n[O        U5      [O        S5      :  a)  [:        RP                  RR                  RU                  5       nO4[:        RR                  RU                  S5      nOSn[:        RV                  nSn URY                  5         [:        RF                  n[O        U5      [O        S5      :  a)  [:        RP                  RR                  R[                  US	S9nO[:        RR                  R[                  SUS	S9n Sn[:        R\                  " SS[:        R^                  S9S   n SUR`                   SUS SUS SURB                  S SURD                   SUS S US S!US 3n![c        U!5        [d        Rf                  " UU-  5      n"UU-  n#[d        Rf                  " U#U-  5      n$U#S:X  a  Un$URh                  n%[j        Rj                  " 5       n&[m        US"S#9 n'[o        U5       GH  n([        UR                  U(-   5        [q        [:        R>                  Rr                  Ru                  U R
                  U[:        R>                  Rr                  Rw                  U R
                  5      URx                  UURz                  UR                  [|        S$95      n)[o        U"5       GH(  n*U*U"S-
  :X  a  U$OUn+[o        U+5       V*s/ s H  n*[        U)5      PM     n,n*[:        R                  " U, V-s/ s H&  n-[:        R                  " U-S%   S&SS24   S':g  5      PM(     sn-5      R                  5       n.U, H  n/U/S(   R                  5       R                  SS"S)9n0U/S%   R                  5       R                  SS"S)9n1U   U" U0U1U.S*9R                  n2U U2R                  5       -  n SSS5         Uc  W2R                  5         M  UR                  W25      R                  5         M      Uc  U" UU5        UR                  5         O;UR                  U5        U" UU5        UR                  U5        UR                  5         UR                  5         URY                  5         UU%-  S:X  a<  U'R                  U S+[        U R                  5       R                  5       S
5       35         U R                  5         U'R                  S5        US-  nUU:X  d  GM)    O   GM      SSS5         [        U5        [c        S,5        [j        Rj                  " 5       n3[        S-U3U&-
  0S.9n4U4$ s  sn*f s  sn-f ! , (       d  f       GN= f! , (       d  f       Nf= f)0zu
Unsloth Trainer
1. Fixes gradient accumulation
2. Scaled down version of HF's trainer
3. Much less feature complete
argsr#   r%   data_collatorzAUnsloth: Currently `dataloader_drop_last` is not yet implemented!Nr   )DataCollatorForLanguageModelingF   )r$   mlmpad_to_multiple_of)paramsweight_decayr   r   )rp   	optimizernum_warmup_stepsnum_training_stepslr_scheduler_kwargsfp16z2.4.0cudabf16)rm   cache_enabled)device_typerm   r   zcuda:0)devicerm   z?==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = z
    \   /|    Num examples = ,z | Num Epochs = z'
O^O/ \_/ \    Batch size per device = z! | Gradient Accumulation steps = z"
\        /    Total batch size = z | Total steps = z0
 "-____-"     Number of trainable parameters = T)totaldynamic_ncols)
batch_sizesamplernum_workers
collate_fn
pin_memory	drop_lastworker_init_fnr   .r   	input_ids)r   non_blocking)r   r   n_itemsz, zUnsloth: Finished training!train_runtime)r   r   )Sre   r#   r   r   r   r%   r   transformers_set_seedseeddataloader_drop_lastNotImplementedErrortransformersr   r$   r   get_optimizer_cls_and_kwargs	frozensetget_decay_parameter_namesr]   rs   appendnumelr   r   transformers_get_schedulerlr_scheduler_typeget_warmup_stepsrl   max_grad_normrW   nnutilsclip_grad_norm_ry   rz   __version__r   rV   rX   r   r   amp
GradScalerr9   	zero_gradautocastzerosr7   rw   r"   r|   r}   logging_stepstimeProgressBarrangeiterdata
DataLoaderSequentialSamplerdataloader_num_workersdataloader_pin_memorytrainer_utils_seed_workernextstackcount_nonzerosumr   rn   lossdetachbackwardscalestepunscale_updatewriteroundcpuitemzero_r   r   )5trainerr#   r   r   r   r   optimizer_clsoptimizer_kwargsdecay_parameters	yes_decayno_decayn_parameters_to_trainrp   rq   optimizer_grouped_parameterstrainable_parametersr   r   r{   r~   lr_schedulerr   r   r   r   torch_versionconfig_dtypemixed_precisionmixed_dtypefloat16_scalerautocast_context_managerr   accumulated_loss
debug_infomax_iters_per_epochleftover_samplesleftover_gar   
start_timeprogress_barepochtrain_dataloader_iteratorj	n_batchesbatchesxr   batchr   r   r   end_timetrainer_statss5                                                        r+   r   r   6  s    7F##$#7G$$%$7O,,-,7O,,-,MMELLM))MW223-,,-))!O
 	
 	@7))!"

 	 '.&J&J=&Y#M !B!B4!OPbx--/""H#Y%5%5e%<ooe$.	 0
 	0J0JL3$ 
 	%Q'1$Q'1	2  :O>NOI 	m9N9NO 8I'7 . ..(99)D&	
 -!6
;L $11Mhhnn44O

3
3C

3
3B %%M$U\\2Lu}}$ mm=!GG$44"ZZ^^668N"YY11&9N nn %%M} 00#(::>>#:#:! $; $
 
 $)99#5#5 ! $6 $
 
 	D{{1xOPQR
I-JbJbIc d));A(>>NO_`aNb c33@3\3\]^2_  `A  BO  Bk  Bk  Al l--CA,FFWXabcWd e::OPQ9R		T  
* ))$69O$OP),BB)),s23K1Bk!//MJ	Y	=+,E "-"4"4u"<=(,U[[-=-=-H-H%%!$!&!1!1!C!CGDYDY!Z!.!E!E!.!.!D!D!.!C!C!: .I 	. 	)% ./+,1DQ1F+GKR	DI)DTUDTq4 9:DTU  ++OV'OV!E''(CG(<(DEw' 35 
 %E %k 2 = = ? B BHei B jI %h = = ? B BHei B jF1$VW^_dd(DKKM9( 2 %-(..t4==? % !)#$8-HNN$"++I6#$8-H"''	2"))+!!###%-'1, &&$r%8H8L8L8N8S8S8UWX2Y1Z'[\ &&(##A&	9$eU 0V u -v 	y 
>z 	5	
'(yy{H "_x*?T,UVMe V' 21= 
>	=sK   $C%^>	^"
^>4-^'!A^>?&^,%D*^>	^>"
^>,
^;6^>>
_)r6   TFFFFTF))rW   r|   r   r   r   r   r   r   r   transformers.trainer_utilsr   r   r   r   r   typingr   r	   r
   r   r   r   r   r   hf_utilsr   rY   r_   __all__inference_moder   no_gradrU   r   r   r   r   dataclassesr   r   r   r   rO   r+   <module>r     sd  "    : D   O $  3 3 & ' 	 	 /	 /	`   -626272727272627c!$c!)c "*$c "*$	c
 "*$c "*$c "*$c "*$c "*$c 	c cH ?,    "
   GP rO   