
    oi             5          / S Qr SSKJrJrJrJrJrJrJrJ	r	  SSK
r
SSKrSSKrSSKrSSKrSSKrSSKrSSKrSSKrSSKrSSKrSSKrSSKrSSKrSSKJrJrJrJrJr  SSKJ r   SSK!r!SSK"r"SSK#J$r$  SSK%J&r'  SSK(r(SS	K)J)r)  SS
K*J+r+  SSK,J-r-  SSK.J/r/   \R`                  R"                  R`                  Rb                  r1Sq2S\3" 5       ;  a  \Rh                  " SS5      r5\5(       a  \5q6OSq6Sq7\" \Rp                  5      \" S5      :  r9Sr:Sr;\+S:X  a/  \Rx                  R{                  5       u  r:r;\:S:*  =(       a    \;S:  r>O\+S:X  a  Sr>O\+S:X  a  Sr> \" \!Rp                  5      \" S5      :  r?SSKr\R                  R                  S5      c  SrBO\R                  R                  SS5      S:H  rB  " S S\R                  5      rF / SQrGSrH\HS-   rI\I S 3rJ/ S!QrKS" rLS# rM SwS$ jrN SwS% jrO \(R                  " S5      S& 5       rQ S' rR S( rS S) rT S* rU S+rV    SxS, jrW       SyS-\X4S. jjrY S/rZS0r[S1r\S2R                  S3\[5      r^S4r_S5R                  S3\[5      r`S6raS7R                  S3\[5      rb\\\^4\_\`4\a\b4/rcSzS8 jrd S9 re S: rf S; rg S<S=/rhS>riS? rj S@\XSA\XS-\X4SB jrkSC rl SDrmSErnSF ro SGrpSHrqSI\SJ\XS-\\X\\X\X4   4   4SK jrr SLrsSMrtSN ru SO rv SP rw SQ rx   S{SR\R                  SS\\z   ST\\R                     SU\{4SV jjr| SW r} SwSX jr~ SwSY jr / SZQrS[/r                          S|S\\XS]\S^\S_\S`\Sa\Sb\Sc\Sd\Se\Sf\Sg\Sh\Si\Sj\Sk\Sl\Sm\Sn\So\Sp\Sq\Sr\Ss\St\Su\44Sv jjrg!   Sr1 GN= f)})UNSLOTH_COMPILE_LOCATIONget_transformers_model_typeunsloth_compile_transformerscreate_new_function    )AnyListOptionalTupleUnionDictSetCallableN   )Versionis_main_processis_distributeddistributed_functionget_lock)logger)get_lora_layer_modules)version)compiler_replacements)DEVICE_TYPE)get_torch_compile_options)r   unsloth_compiled_moduler   unsloth_compiled_cacheFz2.5.0cuda      hipxpuz3.0.0unsloth_studioUNSLOTH_STUDIO_DISABLED0c                        \ rS rSrS rS rSrg)HideLoggingMessageg   c                     Xl         g Ntext)selfr+   s     N/home/james-whalen/.local/lib/python3.13/site-packages/unsloth_zoo/compiler.py__init__HideLoggingMessage.__init__h   s    $i    c                 <    U R                   UR                  5       ;  $ r)   )r+   
getMessage)r,   xs     r-   filterHideLoggingMessage.filteri   s    TYY!,,.%@Ar0   r*   N)__name__
__module____qualname____firstlineno__r.   r4   __static_attributes__ r0   r-   r&   r&   g   s    .Ar0   r&   )
select_best_resolutionz,original_aspect_ratio > current_aspect_ratioz%causal_mask[start:end, start:end] = 0&LAYER_PATTERN_TO_MASK_FUNCTION_MAPPINGz!create_causal_mask(**mask_kwargs)compute_mup_vectorsegment_sumapply_mask_to_padding_statesreshape_into_chunkspad_tensor_by_sizea  
# Unsloth auto generated code
# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

a"  
import os
import torch
import importlib.util
import math
if importlib.util.find_spec("unsloth_studio") is None:
    UNSLOTH_STUDIO_ENABLED = False
else:
    UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
pass
from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
import math

UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)

import logging
logger_compiler = logging.getLogger(__name__)
if UNSLOTH_ENABLE_LOGGING:
    logger_compiler.setLevel(logging.DEBUG)

global INFERENCE_RUNS
INFERENCE_RUNS = 0

try:
    import torch._dynamo.eval_frame as torch_dynamo_eval_frame
    torch_dynamo_eval_frame._stance.stance
    torch_compiler_set_stance = torch.compiler.set_stance
except:
    torch_dynamo_eval_frame = None
    torch_compiler_set_stance = None
pass

from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
a,  

from unsloth_zoo.loss_utils import (
    fused_linear_cross_entropy,
    unsloth_fused_ce_loss,
)

if UNSLOTH_STUDIO_ENABLED:
    from unsloth_zoo.loss_utils import fast_linear_cross_entropy

scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention
@torch.compiler.disable(recursive = False)
def disable_compile_scaled_dot_product_attention(*args, **kwargs):
    return scaled_dot_product_attention(*args, **kwargs)
pass


from transformers.modeling_flash_attention_utils import is_flash_attn_available

if is_flash_attn_available():
    try:
        from transformers.modeling_flash_attention_utils import flash_attn_supports_top_left_mask
    except:
        flash_attn_supports_top_left_mask = None
    try:
        from transformers.modeling_flash_attention_utils import _flash_attention_forward
    except:
        _flash_attention_forward = None
    try:
        from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
    except:
        FlashAttentionKwargs = None
    try:
        from transformers.modeling_flash_attention_utils import flash_attn_varlen_func
    except:
        flash_attn_varlen_func = None
else:
    flash_attn_supports_top_left_mask = None
    _flash_attention_forward = None
    FlashAttentionKwargs = None
    flash_attn_varlen_func = None
pass

)Conv1dConv2dConv3dConvTranspose1dConvTranspose2dConvTranspose3dBatchNorm1dBatchNorm2dBatchNorm3d	GroupNormRMSNorm	LayerNormc                      g r)   r;   argskwargss     r-   no_update_causal_maskrS      s    4r0   c                    S[         R                  R                  R                  R                  ;  a  U$ Sn[
        R                  " X![
        R                  [
        R                  -  S9n[        U5      S:X  a  US   nUS   R                  S5      S:  a  US   R                  S5      S:  ai  [        S	U  S
35        Un[
        R                  " USU[
        R                  [
        R                  -  S9nUR                  SS5      R                  SS5      n  [
        R                  " SSU[
        R                  [
        R                  -  S9nU$ )N
enable_gqaz(key_states \= repeat_kv[^\n]{1,}\n[\s]{1,}value_states \= repeat_kv[^\n]{1,}\n[\s]{1,}(.+?)query_states \= query_states\.contiguous\(\)\n[\s]{1,}key_states \= key_states\.contiguous\(\)\n[\s]{1,}value_states \= value_states\.contiguous\(\))flagsr   r   zkey_states =    zvalue_states = zUnsloth: Transforming .z\2pass\nz1dropout_p=self.dropout if self.training else 0.0,z\dropout_p=self.dropout if self.training else 0.0, enable_gqa=self.num_key_value_groups != 1,z;dropout_p=self.attention_dropout if self.training else 0.0,zfdropout_p=self.attention_dropout if self.training else 0.0, enable_gqa=self.num_key_value_groups != 1,7if output_attentions\:.+?return super\(\)\.forward.+?\)Bif output_attentions: raise RuntimeError('Unsloth: Not supported'))torchnn
functionalscaled_dot_product_attention__doc__refindallDOTALL	MULTILINElencountprintsubreplace)modulesourcegrouped_query_attention_finderfound
all_sources        r-   $replace_with_grouped_query_attentionro      s8   588..KKSSS\bUb	9 # JJ5ryySUS_S_G_aE
5zQa8>>/*a/E!HNNCT4UYZ4Z*6(!45JVV.		BLL0	F GA 'QA  	VVBL		BLL(	F Mr0   c           	      2   [         (       d  U (       a  Sq [        R                  R                  [        5      n[        R                  R                  [        R                  " 5       U5      n[        R                  " SU S35        [        R                  " USS9  U[         4$ [        n [        R                  " USS9  U[         4$ ! [         aF  n[        R                  " S[         S[        U5       35        [        SS9u  nq  S nAU[         4$ S nAff = f)NTzUnsloth: We'll be using `z ` for temporary Unsloth patches.)exist_okz%Unsloth: Failed to create directory `z
` because use_tempfile)UNSLOTH_COMPILE_USE_TEMPospathbasenamer   jointempfile
gettempdirr   infomakedirs	Exceptionerrorstr_get_compile_folder)rs   leaflocationes       r-   r   r     s      <#' ww 8977<< 3 3 5t<'z1QR	
 	H. --- ,	ZKKT2555 	ZLL@AY@ZZdehijekdlmn 2ETX1Y.H.---	Zs   )C 
D5DDc                 .    [        S[        U 5      u  pX4$ )NrX   )r   r   )rs   r   rt   s      r-   get_compile_folderr   5  s    )=aATVb)c&H--r0   c                       SS K n [        U R                  5      nU Vs/ s H  o"R                  S5      (       d  M  UPM     sn$ s  snf !   / s $ = f)Nr   create)transformers.masking_utilsdirmasking_utils
startswith)transformersr   r3   s      r-   get_mask_functionsr   ;  sG    )L667(C=aLL,B=CCC	s!   A
 AAA
 A
 
Ac           	          [         R                  " SU 5      nU HG  nUR                  S5      UR                  5       pCUu  pVpxU SU SU SU S3n	U R	                  X95      n MI     U $ )a!  
Converts all softmax to float32 for eg:
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
probs = F.softmax(combined_logits, dim=-1, dtype=combined_logits.dtype)
routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
zf(nn\.functional\.softmax|F\.softmax)\(([^,]{1,}), (dim[ ]?\=[ ]?[\-0-9]{1,2})(\,[ ]?dtype[^\)]{1,})?\)r   (, z, dtype = torch.float32).to(z.dtype)ra   finditergroupgroupsri   )
rk   softmax_objectsitem
full_matchmatchessoftmaxvariabledimdtypenews
             r-   higher_precision_softmaxr   F  s|     kk	 	O  "jjmT[[]G(/%3	8*Bse+GzQXY
0	  
 Mr0   c                    [         R                  " SU 5      nU HP  nUR                  S5      UR                  5       pCUu  pVpxpn
SU SU SU SU	 SU
 SU S	3nU R	                  X;5      n MR       [         R                  " S
U 5      nU Hi  nUR                  S5      UR                  5       pCUu
  ppgppnnSU;   a  M4  U SU SU SU U	 SU U SU SU SU SU SU S3nU R	                  X;5      n Mk     U $ )z
Converts all sqrt(mean(X**2)) to float32
torch.mean(hidden_states[0] ** 2, dim=-1, keepdim=True) ** 0.5
target_magnitude = torch.mean(hidden_states_0**2, dim=-1, keepdim=True) ** 0.5
z|(torch\.mean|torch\.sum)\(([a-zA-Z0-9\_\[\]]{1,})[ ]{0,}(\*\*)[ ]{0,}([\d]{1,})([^\)]{0,})\)[ ]{0,}(\*\*)[ ]{0,}([\d\.]{1,})r   r   z(((z).to(torch.float32)**)z)**(z)).to((z).dtype)z([a-zA-Z0-9\_]{1,})[ ]{0,}\=[ ]{0,}(torch\.mean|torch\.sum)\(([a-zA-Z0-9\_\[\]]{1,})[ ]{0,}(\*\*)[ ]{0,}([\d]{1,})([^\)]{0,})\)([\n ]{1,})\1[ ]{0,}\=[ ]{0,}(torch.sqrt)\((.*?)\1(.*?)\)\n
 = z((z).to(torch.float32)z.to((z
).dtype))
r   )rk   sqrt_mean_objectsr   r   r   meanr   _powerrestdivisorr   new_variablespacessqrtinnerendings                    r-   higher_precision_sqrt_meanr   a  sX    		 	 ""jjmT[[]G5<2$7$s8*$95!D6gYV]^f]ggop
0	 "
 	
 	 	" ""jjmT[[]GT[QHfE66>8nCvRz1FugdVSThnCvQugQ|n<OheH:[2 	
 
0 " Mr0   c                 &   SU ;   d  SU ;   Ga  [         R                  R                  SS5      S:w  a  [         R                  S   nUR                  S5      S:  d   eUR	                  SS5      u  p#pEnUS:H  nUS:H  =(       d    US	:H  =(       a#    [         R                  R                  S
S5      S:H  nU(       d  U(       aQ  [        U5      bE  [        U5      n	U	[        R                  :X  a&  U R                  SS5      n U R                  SS5      n U $ U $ )Nzcos.tozsin.toUNSLOTH_FORCE_CUSTOM_DTYPE ;   allfloat16ztorch.float16UNSLOTH_FORCE_FLOAT32r$   1zcos.to(dtype=x.dtype)zDcos.to(dtype=torch.float16 if x.dtype == torch.float32 else x.dtype)zsin.to(dtype=x.dtype)zDsin.to(dtype=torch.float16 if x.dtype == torch.float32 else x.dtype))	ru   environgetrf   splitevalr\   float32ri   )
rk   custom_datatypechecker_dtype_bnb_compute_dtype_custom_datatypeexecute_codeallow_all_runsallow_float16_runsr   s
             r-   fix_rotary_embedding_dtyper     s    6X/::>>6;rA jj)EFO"((-222RaRgRghkmnRoOG/<%.N I%CO)C F 7=D  !3<+ LE-!'3b" "(3b"  &Mr0   c                    [        [        R                  " SU [        R                  [        R                  -  S95      n[        U5      S:X  a  U $ US   nUR                  S5      u  p4U R                  SU5      nXU n[        R                  nSU;   a  [        R                  nOlSU;   a  [        R                  nOUSU;   a  [        R                  nO>SU;   a  [        R                  nO'S	U;   a  [        R                  nO[        R                  n[        R                  R                  S
S5      S:H  nU[        R                  :X  a  SnU(       a  [        S5        U(       a  SOS[        R                  S
'   g )NzV\nclass[^\(\n]{1,}Norm\(nn\.Module\).+?def __init__.+?self.weight.+?\nclass[^\(\n]{1,}rV   r   z
classzself.weight.to(torch.float32)z!(self.weight * hidden_states).to(zself.weight * hidden_states.to(zself.weight.float()zreturn output * self.weight UNSLOTH_HIGH_PRECISION_LAYERNORMr$   r   Tz/Unsloth: Upcasting layernorm weights to float32)listra   r   rc   rd   re   spanfindr\   r   r   ru   r   r   rg   )modeling_filenorm_modulesnorm_modulestartendr   higher_precisions          r-   higher_precision_layernormsr     s3   	! 			BLL( L <Am3q/K!!!$JE


Y
,C,KMME&+5	,	;	*k	9	+	-	&+	5 zz~~&H#NRUU?@<LSRUBJJ12r0   a"  
if hasattr(logger, "addFilter"):
    import logging
    class HideLoggingMessage(logging.Filter):
        def __init__(self, text): self.text = text
        def filter(self, x): return not (self.text in x.getMessage())
    pass
    logger.addFilter(HideLoggingMessage("`use_cache=True`"))
c           	      2  ^$ Un[         R                  R                  SS5      S:H  n	[        U5      nUS   S:X  a=  UR	                  S5      m$UR                  S5      nSR                  U$4S jU 5       5      n U(       a  S	U 3n U V
s/ s H  oU;   d  M
  X:w  d  M  S
U
 S3U;  d  M  U
PM!     nn
SU;   a  US/-  n[        5       nU H  nX;   d  M
  X/-  nM      SnUS-  nUS-  nUS-  nUS-  nU[        U5      S:w  a"  SU S3SR                  S U 5       5      -   S-   OS-  nUS-   U-   nSU;   a  US-   [        -   S-   nXA-   U-   n[        S5      n[        S5      n[        S5      n[        S5      nSU SU SU SU S3-   S-   n[        U;  a  U[        -   U-   nOUU-   nS n[        S S!9u  nq[         R                  R                  UU  S"35      nU(       d  [         R                  R                  U5      (       ak  [!        US#S$S%9 nUR#                  5       nS S S 5        UU:w  a  S&nO>U(       d7  S'U;  a  S&nO.US UR	                  S'5       nUS UR	                  S'5       U:w  a  S&n [         R                  R                  S(S5      S:X  a  S nS) n U(       d$  [         R                  R                  U5      (       d   [%        S*UUU5          S nS nS+ n  U" UU 5      u  nnUb  U[:        l        Uc  [C        S2U  S3[D         35      eU$ s  sn
f !   Sn GN= f!   Sn GN= f!   Sn GN= f!   Sn GN= f! , (       d  f       GN'= f! [&         a]  n[        (       a  [)        U5      e[        S&S!9u  nq[         R                  R                  UU  S"35      n[%        S*UUU5         S nANS nAff = f! [&         Ga  nS n[        (       d  [        S&S!9u  nq[         R                  R                  UU  S"35      n[%        S*UUU5        [+        5       (       a  [,        R.                  " S,U  S-U S.35         U" UU 5      u  nnOE! [&         a8  nS n[+        5       (       a  [,        R.                  " S,U  S-U S/35         S nAOS nAff = f Uc   S0U  3n [         R                  R                  UU 5      S"-   n![1        U!5      n"U"   [2        R4                  R7                  U U!5      n#[2        R4                  R9                  U#5      nU[:        R<                  U '   U#R>                  RA                  U5        S S S 5        O4! , (       d  f       O&= f! [&         a  n[)        S1U  S-U 35      eS nAff = f S nAGNS nAff = f! Ub  U[:        l        f f = f)4NUNSLOTH_ENABLE_LOGGINGr$   r   r    defr   c              3   ,   >#    U  H	  oTS  v   M     g 7fr)   r;   .0r3   r   s     r-   	<genexpr>&create_new_function.<locals>.<genexpr>  s     >:az:   zR@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def r   SiglipEncoderzfrom torch import Tensor
zimport torch
zimport torch.nn as nn
z%from torch.nn import functional as F
zJfrom typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
zfrom z	 import (r   c              3   $   #    U  H  ov   M     g 7fr)   r;   r   r3   s     r-   r   r     s     <N1Q   r   r   

r   unsloth_zoounslothr   trlz"""
z
__UNSLOTH_VERSIONING__
Frr   .pyrutf-8)encodingT__UNSLOTH_VERSIONING__UNSLOTH_COMPILE_OVERWRITEc           	      ,   [        U 5      nUR                  S5      n U    [        R                  " U 5      nSnUb  UR
                  [        U5      :w  a  SnO([        U S5       nUR                  5       U:g  nS S S 5        U(       aY  [        U SSS9 nUR                  U5        UR                  5         [        R                  " UR                  5       5        S S S 5        S S S 5        g ! [         a  nS n S nANS nAff = f! , (       d  f       N= f! , (       d  f       NA= f! , (       d  f       g = f! [         aS  n[        R                  R                  SS	5      S
:X  a%  [        R                   " SU  S[#        U5       35         S nAg S nAff = f)Nr   FTrbwbr   )	bufferingUNSLOTH_LOGGING_ENABLEDr$   r   zUnsloth: Failed to write file 	 because )r   encoderu   statr}   st_sizere   openreadwriteflushfsyncfilenor   r   r   r~   r   )	function_locationwrite_new_sourcelocknew_write_bytesstr   
need_writeffiles	            r-   
write_file'create_new_function.<locals>.write_fileS  sO   )**11':	!23B #
:s?/C!C!%J/6!%&VVX%@
 7 /1E

?3

/ F &  ! B 76 FE &  	 zz~~7=D=>O=PPYZ]^_Z`Yabc	s   D6 D%C*-D%%D9D%ADD%!D6 *
D 4C;6D%;D  D%
D	D%
D"	D%%
D3/D6 3D6 6
F A	FFr   c           	      H   [         R                  R                  X S35      n[        U5      nU [        R                  ;  aM  [        [        R                  5      nX;   a  [        SU S35      e[        R                  R                  SU 5         U   [        R                  " U5      nUW4sS S S 5        $ ! , (       d  f       g = f! [         aP  n[         R                  R                  SS5      S:X  a%  [        R                  " SU S	[        U5       35        UeS nAff = f)
Nr   zUnsloth: File z already existsr   r   r$   r   z!Unsloth: Failed to import module r   )ru   rv   rx   r   sysr   OSErrorinsert	importlibimport_moduler}   r   r   r   r~   r   )compile_foldernametarget_namer   old_path
new_moduler   s          r-   r	  *create_new_function.<locals>.import_module  s    ggll>V3<@$)CHH~HtfODEEHHOOA~.	&44T:
!8+   	zz~~7=D@iPSTUPVxXYG	s7   C B6,	C 6
C C C 
D!ADD!zStandard import failed for z: z. Using tempfile instead!z(. Using spec.loader.exec_module instead!unsloth_cache_z!Direct module loading failed for zUnsloth: Cannot import z from )#ru   r   r   r   r   r   rx   r   re   disble_use_cache_loggingimportlib_version_full_license_headerr   rt   rv   isfiler   r   r   r}   RuntimeErrorr   r   r{   r   r  utilspec_from_file_locationmodule_from_specr  modulesloaderexec_moduleImportErrorr   )%r  
new_sourcemodel_location	functionsprependappend	overwriteadd_torch_compileold_new_source
do_loggingr3   itemsmask_functionsmask_functionimportsunsloth_zoo_versionunsloth_versiontransformers_versiontrl_version
versioningr   file_sourcer
  r   r   versionsr  r~   r  r  r	  r   module_namefile_locationr   specr   s%                                       @r-   r   r     s     N 8#>#EJ **5J!}'%%d+
YY>:>>
al 	 	 "m	1JQQYQSWXYWZZ[Q\`jQjQ	Em*$e/@&@e')N'&(@ (*GG((G77G\\GX[\aXbfgXg~&i0499<N<N3NNQTTmooG6!J.J5$&)AADH
%.J  1?	,Y7 1. A	(/ 
r
2
 -1	33 6=	=J :-%(<<zI%
2 K/AQV/W,N,^vS\B (9:: #SW=&&(K > **I'{: 	&'R(8(89Q(RSHz/GHIXU $I	zz~~1373>	< 	'899
	 J0ACST 	 HJ& 	# ,^TB
HB CH3D6@X?YZ[[k n. &#!cO&3 #K. >=h  	''"5)) <N]a;b8 8$&GGLLD6$N!$Q
4EGWX	N  
''7IY]7^4N4 "^vS\ J J0ACST  9$r!D]^_s'4^T'J$
H s!
"$$KK"=dV2aSHp qrs 	
T .tf5 "^T BU J.$>>AA+}]D!*!@!@!FJ/9CKK,KK++J7	 TT
  T"%FtfBqc#RSST;@ CH  s  
	LLL*LL L$ $L. /L8 MM !N> L!$L+.L58L?
M
N;AN66N;>V	A6U= QU=
R.R
U=
RU=5UA.U:UU=
U	UU=U
U4U//U44U=7V =VV Vreturnc	                 4  ^ [        U SU  35      n	[        R                  " U	5      n
[        R                  " U	R                  5      n[        R                  " U	R                  5      nUc  Un[
        (       a  SU;   a  Sn[        R                  " SSU  S3U5      n[        R                  " SU5      R                  S	5      S	   mUR                  S
5      nS
R                  U4S jU 5       5      nSU;   a  SnUb  U(       d  SU S3OSnOSn[        R                  " U	R                  5      R                  n[        UR                  5       5      n[        UR!                  5       5      n[#        U5       HW  u  nn[%        U5      nUR'                  S5      (       a  SUU   -   UU'   M4  UR'                  S5      (       d  ML  SUU   -   UU'   MY      SR                  U5      n[        R(                  " SU[        R*                  S9S	   nXR-                  U5      [/        U5      -   S nU(       a;  SU;  a5  US-  n[        R                  " SSU5      n[        R                  " SSU5      n U S
U S
3n[        R0                  " SU5      R                  5       S   nUUSU -   SU  SU S3-   nU
R3                  UU5      n
Ub  U
R3                  X5      n
Ubb  [5        U[6        5      (       aM  UR9                  5        H9  u  nn [        R                  " [;        U	U5      5      nU
R3                  UU5      n
M;     X-   n[        R                  " S%SU5      n[        R                  " S&SU5      nUR3                  S'S(5      n[        R                  " S)S*U5      n[G        U5      n[I        U5      n[K        U5      nU$ ! [<         aT  n[>        R@                  RC                  SS 5      S!:X  a$  [E        S"U S#U  S$[%        U5       35         SnAGM   SnAGM  SnAff = f)+a!  
new_methods: dict[str, str] = {
    "method_name": "method_source",
}
 method_name needs to be a valid attribute of the module class and
 method_source is the source code of the method it will be an exact string
 replacement so indentation and whitespace should be handled ahead of time!
rY   Nznn.Embedding(Tdef forwardr   _forwardz[^\s\n]r   r   c              3   ,   >#    U  H	  oTS  v   M     g 7fr)   r;   r   s     r-   r   *create_standalone_class.<locals>.<genexpr>  s     26az6r   cuda_kernels_forward@torch.compile(fullgraph = z2, dynamic = True, options = torch_compile_options)z*@torch.compiler.disable(recursive = False)r   z***r   z.[\s\n]{1,}def[^\(]{1,}\([^\)]{1,}\)[^\:]{0,}\:rV   z, **loss_kwargsz(\,[\n][\s]{1,}\))z,**loss_kwargs\1z(\,[\n]\) \-\>)z
[\s\n]{4,}r   zreturn z	_forward(z)
r   r$   r   z"Unsloth: Failed to replace method z in  with error = z'@auto_docstring[\s]{0,}(\([^\)]{0,}\))?z+@check_model_inputs[\s]{0,}(\([^\)]{0,}\))?zself.config.ignore_indexz-100zKself\.([A-Za-z\_]{0,}embedding)\(input_ids (\-|\+) (self\.[A-Za-z\_]{1,})\)z$self.\1((input_ids \2 \3).clamp_(0)))&r   inspect	getsourceforwardr.   OLD_TORCH_VERSIONra   rh   searchr   r   rx   	signature
parametersr   keysvalues	enumerater   r   rb   rd   r   re   matchri   
isinstancedictr&  getattrr}   ru   r   r   rg   r   r   r   )rj   r  r  	fullgraphforward_sourcedisableadd_loss_kwargsnew_initnew_methodsr   
full_class
old_sourceold_initrk   compilerD  rE  rF  jvalue
definitionleftoverleftnew_forwardmethod_namemethod_sourceold_method_sourcer   r   s                               @r-   create_standalone_classr_    s   * 	q)*A""1%J""199-J""1::.H
~ _8VV
vhhF
 YYz6*//215F\\$FYY2622F '  *)4fg8 	
  ""199-88J
!"D*##%&Ff%5E
d##td1g~T!Wc""sd1g~T!W & 	4J MzcecocopqrsJ//*5JGHIH 4z1''
VV13F
S
*,?Hy6("%F88M8,113A6Dx.
&:,c23K##J<J '';
 :k4#@#@*5*;*;*=&Kp$+$5$5ga6M$N!'//0A=Q
 +>  F VV>FKFVVBBOF ^^6?F VVV/F &f-F (/F (/FMA  p::>>";SASH>{m4PVxWefijkflemnoo Ips   2N99
PAPPa	  
from torch.nn import CrossEntropyLoss

@torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
def normal_cross_entropy_loss(self, hidden_states, labels):
    logits = self.lm_head(hidden_states)
    logits = logits.float()
    # Shift so that tokens < n predict n
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    # Flatten the tokens
    loss_fct = CrossEntropyLoss()
    shift_logits = shift_logits.view(-1, self.config.vocab_size)
    shift_labels = shift_labels.view(-1)
    # Enable model parallelism
    shift_labels = shift_labels.to(shift_logits.device)
    loss = loss_fct(shift_logits, shift_labels)
    return loss, logits
pass

# We need an empty logits flag to warn people logits will not be returned anymore unless asked ie
# os.environ['UNSLOTH_RETURN_LOGITS'] = '1'
LOGITS_ERROR_STRING = \
    "Unsloth: Logits are empty from 2024.11 onwards. To get raw logits again, please "\
    'set the environment variable `UNSLOTH_RETURN_LOGITS` to `"1" BEFORE starting to train ie before `trainer.train()`. For example:\n'\
    "```\nimport os\n"\
    "os.environ['UNSLOTH_RETURN_LOGITS'] = '1'\n"\
    "trainer.train()\n```\n"\
    "No need to restart your console - just add `os.environ['UNSLOTH_RETURN_LOGITS'] = '1'` before trainer.train() and re-run the cell!"

def raise_logits_error(*args, **kwargs): raise NotImplementedError(LOGITS_ERROR_STRING)
def return_none(*args, **kwargs): return None
class EmptyLogits:
    def __init__(self): return
    def raise_getattr_error(self, attr): return return_none if attr == "to" else raise_logits_error
    __getitem__ = raise_logits_error
    __getattr__ = raise_getattr_error
    def __repr__(self): return LOGITS_ERROR_STRING
    def __str__ (self): return LOGITS_ERROR_STRING
pass
EMPTY_LOGITS = EmptyLogits()
functions = dir(torch.Tensor)
for j, function in enumerate(functions):
    if function.startswith("__") and function.endswith("__"):
        exec(f"def raise_{j}(*args, **kwargs): print('{function}')", globals(), locals())
        try: exec(f"EMPTY_LOGITS.{function} = raise_{j}", globals(), locals())
        except: continue
pass


def mask_attention_mask_out(labels = None, attention_mask = None):
    if labels is not None and attention_mask is not None:
        attention_mask = attention_mask.to(device = labels.device)
        labels[attention_mask == 0] = -100
    return labels
pass

a  

    # Set compiler stance to fail on recompiles for inference
    global INFERENCE_RUNS
    if torch_dynamo_eval_frame is not None:
        old_stance = torch_dynamo_eval_frame._stance.stance
    else:
        old_stance = None
    if old_stance is not None and INFERENCE_RUNS == 1:
        # Skip guards and return to eager -> we still need guards!
        torch_compiler_set_stance(stance = "eager_on_recompile", skip_guard_eval_unsafe = False)
        if UNSLOTH_ENABLE_LOGGING:
            logger_compiler.info(
                f"Unsloth: Removing compiler guards after 1 inference run. "\
                f"DYNAMO_STANCE.stance = {torch_dynamo_eval_frame._stance.stance} "\
                f"DYNAMO_STANCE.skip_guard_eval_unsafe = {torch_dynamo_eval_frame._stance.skip_guard_eval_unsafe}"
            )
    elif old_stance == "eager_on_recompile":
        pass
    elif old_stance == "default" and INFERENCE_RUNS > 1:
        # Reset compiler stance
        torch_compiler_set_stance(stance = "default", skip_guard_eval_unsafe = False)
        if UNSLOTH_ENABLE_LOGGING:
            logger_compiler.info(
                f"Unsloth: Reseting guards. "\
                f"DYNAMO_STANCE.stance = {torch_dynamo_eval_frame._stance.stance} "\
                f"DYNAMO_STANCE.skip_guard_eval_unsafe = {torch_dynamo_eval_frame._stance.skip_guard_eval_unsafe}"
            )
        INFERENCE_RUNS = 0
    INFERENCE_RUNS += 1
a  
logits = self.lm_head(hidden_states$INDEXING$
$LOGITSCALINGMULTIPLY$
$LOGITSCALINGDIVISION$
$LOGITSOFTCAPPING$
loss = None
if labels is not None:$SPACES$
$UPCASTING$
$LOGITSUPCAST$
$LABELSDEVICE$
shift_logits = logits[..., :-1, :]$CONTIGUOUS$
shift_labels = labels[..., 1:]$CONTIGUOUS$
loss_fct = $CROSSENTROPYLOSS$
shift_logits = shift_logits.view(-1, $VOCABSIZE$)
shift_labels = shift_labels.view(-1)
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(shift_logits, shift_labels)
ah
  
NOT_RETURN_LOGITS = os.environ.get('UNSLOTH_RETURN_LOGITS', '0') == '0'
RETURN_HIDDEN_STATES = os.environ.get("UNSLOTH_RETURN_HIDDEN_STATES", "0") == "1"

n_items = None
all_locals = locals()
if 'loss_kwargs' in all_locals:
    __kwargs = all_locals['loss_kwargs']
    if type(__kwargs) is dict:
        n_items = __kwargs.get("num_items_in_batch", None)
        if n_items is None: n_items = __kwargs.get("n_items", None)
if n_items is None and 'kwargs' in all_locals:
    __kwargs = all_locals['kwargs']
    if type(__kwargs) is dict:
        n_items = __kwargs.get("num_items_in_batch", None)
        if n_items is None: n_items = __kwargs.get("n_items", None)
if n_items is None:
    all_locals = all_locals.values()
    for __kwargs in all_locals:
        if type(__kwargs) is dict:
            n_items = __kwargs.get("num_items_in_batch", None)
            if n_items is None: n_items = __kwargs.get("n_items", None)
            break
pass

requires_grad_ = self.lm_head.weight.requires_grad
requires_grad_ = requires_grad_ or self.lm_head.weight.dtype == torch.float32

if RETURN_HIDDEN_STATES:
    logits = hidden_states\1
elif labels is None:
    __DYNAMO__RECOMPILING__
    logits = self.lm_head(hidden_states\1)
elif ((\2) == () and (\3) == ()) and (UNSLOTH_ENABLE_CCE) and NOT_RETURN_LOGITS and self.loss_function.__name__.endswith("ForCausalLMLoss") and labels is not None and not requires_grad_:
    loss = fused_linear_cross_entropy(
        hidden_states      = hidden_states\1,
        lm_weight          = self.lm_head.weight,
        labels             = labels.to(self.lm_head.weight.device),
        num_items_in_batch = n_items,
        logit_softcapping  = None if (\4) == () else (\4),
    )
else:
    lm_head_weight = self.lm_head.weight
    lm_head_bias   = getattr(self.lm_head, "bias", None)

    # ========= NEW fused =========
    _hidden_states = hidden_states\1
    torch._dynamo.mark_dynamic(_hidden_states, 1)
    torch._dynamo.mark_dynamic(labels, 1)
    loss = unsloth_fused_ce_loss(
        trainer              = None,
        hidden_states        = _hidden_states,
        lm_head_weight       = lm_head_weight,
        lm_head_bias         = lm_head_bias,
        labels               = labels,
        mask                 = None,
        n_items              = n_items,
        scaling              = getattr(self, "accelerator_scaler", None),
        target_gb            = None,
        torch_compile        = not UNSLOTH_COMPILE_DISABLE,
        logit_scale_multiply = (\2) if (\2) != () else 0,
        logit_scale_divide   = (\3) if (\3) != () else 0,
        logit_softcapping    = (\4) if (\4) != () else 0,
    )
__DYNAMO__RECOMPILING__z
logits = self.lm_head(hidden_states$INDEXING$
$LOGITSCALINGMULTIPLY$
$LOGITSCALINGDIVISION$
$LOGITSOFTCAPPING$
loss = None
if labels is not None:$SPACES$loss = self.loss_function($NEWLINES$$LOGITS$, $LABELS$, $VOCABSIZE$$KWARGS$$NEWLINES$)
a  
NOT_RETURN_LOGITS = os.environ.get('UNSLOTH_RETURN_LOGITS', '0') == '0'
RETURN_HIDDEN_STATES = os.environ.get("UNSLOTH_RETURN_HIDDEN_STATES", "0") == "1"

n_items = None
if (\9) != () and type(\9) is dict:
    n_items = (\9).get("num_items_in_batch", None) or (\9).get("n_items", None)
if n_items is None:
    all_locals = locals()
    if 'loss_kwargs' in all_locals:
        __kwargs = all_locals['loss_kwargs']
        if type(__kwargs) is dict:
            n_items = __kwargs.get("num_items_in_batch", None)
            if n_items is None: n_items = __kwargs.get("n_items", None)
    if n_items is None and 'kwargs' in all_locals:
        __kwargs = all_locals['kwargs']
        if type(__kwargs) is dict:
            n_items = __kwargs.get("num_items_in_batch", None)
            if n_items is None: n_items = __kwargs.get("n_items", None)
    if n_items is None:
        all_locals = all_locals.values()
        for __kwargs in all_locals:
            if type(__kwargs) is dict:
                n_items = __kwargs.get("num_items_in_batch", None)
                if n_items is None: n_items = __kwargs.get("n_items", None)
                break
pass

requires_grad_ = self.lm_head.weight.requires_grad
requires_grad_ = requires_grad_ or self.lm_head.weight.dtype == torch.float32

if RETURN_HIDDEN_STATES:
    logits = hidden_states\1
elif labels is None:
    __DYNAMO__RECOMPILING__
    logits = self.lm_head(hidden_states\1)
elif ((\2) == () and (\3) == ()) and (UNSLOTH_ENABLE_CCE) and NOT_RETURN_LOGITS and self.loss_function.__name__.endswith("ForCausalLMLoss") and labels is not None and not requires_grad_:
    loss = fused_linear_cross_entropy(
        hidden_states      = hidden_states\1,
        lm_weight          = self.lm_head.weight,
        labels             = labels.to(self.lm_head.weight.device),
        num_items_in_batch = n_items,
        logit_softcapping  = None if (\4) == () else (\4),
    )
elif self.loss_function.__name__.endswith("ForCausalLMLoss") and labels is not None:
    lm_head_weight = self.lm_head.weight
    lm_head_bias   = getattr(self.lm_head, "bias", None)

    # ========= NEW fused =========
    _hidden_states = hidden_states\1
    torch._dynamo.mark_dynamic(_hidden_states, 1)
    torch._dynamo.mark_dynamic(labels, 1)
    loss = unsloth_fused_ce_loss(
        trainer              = None,
        hidden_states        = _hidden_states,
        lm_head_weight       = lm_head_weight,
        lm_head_bias         = lm_head_bias,
        labels               = labels,
        mask                 = None,
        n_items              = n_items,
        scaling              = getattr(self, "accelerator_scaler", None),
        target_gb            = None,
        torch_compile        = not UNSLOTH_COMPILE_DISABLE,
        logit_scale_multiply = (\2) if (\2) != () else 0,
        logit_scale_divide   = (\3) if (\3) != () else 0,
        logit_softcapping    = (\4) if (\4) != () else 0,
    )
else:
    logits = self.lm_head(hidden_states\1)
    if (\2) != ():
        logits = logits * (\2)
    if (\3) != ():
        logits = logits / (\3)
    if (\4) not in (None, (),):
        logits = logits / (\4)
        logits = torch.tanh(logits)
        logits = logits * (\4)
    loss = self.loss_function(\6, \7.to(self.lm_head.weight.device), vocab_size=\8, **\9)
a  
$OUTPUTLOGITS$
$LOGITSCALINGMULTIPLY$
$LOGITSCALINGDIVISION$
$LOGITSOFTCAPPING$
loss = None
if labels is not None:$SPACES$
$UPCASTING$
$LOGITSUPCAST$
$LABELSDEVICE$
$LOGITSHIFTING$
$VLMATTENTIONMASK$
loss_fct = $CROSSENTROPYLOSS$
shift_logits = shift_logits.view(-1, $VOCABSIZE$)
shift_labels = shift_labels.view(-1)###
$LOGITSDEVICE$###
loss = loss_fct(shift_logits, shift_labels)
a  
NOT_RETURN_LOGITS = os.environ.get('UNSLOTH_RETURN_LOGITS', '0') == '0'
RETURN_HIDDEN_STATES = os.environ.get("UNSLOTH_RETURN_HIDDEN_STATES", "0") == "1"

all_locals = locals()
n_items = None
if 'loss_kwargs' in all_locals:
    __kwargs = all_locals['loss_kwargs']
    if type(__kwargs) is dict:
        n_items = __kwargs.get("num_items_in_batch", None)
        if n_items is None: n_items = __kwargs.get("n_items", None)
if n_items is None and 'kwargs' in all_locals:
    __kwargs = all_locals['kwargs']
    if type(__kwargs) is dict:
        n_items = __kwargs.get("num_items_in_batch", None)
        if n_items is None: n_items = __kwargs.get("n_items", None)
if n_items is None:
    all_locals = all_locals.values()
    for __kwargs in all_locals:
        if type(__kwargs) is dict:
            n_items = __kwargs.get("num_items_in_batch", None)
            if n_items is None: n_items = __kwargs.get("n_items", None)
            break
pass

requires_grad_ = self.lm_head.weight.requires_grad
requires_grad_ = requires_grad_ or self.lm_head.weight.dtype == torch.float32

if RETURN_HIDDEN_STATES:
    logits = hidden_states\1
elif labels is None:
    __DYNAMO__RECOMPILING__
    logits = self.lm_head(hidden_states\1)
else:
    lm_head_weight = self.lm_head.weight
    lm_head_bias   = getattr(self.lm_head, "bias", None)

    # ========= NEW fused =========
    _hidden_states = hidden_states\1
    torch._dynamo.mark_dynamic(_hidden_states, 1)
    torch._dynamo.mark_dynamic(labels, 1)
    if attention_mask is not None:
        torch._dynamo.mark_dynamic(attention_mask, 1)
    loss = unsloth_fused_ce_loss(
        trainer              = None,
        hidden_states        = _hidden_states,
        lm_head_weight       = lm_head_weight,
        lm_head_bias         = lm_head_bias,
        labels               = labels,
        mask                 = \6,
        n_items              = n_items,
        scaling              = getattr(self, "accelerator_scaler", None),
        target_gb            = None,
        torch_compile        = not UNSLOTH_COMPILE_DISABLE,
        logit_scale_multiply = (\2) if (\2) != () else 0,
        logit_scale_divide   = (\3) if (\3) != () else 0,
        logit_softcapping    = (\4) if (\4) != () else 0,
    )
c                 
  ^	 [         R                  R                  SS5      S:H  n[        [        5       GH  u  nu  pEUR                  5       R                  SS5      R                  SS5      R                  SS	5      R                  S
S5      R                  SS5      R                  SS5      R                  SS5      R                  SS5      R                  SS5      R                  SS5      R                  SS5      R                  SS5      R                  SS5      nUR                  SS5      R                  S S!5      R                  S"S#5      R                  S$S%5      R                  S&S'5      R                  S(S)5      R                  S*S+5      R                  S,S-5      R                  S.S/5      R                  S0S15      R                  S2S35      R                  S4S55      R                  S6S75      R                  S8S95      R                  S:S;5      R                  S<S=5      R                  S>S?5      R                  SS5      R                  S@SA5      R                  SBSC5      R                  SDSE5      R                  SFS5      R                  SGSH5      R                  SISJ5      R                  SKSL5      nUR                  S*SM5      nU R                  SNSO5      n SPU;   a  SQU ;  a  U(       a  [        SRU 35        GM  SPU;  a  SQU ;   a  U(       a  [        SSU 35        GM  STU;  a  STU ;   a  U(       a  [        SUU 35        GM  STU;   a  STU ;  a  U(       a  [        SVU 35        GM/   [        R                  " UU [        R                  [        R                  -  SWSX9n[        U5      SZ:X  a  GMx  U(       a  [        SUSW-    S[U 35         USZ   S\   m	T	R                  S]5      [        T	5      :w  a  USZ   S^   m	UR                  5       R!                  S5      nSR#                  U	4S_ jU 5       5      nS`U ;   a  Sa[        T	5      S\-
  S]-  -   Sb-   U-   S-   nOSc[        T	5      S\-
  S]-  -   Sb-   U-   S-   n [        R$                  " UUU [        R                  [        R                  -  Sd9n SeU;   a$  U R                  SfSe5      n U R                  SgSe5      n [        R$                  " ShSiU 5      n U R                  SjS5      n U R                  SkS5      n U R                  SlS5      n U s  $     U $ ! [         a*  nU(       a  [        SY[        U5       35         S nAGM$  S nAff = f!    GM2  = f)mNr   r$   r   r<  z\*^z\^-z\-r   z\_:z\:+z\+rY   z\.,z\,r   z\(r   \)[z\[]z\]r   z0(?:[\s\n]{0,}(?:\#[^\n]{1,}[\n][\s\n]{1,})?){0,}z
$INDEXING$z,([^\n^\)]{0,})\)(?:\.float\(\))?[\n][\s]{0,}z$UPCASTING$z(?:\.float\(\))?z$SPACES$z,[\n]([\s]{1,})(?:\#[^\n]{1,}[\n][\s\n]{1,})?z$LOGITS$z(logits=logits|logits)z$LABELS$z(labels=labels|labels)z$VOCABSIZE$z{(?:vocab_size\=)?(self\.config\.vocab_size|self\.vocab_size|self\.config\.vocab_size|self\.config\.text_config\.vocab_size)z$KWARGS$z(?:, \*\*(loss_kwargs|kwargs))?z$LOGITSUPCAST$z(?:logits = logits\.float\(\))?z$LABELSDEVICE$z%(?:labels = labels\.to\([^\)]{1,}\))?z$LOGITSCALINGMULTIPLY$z@(?:[\n\s]{0,}logits = logits \* (self\.[^ \n]{1,})[^\n]{0,})?###z$LOGITSCALINGDIVISION$z@(?:[\n\s]{0,}logits = logits \/ (self\.[^ \n]{1,})[^\n]{0,})?###z$LOGITSOFTCAPPING$z(?:[\n\s]{0,}(?:if self\.[^\n\s]{1,} is not None:\n)?[\s\n]{0,}logits = logits \/ (self\.[^ \n]{1,})\n[\s\n]{0,}logits = torch\.tanh\(logits\)\n[\s\n]{0,}logits = logits \* self\.[^ \n]{1,}\n)?z$CROSSENTROPYLOSS$zQ(?:CrossEntropyLoss\(\)|nn\.CrossEntropyLoss\(\)|torch\.nn\.CrossEntropyLoss\(\))z$VLMATTENTIONMASK$a  (?:(?:shift_logits = logits\[\.\.\.\, :-1, :\]$CONTIGUOUS$shift_labels = labels\[\.\.\.\, 1:\]$CONTIGUOUS$)?if ([a-zA-Z\_]{1,}_mask) is not None:###shift_attention_mask = @@@###shift_logits = @@@###shift_labels = @@@###else:###shift_logits = [^\n]{1,}###shift_labels = [^\n]{1,}###)?z$LOGITSHIFTING$zo(?:shift_logits = logits\[\.\.\.\, :-1, :\]$CONTIGUOUS$###shift_labels = labels\[\.\.\.\, 1:\]$CONTIGUOUS$###)?z$LOGITSDEVICE$zB(?:\.to\([^\)]{1,}\)|shift_labels = shift_labels\.to\([^\)]{1,}\))z$OUTPUTLOGITS$zt(?:logits = outputs\.logits|logits = self\.lm_head\(hidden_states\)|logits = self\.lm_head\(hidden_states$INDEXING$)shift_z(?:shift_|flat_)z$CONTIGUOUS$z(?:\.contiguous\(\))?zshift\_z(?:shift\_|flat\_)z###z@@@z![^\[]{1,}\[[^\]]{1,}\][^\n]{0,}\nz$EMPTY$z()z
$NEWLINES$z
[\s\n]{0,}z=locals().get('loss_kwargs', {}) or locals().get('kwargs', {})zTloss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))zshift_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
shift_labels = shift_labels.view(-1)
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(shift_logits, shift_labels)zloss\_functionloss_functionz<(1) Unsloth skipping patching fast linear cross entropy for z<(2) Unsloth skipping patching fast linear cross entropy for CrossEntropyLossz<(3) Unsloth skipping patching fast linear cross entropy for z<(4) Unsloth skipping patching fast linear cross entropy for r   )rW   timeoutz>Unsloth failed patching fast linear cross entropy with error: r   z?/3 pattern] Successfully patched fast linear cross entropy for r   r      c              3   L   >#    U  H  n[        T5      S -
  S-  U-   v   M     g7f)r   r   N)re   r   s     r-   r   &apply_fused_lm_head.<locals>.<genexpr>4  s#     MAVQ 3a 7s   !$slice_indiceszlogits = self.lm_head(hidden_states[:, slice_indices, :]) if os.environ.get('UNSLOTH_RETURN_LOGITS', '0') == '1' else EMPTY_LOGITS
zloss = None
znlogits = self.lm_head(hidden_states) if os.environ.get('UNSLOTH_RETURN_LOGITS', '0') == '1' else EMPTY_LOGITS
rV   zlogits = outputs.logitszlogits = self.lm_head(hidden_states[:, slice_indices, :]) if os.environ.get('UNSLOTH_RETURN_LOGITS', '0') == '1' else EMPTY_LOGITSzmlogits = self.lm_head(hidden_states) if os.environ.get('UNSLOTH_RETURN_LOGITS', '0') == '1' else EMPTY_LOGITSz-vocab_size[ ]{0,}=[ ]{0,}\(vocab_size[ ]{0,}=zvocab_size = (z, **)z,**)z,** ))ru   r   r   rG  
ce_findersstripri   rg   regexrb   rc   rd   r}   r   re   rf   r   rx   rh   )
r@  rj   r   jjcross_entropy_findcross_entropy_replacementfinderr   replacementr   s
            @r-   apply_fused_lm_headrz    sy   ZZ^^,DcJcQ?H?T;;/557WS% e!4WS% e!4WS% e!4WS% e!4WS% e!4WS% e!4WC 	 0W\'VWW]':;WZ'VWWZ'@AWZ'@AW] WZ'IJW%'IJW%'OPW-XZW-XZW)JK
 W)
 W* W'
 W& W& W\DFWY 34W^'?@WZ!67WVPQWVABWZ'W]M2I 	N %>WO 	" //b:
  22g7U%TU[T\]^&88_PW=W%TU[T\]^'99>PT[>[%TU[T\]^#55:LT[:[%TU[T\]^
	]]"u6	F v;!X!AbdVZ[aZbcd 1<<F+AYq\F/557==dCiiMMMg% WVQ#$&56"#  BVQ#$&56"# 	ii"u6	G %(::oo U)G oo)G
 ))<
 //'3///&#.//'3/i @Uj 	Ns  	%VWZ[\W]V^_`	@	s$   $6T)26U )
U3UU U&c                  L   SS K n / nSSKJn  UR                  U5        SSKJn  UR                  U5        SSKJn  UR                  U5        SSKJ	n  UR                  U5        SSK
Jn  UR                  U5        SSKJn  UR                  U5        SSKJn  UR                  U5        SS	KJn	  UR                  U	5        SS
KJn
  UR                  U
5        SSKJn  UR                  U5        SSKJn  UR                  U5        SSKJn  UR                  U5        SSKJn  UR                  U5        SSKJn  UR                  U5        SSKJn  UR                  U5        SSKJn  UR                  U5        SSK J!n  UR                  U5        SSK"J#n  UR                  U5        U Vs/ s H+  nURH                  U RJ                  " URL                  5      4PM-     nnU H=  u  nn[O        UU5      nSU;  a  [Q        SU 35        SU;   d  M.  [Q        SU S35        M?     g s  snf )Nr   )Qwen2VLForConditionalGeneration)GraniteForCausalLM)Gemma2ForCausalLM)CohereForCausalLM)GemmaForCausalLM)LlamaForCausalLM)MistralForCausalLM)!PaliGemmaForConditionalGeneration)IdeficsForVisionText2Text) Idefics3ForConditionalGeneration) Mistral3ForConditionalGeneration)MllamaForConditionalGeneration)MllamaForCausalLM)Llama4ForCausalLM)Llama4ForConditionalGeneration)Qwen3ForCausalLM)"Qwen2_5_VLForConditionalGeneration)Gemma3ForConditionalGenerationNOT_RETURN_LOGITSz$Failed patching fast CE forward for zloss = outputs.lossz# since `loss = outputs.loss` exists))r>  .transformers.models.qwen2_vl.modeling_qwen2_vlr|  r!  ,transformers.models.granite.modeling_graniter}  *transformers.models.gemma2.modeling_gemma2r~  *transformers.models.cohere.modeling_coherer  (transformers.models.gemma.modeling_gemmar  (transformers.models.llama.modeling_llamar  ,transformers.models.mistral.modeling_mistralr  0transformers.models.paligemma.modeling_paligemmar  ,transformers.models.idefics.modeling_ideficsr  .transformers.models.idefics3.modeling_idefics3r  .transformers.models.mistral3.modeling_mistral3r  *transformers.models.mllama.modeling_mllamar  r  *transformers.models.llama4.modeling_llama4r  r  (transformers.models.qwen3.modeling_qwen3r  2transformers.models.qwen2_5_vl.modeling_qwen2_5_vlr  *transformers.models.gemma3.modeling_gemma3r  r6   r?  r@  rz  rg   )r>  forwardsr|  r}  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r@  s                          r-   test_apply_fused_lm_headr  c  s   H^OO34OOO&'LOO%&LOO%&IOO$%IOO$%OOO&'bOO56VOO-._OO45_OO45YOO23LOO%&LOO%&YOO23IOO$%eOO67YOO23EMNXW..qyy9;XHN!g &gt4g-8?@ G+8>abc " 	 Os   )2H!c                     [        [        R                  " SU 5      5      (       d  U $ [        [        R                  " SU 5      5      (       d  U $ SU ;   a  [        R                  " SSU 5      n U $ )Nz2attention_mask[\s]{0,}\=attention_mask[\s]{0,}\,\nz"labels[\s]{0,}\=labels[\s]{0,}\,\nForConditionalGenerationzRlabels=mask_attention_mask_out(labels = labels, attention_mask = attention_mask),
)re   ra   rb   rh   rk   s    r-   apply_mask_attention_mask_outr    sb    rzzOQWXYYbh[hrzz?HIIRX=!V+1a

 Mr0   c                    [         R                  " SSU5      n[         R                  " SSU5      nUR                  5       R                  S5      nUS   R                  5       nSU;  a  U$ [         R                  " SU5      n[        U5      S	:w  a  U$ US
   n/ nU HU  nU HL  n[         R                  " [         R                  " U5      S-   S-   U5      (       d  M;  UR                  U5        MN     MW      [        U5      S
:X  a  U$ [        U5      nUS   n	U H,  n[        U5      S
:X  a  M  U	R                  XwSU S3-   5      n	M.      XS'   SR                  U5      n
[        SU  35        U
$ )Nz
\([\s]{0,}r   z
[\s]{0,}\)r   r   r4  z6return[\s]{1,}(?:([^\,]{1,})\,[\s]{0,}){0,}([^\s]{1,})r   r   z.+?ztorch\.finfo\(.+?\)\.minz!=torch.finfo(z.dtype).minzUnsloth: Boolean mask for )ra   rh   rs  r   rb   re   rB  escaper!  setri   rx   rg   )rj   rS  rk   
all_splitssplitsvars	good_varsvarr   finalr  s              r-   convert_attention_masks_to_boolr    s^    VVM3
3FVVM3/F%%d+J^!!#Fvj0::OQWXD
4yA~j(7DIEyy3%/2MMuUU  %    	
9~:-IIrNEs8q=(c>#k)J#JK  	rN:&J	&vh
/0r0   )zhidden_states = blk(
                hidden_states,
                cu_seqlens=cu_seqlens,
                position_embeddings=position_embeddings,
                **kwargs,
            )zhidden_states = blk(
                hidden_states,
                cu_seqlens=cu_seqlens,
                rotary_pos_emb=rotary_pos_emb,
                position_embeddings=position_embeddings,
                **kwargs,
            ))zhidden_states = blk(
                hidden_states,
                cu_seqlens=cu_seqlens,
                position_embeddings=position_embeddings,
                attention_mask=attention_mask,
                **kwargs,
            )a  hidden_states = blk(
                hidden_states,
                cu_seqlens=cu_seqlens,
                rotary_pos_emb=rotary_pos_emb,
                position_embeddings=position_embeddings,
                attention_mask=attention_mask,
                **kwargs,
            )z
for LAYER in MODULELIST_ITEM:
$if self.gradient_checkpointing and self.training:
$    hidden_states = self._gradient_checkpointing_func(
$        LAYER.__call__, ARGS
$    )
$else:
$    hidden_states = LAYER(ARGS)
c                    [         R                  " UR                  5      nSU;  a  g [         R                  " UR                  5      nSU;   a  g [         H  u  pEUR                  XE5      nM      [        R                  " SU5      n[        U5      S:w  a  g US   nSU-   S-   S-   n[        R                  " X5      n	[        U	5      S:X  a  [        S	U  S
35        g  U	S   u  pn[        R                  " X5      R                  S5      n[        R                  5       n[        R                  " SSU5      nUR                  SU
5      R                  SU5      R                  SU5      R                  SU5      nUR                  X=S   US    U5      nSU;   a  g UR                  S5      nUS-   US-   S-  -   S-   nX#4$ !    g = f!    g = f)Nnn.ModuleList_gradient_checkpointing_func'(self\.[^\s]{1,}) = .*?nn\.ModuleList\(r   r   zfor ([^\s]{1,}) in z\:[\n]z+([\s]{4,})hidden_states = \1\(([^\)]{1,})\)zUnsloth: Failed patching z with gradient checkpointingz([^\s]{1,})[\s]?\=[\s]?\1z\1LAYERMODULELIST_ITEMARGS$=r   r   r   r   %self.gradient_checkpointing = False

)r>  r?  r.   r@  *custom_gradient_checkpointing_replacementsri   ra   rb   re   rg   rB  r   replace_gradient_checkpointingrs  rh   r   )rj   rk   initr@  custom_findcustom_replacemodulelist_itemsmodulelist_itemrx  r   layerr   rQ   r   replacers                  r-   patch_gradient_checkpointingr    s   !!&//2d"4$$V^^4%0 (R#//+> (R zz"LdS
!$&q)O 	09<6	7  ::f&D
4yA~)&1MNOq'E499V%**1-D-335H 66.t<D	%	 ):O!L		wwsF3  oog1gQ8(CG d{YYuF$;&1*++.WWD=[ 4 4s    F>  G >GG	srcr  c                   ^^^	 [         R                  " S[         R                  " U5       S[         R                  " U5       S3[         R                  5      nUR	                  U 5       Vs1 s H  o3R                  S5      iM     nnU(       d  U $ [         R                  " S[         R                  5      mS[        S[        4U4S jjm	U H  m[         R                  " S	[         R                  " T5       S
3[         R                  [         R                  -  [         R                  -  5      nS[         R                  S[        4UU	4S jjnUR                  X`5      n M     U $ s  snf )Nz8for (?:[^\s,]+,\s*)?(?P<layer>\w+)\s+in\s+(?:enumerate\(z\)|z)\s*:r  z(^|,)(\s*)([A-Za-z_]\w*)\s*=\s*rQ   r4  c                 (   > TR                  SU 5      $ )Nz\1\2)rh   )rQ   kw_at_start_patterns    r-   strip_kw_names2strip_kw_from_module_calls.<locals>.strip_kw_names7  s    "&&w55r0   z<
            (^[ \t]+)
            (\w+)\s*=\s*
            zj
            \(
                (
                    [^)]*?
                )
            \)
            mc                    > U R                  S5      U R                  S5      U R                  S5      p2nT" U5      nU U ST SU S3$ )Nr   rX   rn  r   r   r   )r   )r  indentoutvarrQ   new_argsr  r  s        r-   replace_call0strip_kw_from_module_calls.<locals>.replace_callI  sM    #$771:qwwqz1771:DF%d+HXfXSq
!<<r0   )ra   rU  r  rd   r   r   r   rc   VERBOSEMatchrh   )
r  r  for_patternr  
layer_varscall_patternr  r  r  r  s
          @@@r-   strip_kw_from_module_callsr  (  s4   **))O45S?9S8TTY	[
K
 -8,@,@,EF,Eq'''",EJF
***

6S 6S 6 zz YYu 	 LL299$rzz1
	=BHH 	= 	= 	=
 |1) , JE Gs   +E#c                 ~   [         R                  " UR                  5      nSU;  a  g [         R                  " UR                  5      nSU;   a  g [        R
                  " SU5      n[        U5      S:w  a  g US   n[        X55      nUR                  S5      nSU;  a  US-   US	-   S
-  -   S-   nX#4$ !    g = f!    g = f)Nr  r  r  r   r   r   zself.gradient_checkpointing =r   r   r   r  )	r>  r?  r.   r@  ra   rb   re   r  r   )rj   rk   r  r@  r  r  r   s          r-   )patch_gradient_checkpointing_layer_callerr  R  s    !!&//2d"4$$V^^4%0zz"LdS
!$&q)O(BGYYuF&d2d{fqjC//2[[= 4 4s    B1  B8 1B58B<z
attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
attention_mask_tensor = (1.0 - attention_mask_tensor).int()
a=  
if attention_mask_tensor.dtype == torch.bool:
    attention_mask_tensor = attention_mask_tensor.int()
elif torch.is_floating_point(attention_mask_tensor):
    attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
    attention_mask_tensor = (1.0 - attention_mask_tensor).int()
c                     [         R                  " [        5      R                  5       n[         R                  " [        5      R                  5       nU(       a  U(       d  U$ UR                  S5      S   nU(       d  U$ UR                  S5       H2  nXE;   d  M
  US [        U5      [        UR                  5       5      -
   n  O   U$ [         R                  " X&5      n[         R                  " X65      nUR                  Xx5      $ !   Us $ = f)Nr   r   )
textwrapdedentDTYPE_MISMATCH_FINDrs  DTYPE_MISMATCH_REPLACEr   re   lstripr  ri   )	rj   rk   	old_block	new_block
first_lineliner  indented_oldindented_news	            r-   )patch_finfo_attention_mask_dtype_mismatchr  t  s    OO$78>>@	OO$:;AAC		M__T*1-
MLL&D!=s4y3t{{}+==> '
 My9y9~~l99s$   AD "D ?D -D <D DzM(\brouting_weights\s*=\s*routing_weights\.to\(\s*)hidden_states(\.dtype\s*\))z\1router_logits\2
module_clsrk   c                    0 nU R                   R                  5        H  u  p4[        U[        [        45      (       a  UR
                  nO$[        U[        R                  5      (       a  UnOMQ  [        R                  " U5      n[        R                  " [        [        U5      u  pgUS:  d  M  XbU'   M     [        R                  " [        [        U5      U4$ )Nr   )__dict__r&  rI  staticmethodclassmethod__func__typesFunctionTyper>  r?  ra   subn MOE_ROUTING_WEIGHTS_CAST_PATTERN MOE_ROUTING_WEIGHTS_CAST_REPLACErh   )r  rk   new_route_sourcesr\  objfuncnew_route_sourcereplaced_counts           r-   patch_moe_routing_weights_castr    s    &//557cL+677<<DU//00D",,T2+-773SUu  xH  ,I(A-=k* 8 6624TV\]_pppr0   a  
torch_addmm = torch.addmm
torch_add   = torch.add
# @torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
def lora_forward(result, lora_A, lora_B, dropout, x, scaling):
    # Use result.dtype (bfloat16 from base layer) since x may have been cast to float32
    # by _cast_input_dtype when autocast is disabled
    target_dtype = result.dtype
    xA = dropout(x).to(target_dtype) @ lora_A.weight.to(target_dtype).t()
    # output = result + scaling * xA @ lora_B.weight.t()
    shape = result.shape
    output = torch_addmm(
        result.view(-1, shape[-1]),
        xA.view(-1, xA.shape[-1]),
        lora_B.weight.to(target_dtype).t(),
        alpha = scaling,
        beta = 1,
    ).view(shape)

    bias = lora_B.bias
    if bias is not None:
        output = torch_add(
            output,
            bias.to(target_dtype),
            alpha = scaling,
        )
    return output
pass

a$  
torch_addmm = torch.addmm
torch_add   = torch.add
torch_float16 = torch.float16
# @torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
def lora_forward(result, lora_A, lora_B, dropout, x, scaling):
    xA = dropout(x.to(torch_float16)) @ lora_A.weight.to(torch_float16).t()
    # output = result + scaling * xA @ lora_B.weight.t()
    shape = result.shape
    output = torch_addmm(
        result.view(-1, shape[-1]).to(torch_float16),
        xA.view(-1, xA.shape[-1]),
        lora_B.weight.to(torch_float16).t(),
        alpha = scaling,
        beta = 1,
    ).view(shape)

    bias = lora_B.bias
    if bias is not None:
        output = torch_add(
            output,
            bias.to(torch_float16),
            alpha = scaling,
        )
    return output
pass

c                   ^ [        5       nSn/ nU GH  u  pEn[        US5      (       d  M  UR                  R                  S:X  a  M6  [	        SU 3[        5       [        5       5        [        R                  " UR                  5      nUR                  S5      mUR                  S5      nSR                  U4S jU 5       5      n[        U5      nUR                  SS	5      nS
n	Sn
SnX;  a  X;  a  X;  a  O$SnUR                  X5      nUR                  X5      n UR                  SSS5      nSS/n[        R                  R!                  SS5      S:X  a#  SU;  a  SnU H  nUR                  X5      nM     OU H  nUR                  US	5      nM      UR                  SS	5      n[        U5      U:w  a  US-  n[        R                  R!                  SS5      S:X  a  ["        O[$        nS	nSUR'                  5       ;   a  UR                  SS5      nSnS	n[(        R*                  " SU5      (       a  Sn[-        U S3X-   U[/        [1        U5      5      SU SU  S3U-   S 9R2                  n[	        U S!U S"3[        5       [        5       5        GM  UR5                  U5        GM      US#::  a  [7        S$5        [        R                  R!                  S%S5      S&:X  a  [7        S$5        [7        U5        g )'Nr   r@  unsloth_forwardimport r   r   c              3   ,   >#    U  H	  oTS  v   M     g 7fr)   r;   r   s     r-   r   &patch_lora_forwards.<locals>.<genexpr>  s     6v!VW:vr   zresult = result.clone()r   z-output = lora_B(lora_A(dropout(x))) * scalingz6result = result + lora_B(lora_A(dropout(x))) * scalingzresult = result + outputz@return lora_forward(result, lora_A, lora_B, dropout, x, scaling)r6  zdef unsloth_forwardr   zx = x.to(lora_A.weight.dtype)z2x = self._cast_input_dtype(x, lora_A.weight.dtype)r   r$   ztorch.is_autocast_enabled()ziif not torch.is_autocast_enabled(): result, x = result.to(lora_A.weight.dtype), x.to(lora_A.weight.dtype)z,self._check_forward_args(x, *args, **kwargs)8bitz,result = self.base_layer(x, *args, **kwargs)zCresult = _call_8bit_base_layer(self.base_layer, x, *args, **kwargs)z
import torch._dynamo
@torch._dynamo.disable
def _call_8bit_base_layer(base_layer, x, *args, **kwargs):
    return base_layer(x, *args, **kwargs)
z\bVARIANT_KWARG_KEYS\bz~try:
    from peft.tuners.lora.layer import VARIANT_KWARG_KEYS
except ImportError:
    VARIANT_KWARG_KEYS = ['alora_offsets']
_peft_forwardztorch_compile_options = r   rY   .forward = forwardr   z@Unsloth: Not an error, but could not optimize some PEFT modules.r   r   )r   hasattrr@  r6   execlocalsglobalsr>  r?  r   r   rx   hashri   ru   r   r   COMPILED_LORA_FORWARD$COMPILED_LORA_FORWARD_forced_float32lowerra   rB  r   r   r   r  r!  rg   )torch_compile_optionsLinear_LoRA_Layerssuccesscould_not_replace_modulesfunctionparentchildrk   old_hashold1old2addri   replacementsr   compiled_lora_forwardextra_prependvariant_kwarg_importr@  r   s                      @r-   patch_lora_forwardsr    s	   /1G "#5%x++X$$(998wvh795""8#3#34U#d#6v66<  92> ?G(3#4XG^^D2F^^D2F !
  ,@
 ::>>1373>,F:4  ,G#^^G9F  , (4 (:

 <8#qLG ::>>"93?3F &9 " M&BY
B  $& yy2F;;C % *''%.DL!344LMbLccefivv o  F81UG#56	68L%,,V4] $6^ 	!|PQ	zz~~.4;PQ'(
r0   c           
          [         R                  " SSU 5      n [         R                  " SU 5      n[        U5      S:X  a  U $ U H1  u  p#pEU=(       d    UnSU SU SU SU S	3	nU R	                  X'5      n M3      U $ )
Nzhif self\.([^\(]{2,})\:\n[\s]{4,}(hidden\_state(?:s)?) \= ([^\s]{4,}) \* \2\n[\s]{4,}\2 \= residual \+ \2z-\2 = residual + \2 * (\3 if self.\1 else 1.0)z`[\s]{4,}((hidden\_state(?:s)?) \= residual \+ (?:(?:\2 \* ([^\n]{3,}))|(?:([^\n]{3,}) \* \2)))\nr   zs = z; z = torch.add(residual, z>, alpha = s) if type(s) is float else torch.addcmul(residual, z, s)
)ra   rh   rb   re   ri   )rk   r   r   hrZ  rightsri   s           r-   patch_residual_streamr  _  s    
 VV	  	9
F jj	 	
G 7|q-)0%ME1#Rs ##$# &''(c1 	
 
4 *1 	Mr0   c                    [        U 5      n[        SU 35      n UR                  n[        R                  " U5      n[        [        R                  " U5      R                  R                  5       5      S   R                  [        R                  :H  nU(       a  g [        R                  " UR                  5      n[        R                  " SU5      n[        U5      S:X  a  g SnU H  u  p[        SU
 35      n
[        [        R                  " U
R                  5      R                  R                  5       5      S   R                  [        R                  :H  nU(       d  M{  Sn[        SU
R                    SUR                    S	35        U	 S
3n[        R"                  " X S3U[        R$                  [        R&                  -  S9nM      U(       aQ  Sn[        R"                  " USU[        R$                  [        R&                  -  S9n[        R"                  " SSU5      nOg [        R                  " U5      R)                  [        R                  " U5      U5      nU$ !    g = f)Nmodeling_file.r  z-(self\.[^ ]{1,}) \= ([^\.]{1,})\._from_configr   FTzUnsloth: Patching z within z to fix gradient accumulation.z\(([^\)]{1,})\)z(\1, **kwargs)rV   zdef forward\(([^\)]{1,})\)zdef forward(\1, **kwargs)z\,[\s]{0,}\,rf  )r   r   r@  r>  r?  tuplerC  rD  rF  kind_VAR_KEYWORDr.   ra   rb   re   rg   r6   rh   rc   rd   ri   )r   rj   r  r@  rk   
has_kwargsr.   inner_classestotal_has_kwargs
call_classinner_class
regex_finds               r-   patch_gradient_accumulationr(    s    M"IN6(+,F..""7+ w((1<<CCEFrJOOSZSgSggJ$  1H JJOQYZM
=Qt%2!^K=9:7,,[-@-@ALLSSUVWYZ__cjcwcww
8";#7#7"8@QQopq#$56

|>$BFTVT]T]`b`l`lTlm &3 	2

$@&RTR[R[^`^j^jRjk f5 v&..w/@/@/I6RFMGs   "I I#c                 r    U R                  SS5      n U R                  SS5      n U R                  SS5      n U $ )Nzbif (final_logit_softcapping := self.config.get_text_config().final_logit_softcapping) is not None:zEif self.config.get_text_config().final_logit_softcapping is not None:z)logits = logits / final_logit_softcappingzGlogits = logits / self.config.get_text_config().final_logit_softcappingz)logits = logits * final_logit_softcappingzGlogits = logits * self.config.get_text_config().final_logit_softcapping)ri   r  s    r-   fixup_fused_lm_headr*    sI    ^^lOF ^^3QF ^^3QF
 Mr0   r3   normalized_shapeweightepsc                    [        U5      S:X  d   eU R                  nU R                  [        R                  5      R                  S5      n[        R                  " USSS9nU R                  [        R                  5      [        R                  " XS-   5      -  n UbQ  U R                  [        R                  5      UR                  [        R                  5      R                  SSSS5      -  n U R                  U5      $ )Nr   rX   T)r   keepdimr  )	re   r   tor\   r   powr   rsqrtreshape)r3   r+  r,  r-  original_dtypevs         r-   
rms_norm2dr6    s      A%%%WWN	U]]"A

1!T*A	U]]ekk!'22ADD&))EMM":"B"B1b!Q"OO44r0   c                     SS K n SS KnS UR                  R                  l        [
        UR                  R                  l        U (       a  [        S5          SS Kn[        UR                  R                  5      nU Vs/ s H  nSU;   d  M  UPM     nnU Hx  n [        SU 35         [        U5      R                  n[        US5      (       a  M;  [        R                  " US	S US
9n[        SU S35        U (       d  Mj  [        SU 35        Mz        SS Kn["        R$                  " UR&                  R(                  5      n[*        R,                  " SU5      nU Hx  n	 [        SU	 35         [        U	5      R                  n[        US5      (       a  M;  [        R                  " US	S US
9n[        SU	 S35        U (       d  Mj  [        SU	 35        Mz     g !    g = f!   U (       a  [        S5         GN= fs  snf !   U (       a  [        SU 35         GM  = f!   U (       a  [        S5         GN!= f!   U (       a  [        SU	 35         M  = f!   U (       a  [        S5         g = f)Nr   c                      g)NFr;   rP   s     r-   <lambda>%compile_timm_models.<locals>.<lambda>  s    Ur0   z'Unsloth: Compiled timm.layers.fast_normz/Unsloth: Failed compiling timm.layers.fast_normActz!from timm.layers.norm_act import z;Unsloth: Failed compiling from timm.layers.norm_act import get_compiler_configT)rL  dynamicoptionsztimm.layers.norm_act.r  z'Unsloth: Compiled timm.layers.norm_act.z.Unsloth: Failed compiling timm.layers.norm_actz#class ([^ ]{1,})\(.*?nn\.Module\)\:z-from timm.models._efficientnet_blocks import zGUnsloth: Failed compiling from timm.models._efficientnet_blocks import z!timm.models._efficientnet_blocks.z3Unsloth: Compiled timm.models._efficientnet_blocks.z:Unsloth: Failed compiling timm.models._efficientnet_blocks)timmtimm.layers.fast_normlayers	fast_normis_fast_normr6  rg   timm.layers.norm_actr   norm_actr  r   r@  r   r\   rU   timm.models._efficientnet_blocksr>  r?  models_efficientnet_blocksra   rb   )
r   r  r?  normsr3   normr@  efficientnet_blocksblocksblocks
             r-   compile_timm_modelsrN    sO   E$-J*+5(!;< 	E#DKK(()!0EqUaZE0D8?@
 4j((Gw 566mmGYnoG(.@AB%%?vFG  	 	Q/%//0P0PQBDWXEDUGLM
 5k))Gw 566mmGYnoG4UG;MNO%%KE7ST " 	wE!CD 1)WX\W]^_E!BD)cdicjklQ!NPs   G AG# 'H  
G=G=H  H+AH  H  A
I #H:1AI 	I G #G:=H  HH   H7:II I-c                 b    SS K n[        R                  R                  UR                  SS9Ul        [        R                  R                  UR
                  SS9Ul        U (       a  [        S5        g! [         a1  n[        U[        U5      5        U (       a  [        S5         S nAgS nAff = f)Nr   T	recursivez)Unsloth: Disabled compiling causal_conv1dz'Unsloth: Failed compiling causal_conv1dF)	causal_conv1dr\   compilerrN  causal_conv1d_fncausal_conv1d_updaterg   r}   r   )r   rR  r   s      r-   compile_causal_conv1drV  #  s    NN""=#A#ASW"X 	& NN""=#E#ESW"X 	*!=? aQ!;=	s   A0A3 3
B.='B))B.c                     SS K n[        R                  R                  UR                  R
                  R                  R                  SS9UR                  R
                  R                  l        [        R                  R                  UR                  R
                  R                  R                  SS9UR                  R
                  R                  l        [        R                  R                  UR                  R
                  R                  R                  SS9UR                  R
                  R                  l	        U (       a  [        S5        g!   U (       a  [        S5         g= f)Nr   TrP  z%Unsloth: Disabled compiling mamba_ssmz#Unsloth: Failed compiling mamba_ssmF)	mamba_ssmr\   rS  rN  opstritonssd_combinedmamba_chunk_scan_combined mamba_split_conv1d_scan_combinedselective_state_updaterg   )r   rX  s     r-   compile_mamba_ssmr_  6  s   NN""$$11KK  #  	))C NN""MM  --NN #  	))J NN""$$;;RR  #  	33J
 "9;!79s   EE E*)ParallelExpertsGraniteMoeHybridMoEGraniteMoeHybridMambaLayer	GptOssMLPGptOssExpertsGemma3nTextModelWhisperDecoder
model_typesdpa_dynamic_masksdpa_bool_maskssdpa_gqa_replacesdpa_dynamic_compilecompile_attentiondisable_causal_maskscompile_torch_modulescompile_custom_modulescompile_function_callsfuse_lm_headgradient_checkpointingmanual_replacementsfast_lora_forwardsfast_residual_streamaccurate_accumulationepilogue_fusionmax_autotuneshape_padding
cudagraphsdebugrL  import_from_cacherN  return_logitssupports_sdpac                 9  ^z SSK Jn   UR                  SU  35      nU=(       d#    [        R                  R                  SS5      S:H  n[        R                  R                  SS5      S:H  nU(       a  SnU(       a  [        S	5      e S
U  SU  3n [        SU 3[        5       5        [        U5      n[        US5      (       aG  [        US5      (       a5  Ub2  [        U5      [        L a  [        U5      S:X  d   eUR                  US'   g  [        S[        5       [!        5       5        [        S[        5       [!        5       5         SS KnSSKJmz  Uz4S jn U UR(                  R*                  l         [        R                  R                  SS5      S:H  n![        R                  R                  SS5      S:H  n"[        R                  R                  SS5      S:H  n#[        R                  R                  SS5      S:H  n$[/        UUUU!UU"U$SSSSSS9n%[1        U$U%5        [3        U$5      n&[5        U$5      n'U(       d  SOSn(S[        R                  ;  a  U([        R                  S'   O[        R                  S   S:H  n( U(       a  SOSn)S[        R                  ;  a  U)[        R                  S'   O[        R                  S   n) U)S:H  n)U(       d  U(       a  [-        S5        [7        U%5         SUl        [;        U5      n*[<        R>                  " U5      n+[        [@        RB                  " U*5      [@        RD                  " U* V,s/ s H  n,U+RG                  U,5      PM     sn,5         5      n*U*RI                  5       n-[K        U+5        U&(       d  SU+;   d  SU+;   a  [-        S 5         U'(       d  S!U+;   d  S"U+;   d  S#U+;   a  [-        S$5         [L        RN                  " S%U+5      n.S&S'RQ                  [L        RN                  " S%U+5      5      -   S(-   n/[L        RN                  " S)U/-   S*-   U+5      n0[        [R        RU                  U.U0-   5      5      n.U* V,s/ s H  n,U,U.;  d  U(       a	  U(       a  M  U,PM     n*n,[L        RN                  " S+U+5      n1/ n2U. H5  n3[        S,U3 35      n4[        U4S-5      (       d  M$  U2RW                  U35        M7      U2n./ n5/ n6/ n7/ n8U. H  n3[        S,U3 35      n4[<        R>                  " U45      n4S.U4;   a  U5RW                  U35        M@  S/U4;   d  S0U4;   a  S1U+;  a  U6RW                  U35        Me  S2U4;   d  S3U4;   d  S4U4;   a  S5U4;   a  M  U7RW                  U35        M  S6U4;   d  M  U8RW                  U35        M      [Y        U6U7-   U5-   5      n9U. V,s/ s H  n,U,U9;  d  M  U,PM     n.n,Sn:Ubc  [        U5      [        L a  [        U5      S:X  d   eS7U+;   a  S1U+;  a  US   S:w  a  SUS'   O%[        U65      S:w  a  US   S:w  a  SUS'   OSUS'   Sn: U:Ul        / n;U* H  n<[L        RN                  " S8[L        RZ                  " U<5      -   U+[L        R\                  S99n=[L        RN                  " S:[L        RZ                  " U<5      -   S;-   U+[L        R\                  S99n>[        U=5      S:w  d  M  [        U>5      S:w  d  M  U;RW                  U<5        M      U. V,s0 s H  n,U,S_M     n.n,U.R_                  5        H  n3[        S,U3 35      n4[<        R>                  " U4R`                  5      n4S<U4;   =(       d    S=U4;   (       + n[c        U35      Re                  S>5      (       a  SnU. H  n?U?U4;   d  M  U=(       a    U.U?   nM      U)(       a  UOSU.U3'   M      [L        RN                  " S?U+5      n@U@ V,s/ s H  n,U,U.;  d  M  U,U9;  d  M  U,PM     n@n,U6 V,s0 s H  n,U,S _M     n6n,/ nAU6R_                  5        GH  n3[        U S@U3 35      n4[<        R>                  " U4Rf                  5      n4SAnBSBnCU4nDU(       a7  [L        Rh                  " SCSDWD[L        R\                  [L        Rj                  -  S99nDO[        [L        RN                  " WBU4[L        R\                  S995      S:X  a(  [L        Rh                  " WBSEU4[L        R\                  S99nDU4nDO6[L        Rh                  " WCSFU4[L        R\                  S99nDWARW                  U35          WDU6U3'   GM      0 nE/ nFU(       a}  W@ Hv  n3[        U S@U3 35      n4[        U4SG5      (       d  M&  [<        R>                  " U4R`                  5      n4SnGWA H  n,U,U4;   d  M  SnG  O    WG(       d  Me  WFRW                  U35        Mx       [Y        5       nH[Y        5       nIU.Rm                  5        GH  u  n3n[        U S@U3 35      n4[        U4S-5      (       d  M*   [<        R>                  " U4R`                  5      nJ[<        R>                  " U4Rf                  5      n4SHU4;   d  SIU4;   d  SJWJ;   a   [-        SKU3 SL35        WHRo                  U35         SMU4;   d  SNU4;   a   [-        SKU3 SO35        WHRo                  U35         SPU4;   d  SQU4;   d  SRU4;   a   [-        SSU3 ST35        WHRo                  U35         SUU4;   a   [-        SSU3 SV35        WHRo                  U35         SWU4;   d  SXU4;   a   [-        SSU3 SY35        WHRo                  U35         [q        [r         V,s/ s H  n,U3Re                  U,5      PM     sn,5      (       a1  [-        SZU3 S[35        WHRo                  U35        WIRo                  U35         U(       a:  S\U4;   a4  [u        U45      nDUDU4:w  a"   [w        U3UU*SUWDS]9nK[-        S^U3 35        UKWEU3'    GM      U*[        WH5      -  n*[        U15      S:  a]  U1 HW  n3[q        [r         V,s/ s H  n,U3Re                  U,5      PM     sn,5      (       a   [-        SZU3 S[35        WIRo                  U35        MY     [        WI5      S:  a  WI H  n3 [w        U3UU*SSSa9nKUKWEU3'   M       U(       aE  U.Rm                  5        H0  u  n3nU3WH;   a  M   [w        U3UU*UUSa9nK[-        ScU3 S@35        UKWEU3'   M2       U(       a  U6Rm                  5        HF  u  n3nMU(       a  [{        U3WM5      nM  [w        U3UU*UU(       a  SOUWMS]9nK[-        SeU3 S@35        UKWEU3'   MH      U7 H%  n3 [w        U3UU*SSSa9nK[-        SgU3 S@35        UKWEU3'   M'       SnNWF H,  n3U3Re                  Si5      (       d  M  SnN[-        SjU Sk35          O    WF H  n3WN(       a  M  [        U S@U3 35      n4[        U4SG5      (       d  M/  U3Re                  Sl5      (       a  [-        SjU3 Sk35        MV  [        U S@U3 Sm3[        5       5        [-        SnU3 So35        M      U
(       a  SSpK>J?nO  [;        U5      nPUP H  n3[        (       d  [        (       d  [        (       a  M&  [        UU35      nQ[        UQ[        5      (       a  [        WQS-5      (       a  [        WQWO5      (       an   [<        R>                  " WQRf                  5      n4[        U45      nD[        UDU35      nD[        UD5      nDUDU4:w  a$   [w        U3UU*SSWDSSq9nK[-        SrU3 S@35        UKWEU3'    M       U(       at  W@ Hm  n3[        U S@U3 35      n4StU+;   a  U3[        ;   a  [        U3U45      nROM2  [        U3U45      nRWRc  MC  WRu  nJnS [w        U3UU*SSUSSUJSu9nKUKWEU3'   [-        SvU3 Sw35        Mo       W@ H  n3U3WE;   a  WEU3   n4Oz[        U S@U3 35      nT[        UTS-5      (       a!  [<        R>                  " WTRf                  5      n4OMS  [        U3U45      nDUDU4:w  a#   [w        U3UU*SSWDS]9nKUKWEU3'   [-        SvU3 Sy35         M      [        U85      S:  a  U8 H  n3[        U S@U3 35      nT[        UTS-5      (       a!  [<        R>                  " WTRf                  5      n4OMG  [        WTU45      u  nDnUUDU4:w  d  [        WU5      S:  a$   [w        U3UU*SSWDWUS{9nKUKWEU3'   [-        SvU3 S|35        M       U(       a<  [         H1  n3U3WE;   d  U3WH;   d  U3WF;   d  M  [-        S~U3 35        [        U3   WEU3'   M3       SSKOJPnV   UVR                  R                  S:w  a(  [<        R>                  " WVR                  5      nWUWUVlS        OWVR                  nW  SS KOnX[;        UXR                  5      nY/ nZUY H  n[U[WW;   d  M  WZRW                  W[5        M      [        SSRQ                  S WZ 5       5      -   S(-   [        5       5        [L        R                  " SWW5      R                  S5      S   n\UWRG                  SU\5      n]UWU\U] n^[L        R                  " SU^5      R                  S5      SS  n_[L        R                  " SUW5      R                  S5      n`SnaUaR                  S5      naSRQ                  UaS   /UaSS   V,s/ s H  n,W_U,SS  -   PM     sn,-   5      naWWR                  W^Ua5      nWSnaUaR                  S5      naSRQ                  UaS   /UaSS   V,s/ s H  n,W_U,SS  -   PM     sn,-   5      naWWR                  SUaS5      nW[L        R                  " SUW5      R                  S5      n`[L        Rh                  " SU`-   SUW[L        Rj                  S99nWUWR                  SS5      nWUWR                  SSS5      nWUWR                  SS5      nW[        UW[        5       5        [        WVlQ        U	(       Ga  [        5       nbU; GH  n3[        U S@U3 35      n<[        U<5      [        L a  [-        SU3 S35        M7   [<        R                  " U<5      nc [        WcR                  R_                  5       5      nd[<        R>                  " U<5      n4U4RG                  [c        Uc5      5      neUeS:X  a  U4RG                  S5      S-   neOWe[        [c        Wc5      5      -   neU4WeS  nf[L        Rh                  " SSUf[L        R\                  S99ng/ nhWd H  niUiWg;  d  M  WhRW                  Wi5        M      [        Wh5      S:X  a  GM5  Wh HH  nj[L        Rh                  " [L        RZ                  " Uj5      S-   S[c        Wc5      [L        R\                  S99ncMJ      SU3 3Wc-   Wf-   nc[-        SU3 S@35        U(       d  SU) SWc 3ncWcWEU3'   GM      U; H  n3U3WE;   a  M  [        U S@U3 35      n<[        U<5      [        L a  [-        SU3 S35        M>   [<        R>                  " U<5      n4 U(       a  [        U3U45      n4Snk[         H  nlUlU4;   d  M  Snk  O    Wk(       d  U(       d  SU) SU4 3n4[-        SU3 S@35        O[-        SU3 S35        SnkWb H  nmUmU3:X  d  M  Snk[-        SU3 S@35          O    Wk(       a  M  U4WEU3'   M       U(       a5  W@ H.  n3[        UU35      nDUDc  M  U3WE;   a  [-        SU3 S35        WDWEU3'   M0       / nnU- H  n3U3WE;   a  WnRW                  WEU3   5        M      SRQ                  Wn5      no [        [         SU  3UoUU*[        SU% S3-   [        -   S-   S9npU(       Ga  U(       Gdw  SSKhJhnr  Ur" 5         [         GH^  n3[        U S35      n4[        U4S5      (       d  M%  [        U4R                  U35      (       d  MB  [        SU3 35      n<[        U<S-5      (       d  Mc  [        U<Rf                  S5      (       a  M  [<        R>                  " U<Rf                  5      R                  5       n4[        U3U4UU*[        SU% S3-   SSSS9Rf                  nS[        U SU3 S3[        5       [!        5       5        [        U SU3 S3[        5       [!        5       5        WpbB  [        SU3 S3[        5       [!        5       5        [        SU3 S3[        5       [!        5       5        GMa       Wpb  U(       a  [-        SWp SU 35        g WER_                  5        H)  n3 [        U S@U3 SU3 3[        5       [!        5       5        M+      WER_                  5       ns[;        [        U 5      5      ntUt H  nu[        U S@Uu 35      n[[        U[5      [R        La  M'  W[Rm                  5        HM  u  nvnw[c        Uw5      nwSnxWs H4  nyUyWw;   a+   [        U S@Wu SWv SWy 3[        5       [!        5       5          OM6     MO     M      g !    g = f! [         a     g f = f!   [-        S5         GN3= fs  sn,f s  sn,f !    GM  = fs  sn,f s  sn,f !    GM  = fs  sn,f s  sn,f !    GM  = f!    GM  = f!    GM9  = fs  sn,f ! [x         a&  nL[-        S_U3 S`[c        UL5       35         S nLALGMl  S nLALff = fs  sn,f ! [x         a&  nL[-        SbU3 S`[c        UL5       35         S nLALGM  S nLALff = f! [x         a&  nL[-        SdU3 S`[c        UL5       35         S nLALGM  S nLALff = f! [x         a&  nL[-        SfU3 S`[c        UL5       35         S nLALGM   S nLALff = f! [x         a&  nL[-        ShU3 S`[c        UL5       35         S nLALGM  S nLALff = f!    GM  = f! [x         a%  nL[-        SsU3 S`[c        UL5       35         S nLALGNKS nLALff = f! [x         a&  nL[-        SxU3 S`[c        UL5       35         S nLALGMn  S nLALff = f! [x         a%  nL[-        SzU3 S`[c        UL5       35         S nLALGNS nLALff = f! [x         a%  nL[-        S}U3 S`[c        UL5       35         S nLALGN/S nLALff = f!   [        S5      e= fs  sn,f s  sn,f ! [x         a&  nL[-        SU3 S`[c        UL5       35         S nLALGMw  S nLALff = f! [x         a&  nL[-        SU3 S`[c        UL5       35         S nLALGM  S nLALff = f! [x         aW  nqU(       d  [        Wq5      eU$(       a1  [-        [c        Wq5      5        [-        [c        [;        Wp5      5      5        S np S nqAqGNS nqAqff = f!    GM  = f!    GN= f!    GN`= f!    GM6  = f!    GNi= f)Nr   )logging	modeling_UNSLOTH_COMPILE_DISABLEr$   r   partialTz?Unsloth: Fast residual stream optimization makes things slower!ztransformers.models.z
.modeling_r  __UNSLOTH_PATCHED____UNSLOTH_SUPPORTS_SDPA__r   z9model_logger.addFilter(HideLoggingMessage('`use_cache`'))z<model_logger.addFilter(HideLoggingMessage('compile_config')))tqdmc                     > SUS'   T" U 0 UD6$ )NzUnsloth: Compiling kernelsdescr;   )rQ   rR   r  s     r-   replaced_tqdm3unsloth_compile_transformers.<locals>.replaced_tqdm  s    9F6N(((r0   z=Unsloth: Failed editing tqdm to replace Inductor Compilation:UNSLOTH_COMPILE_DEBUGUNSLOTH_COMPILE_MAXIMUMUNSLOTH_COMPILE_IGNORE_ERRORSr   F)rw  rx  ry  r{  rz  coordinate_descent_tuningr  combo_kernelsgroup_fusionmemory_planningmulti_kerneluse_block_ptrUNSLOTH_RETURN_LOGITSUNSLOTH_FULLGRAPHz(Unsloth: Patching LoRA to make it fasterrT  rU  z**********
Unsloth: Please install `causal_conv1d` to speed up Mamba training via `pip install causal_conv1d`
If you don't, training will still work, just might be slower for Mamba type models.
**********
r\  r]  r^  z**********
Unsloth: Please install `mamba_ssm` to speed up Mamba training via `pip install mamba_ssm`
If you don't, training will still work, just might be slower for Mamba type models.
**********
z class ([^\s]{1,})\(.+?\.Module\)z(?:|r   zclass ([^\s]{1,})\(rg  z'class ([^\s]{1,})\(.+?PreTrainedModel\)r  r@  r  r_   ALL_ATTENTION_FUNCTIONSz_supports_sdpa = Falseznn.functional.softmaxflash_attn_varlen_func_flash_attention_forwardz
torch.topkzrouting_weights.toz_supports_sdpa = Truez\bdef[\s]{1,}rV   z[\s]{1,}z\(.+?\)z	nn.Linearr  VisionEmbeddingszclass ([^\s]{1,})\(.+?\)rY   z(is_causal \= True if (.+?\_mask) is None and q_len \> 1 else False[\n\s]{1,})([A-Za-z0-9\_]{1,}[\s]{1,}\=[\s]{1,}[A-Za-z\.]{1,}scaled\_dot\_product\_attention)(.+?attn\_mask[\s]{0,}\=[\s]{0,})\2(.+?is\_causal[\s]{0,}\=[\s]{0,})is\_causalz9(\=[\s]{1,}[A-Za-z\.]{1,}scaled\_dot\_product\_attention)rZ   r[   z\1\3\4None\5Truez.= disable_compile_scaled_dot_product_attention_update_causal_maskattn_weightszself.self_attn_ATTENTION_CLASSESzUnsloth: Will not compile z0 since it looks like it calls attention modules!zself.encoderBaseModelOutputz& since it looks like a vision encoder!ztorch.arange(ztorch.zeros(ztorch.ones(z#Unsloth: Failed compiling function z  since array creations are done.zfor layer in self.z since it looks like a decoder!znn.functional.padpaddingz since there is padding done.zUnsloth: Disabling compile for z! since it's marked for disabling.residual)rL  rN  rM  z$Unsloth: Faster residual stream for z'Unsloth: Failed faster residual stream r=  )rL  rN  z&Unsloth: Failed disabling modules for zUnsloth: Compiled module zUnsloth: Failed compiling z"Unsloth: Fast Attention patch for z)Unsloth: Failed Fast Attention patch for z"Unsloth: Slow Attention patch for z%Unsloth: Failed Slow Attention patch )r  Gemma3Modelz)Unsloth: Will not remove causal mask for z since it's a VLM!r  z,._update_causal_mask = no_update_causal_maskz!Unsloth: Removed causal mask for z to reduce memory usage.)GenerationMixin)rL  rN  rM  rO  z3Unsloth: Fast fused linear cross entropy patch for z6Unsloth: Failed Fast fused linear cross entropy patch z(GradientCheckpointingLayer))rL  rN  rM  rO  rP  zUnsloth: Patched z! by adding gradient checkpointingz-Unsloth: Failed gradient checkpointing patch z1 by fixing finfo dtype mismatch in attention maskz<Unsloth: Failed fixing finfo dtype mismatch in attention in )rL  rN  rM  rQ  z2 by casting routing_weights to router_logits dtypezBUnsloth: Failed casting routing_weights to router_logits dtype in z Unsloth: Manual replacement for )Trainer_fast_inner_training_loopz3Unsloth: Unsuccessfully patched inner_training_loopz"from transformers.trainer import (r   c              3   $   #    U  H  ov   M     g 7fr)   r;   r   s     r-   r   /unsloth_compile_transformers.<locals>.<genexpr>  s     9PZ!Zr   z'logger\.info\([\"\'].+?Running trainingr   z\n([\s\t]{1,})z([\s\t]{1,})u^  debug_info = \
        f"==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = {len(set(p.device for p in model.parameters()))}\n"\
        f"   {chr(92)}{chr(92)}   /|    Num examples = {num_examples:,} | Num Epochs = {num_train_epochs:,} | Total steps = {max_steps:,}\n"\
        f"O^O/ {chr(92)}_/ {chr(92)}    Batch size per device = {self._train_batch_size:,} | Gradient accumulation steps = {args.gradient_accumulation_steps}\n"\
        f"{chr(92)}        /    Data Parallel GPUs = {args.world_size} | Total batch size ({self._train_batch_size} x {args.gradient_accumulation_steps} x {args.world_size}) = {total_train_batch_size:,}\n"\
        f' "-____-"     Trainable parameters = {get_model_param_count(model, trainable_only=True):,} of {get_model_param_count(model):,} ({get_model_param_count(model, trainable_only=True)/get_model_param_count(model)*100:.2f}% trained)'
        f"🦥 Unsloth needs about 1-3 minutes to load everything - please wait!"
        logger.warning(debug_info)
        import gc
        for _ in range(3):
            gc.collect()
            torch.cuda.empty_cache()r      a3  n_total_devices = total_train_batch_size // \
            args.gradient_accumulation_steps // self._train_batch_size
        if n_total_devices > 1:
            logger.warning_once('Unsloth is running with multi GPUs - the effective batch size is multiplied by ' + str(n_total_devices))
        debug_info =zdebug_info =z
[\t\s]{1,}rb  r   z8train_dataloader = tpu_spmd_dataloader(train_dataloader)z:raise RuntimeError('Unsloth: TPUs are not yet supported!')_inner_training_loopzis_torch_tpu_available()FalsezUnsloth: Cannot patch z( since it's a torch.jit.script function.r  z\"\"\".+?\"\"\"z [\s]{0,}\=[\s]{0,}None[\s]{0,}\,r   zUnsloth: Fixed up function r;  z3, dynamic = True, options = torch_compile_options)
zUnsloth: Compiled function z!Unsloth: Cannot compile function z! since disabled keyword is in it.z%Unsloth: Will skip copying source of z'Unsloth: Will override already patched z  with gradient accumulation fix.r   z
torch_compile_options = r  )patch_torch_functionsz.torchr]   z
source.nn.r<  z.to(input.dtype)
)r   r!  r"  r#  z
.torch.nn.r  z.nn.zcombined_module.torch.nn.zcombined_module.nn.z3Unsloth: Exit auto compiler with combined_module = z, disable = z = combined_module.z['z'] = combined_module.)mr   r  
get_loggerru   r   r   NotImplementedErrorr  r  ModuleNotFoundErrorr   r   typer   re   r  r  torch._inductor.async_compile	torch.hubr  	_inductorasync_compilerg   r   rN  rV  r_  r  r  r   r>  r?  nparrayargsortr   copyr   ra   rb   rx   rJ  fromkeysr!  r  r  rc   rE  r.   r   endswithr@  rh   rd   r&  r  anyDISABLE_COMPILE_MODULESr  r_  r}   ro   transformers.generationr  OLD_CUDA_ARCH_VERSIONrA  OLD_TRITON_VERSIONrK  rI  
issubclassr*  rz  r  FIX_GC_LAYER_CALLER_MODULESr  r  r  r  r   transformers.trainerr  r  r6   _original_training_loopr  trainerrB  r   r   rH  r   ri   r  r   ScriptFunctionrC  rD  r  DISABLED_KEYWORDSr(  r   COMBINED_UNSLOTH_NAME_disabled_sdpa_code_cross_entropy_coder  _patch_functionsr]   rstrip_license_header){rg  rh  ri  rj  rk  rl  rm  rn  ro  rp  rq  rr  rs  rt  ru  rv  rw  rx  ry  rz  r{  rL  r|  rN  r}  r~  transformers_loggingmodel_loggerfull_disabler  r   r\   r  r  r  r  r   r  has_causal_conv1dhas_mamba_ssmr  r  r  full_sourcer3   ordered_functionstorch_modulesinherited_classinherited_modulespretrained_modulesfinal_torch_modulesrj   rk   gradient_checkpointed_modules$scaled_dot_product_attention_modulesfull_attention_modulesrouter_logit_cast_modulesremovalfinal_supports_sdpacalled_functionsr  definedcalledanother_moduleother_classes-disabled_scaled_dot_product_attention_modulescausal_mask_find!scaled_dot_product_attention_findr  all_standalone_classesremove_causal_masks
can_removebad_torch_modulesdisable_modulesr  r  r   rM  do_not_remover  r  module_classoutputr@  r  rQ  r  inner_training_loopr   items_in_trainer
good_itemsr   r   r   original_debugr   front_spaces
debug_infor'  rD  paramswherecode_sectioncleaned_code_section
bad_paramsparam	bad_parambadkeywordr(  final_all_standalone_classesall_codecombined_module	exceptionr  replaced_classescheck_dictscheckkeyrW  rm   replaced_classr  s{                                                                                                                             @r-   r   r   c  s   : =+66:,7OP Urzz~~.GMQTTLjjnn6<	IGtW!"cdd+J<z*NNw~&'3 (M}344="=>>(M*d2s=7IQ7NON#0#J#Ja  		DgiQWQYZ	GTZT\]O,"	) .;%%* 	 %'JJNN3JTW$X\_$_$&JJNN3LTW$X\_$_$&JJNN3RTW$X\_$_!$&JJNN3KTW$X\_$_5)#%%$;(  .0EF ..DE%&<=M (5C#bjj0.C

*+ "

+B Cs J  )c"**,*;

&'JJ':;)S0 +8912(,M%M"I##M2KRXXi(R[4\R[Q[5E5Ea5HR[4\)]^_I!(  , 	{	*.D.S	
 	 	$	37Y]h7h  mE  IT  mT	
 	 JJBKPMchhrzz2UWb'cddgjjO

#9O#Ke#SU`a}7H'HIJM%{Iq-)?G\dzII{ $NP[\ vh/069%%':'A'A&'I   	'M %'!+-( "vh/0''/V)V3)008,6:SW]:])<077?$.2Jf2TXrv|X|v%&--f5!V+%,,V4#  $ 	,	%	&G
 !.B1'1AQMB  M"d*s=/AQ/FGF#{29QYd9dQ5(T-*:56!;Q5(T-*:$M!"'.AM+ **-		(0CC[Z\ZcZcdK"))H*==
JKacajajkw<1V!1##H-  	 (55}!QX}M5$$&vh/0''8V$.K/V2KL	 v; 233
 I ,N'%G-*G	 , 	->	Ef' '( 	 JJ:KHM -]1-1GQAU\L\QM] =a+a<`qAdF<`(+a4616;;=(&23''7V; 	 I 	* 
JT		BLL0	J 2::.		JKqPVV$'II	
 $
VV5DII	
 >DDVL7A,V4S >T 	 #F^,AfX67F6#8998!++FOO< JB;!&J C z.55f= $ 	 eO*002	(&23vy))8	&&v7D&&v~~6F V#'76'AEY]aEa.vh6fgh!!&)V#'8F'B.vh6\]^!!&) f$&(@MU[D[7x?_`a!!&)  6)7x?^_`!!&) &(I,?7x?\]^!!&) ,CD,Cq",CDEE3F8;\]^!!&)'  J&$8.v6JV#!8&!$)"))3"J @IJ5?*62 C 3D 	'((I
"(F0GH0G1FOOA&0GHII7x?`ab##F+	 ) ?a%F
_4" %"
 2<&v. & 	 !.!4!4!6FI**HS4" )%
 1&;<1;&v. "7 	 &J&P&P&R"FN!E"" 4" )&-d3G%3
 :6(!DE1;&v.# 'S* 	 -F^4" %"
 :6(!DE1;&v. - 	 M%??FGG M=n=MM_`a	 &
 	%((&23v455x ??688=fXEWXYq(TUW^W`a1&9QRS & 	 ;m$F$$(9(9=O=O"=&9L,--',	2R2RWabnp  XA  XA$..|/C/CDF 18
0VD
::F
'w%<"*%(-&*-7.2&
  STZS[[\]^9C.v6 C D 	 #F^,AfX67F-<88FvvVF 5ffE~x"MD'f4" %"%,&+#	
 2<&v.)&1RST3 $8 	+++F3F 0&:;Jz9-- **:+=+=> C66RJV#y!8&!$)"&)3"J 6@*62-fX5fgh 9  : 	
$%)/F 0&:;Jz9-- **:+=+=> 'EZQW&X#JV#s;'7!';!8&!$)"&)3&1"J 6@*62-fX5ghi 1 02 	 +F//++--8AB1Fv1N&v. , 	 -R''004OO")"3"3G4P4P"Q.AG+")"A"A 	<//0J &&
(9(9$(? !	-		9PZ9P0P	PSV	VX_XabII@BUV[[\]^_`aE

"
"65
1C(s3NYY(.9??B12FF88O-@AGGJL(J !!$'JJqM?jQRQSn-Unfqunn-UUVJ-55njQJ
 !!$'JJqM?jQRQSn-Unfqunn-UUVJ-55njRST88M+>?EEaHL&&!4b:MWYWcWcd-55BD .55#Q .55" 		gi(#<G  +-&F~.ax89H H~/.vh6^_`!(!2!28!<J *//4467F&&x0FKKJ0E{FKK$5$9E#c*o"66%!%&>L#%66*<b,XZXaXa#b J 44%%e,   :!#X'	VVIIi(+NN
OII	
 ( x:5DJ/xq9:12C1DDx  zD  yE  F -7"6*] '^ 	&F//~.ax89H H~/.vh6^_`$..x8F 8H C,f$C - :;L:M  NB  CI  BJ  KF3F81=>9&AbcdC!/ F*CA&KL	 "0
 317&v.U 'V 	 #F4]FKJ!8//?xGghi-7"6* $ 	 $& #++(//0Fv0NO $ 	{{78H-$%Qzl3#,-B,C2FG#$&*+	
$ W@&&F.!18964(((699f--xj12H8Y//x'')>??&&x'7'78??AF)	#01F0GrJK-!$) g  N#:fX5GH')U[U]^.)fX5GH')U[U]^*1&9KLgiY_Yab1&9KLgiY_Yab7 '8 	,COCTT`ah`ijk )--/	N#1VH,?xH')U[U]^ 0
 	 .224dn-/0K~&aw/0:T!8**,JCJEE"2!U*/qr#>STbScdfmfoqwqyz  #3  ' 	# $ 	
_  2OMNr 5]L |. 	* C: 6 	* ^ ,b 	f H. 	D E* ! CF8>Z]^_Z`Yabc I   _>vhnUXYZU[T\]^^_&  S26(.QQRRS2  A&X[\]X^W_`a"  ^=fX^TWXYTZS[\]]^T( % w VW]V^^lmpqrmsltuvvwF  fEfX^\_`a\b[cdeef< ! yXY_X``norstounvwxxy: ! ^_e^fftuxyzu{t|}~~4RPQQ> .V .VF ! 26(.QQRd ! 26(.QQRD  y))!#i.!#c/*+, H( D 	&s  Ae> Af +Af Af(Af-.Af-(Af2
Af;*Af;Ag  Ag3
AgAg	AgAg
 Ag= Ag!"A Ag*Ag3
!Ag8%Ah+
Ah0!Ai#+Aj!Ak	8 Ak<#Al=$Al7"Am*A4#AnC(AAo D*Ao IAo
J6Ao#
OAo(VApZ<+Aq \Ar2` #Ar;a!Asb%Ase(Ase>Aff
AffAffAf%f2Af8gAggAgg!Ag'g*Ag0g8
Ah(hAh#h#Ah(h0
Ai h:AiiAi i#
Aji-AjjAjj
Akj AkkAkk	
Ak9kAk4k4Ak9k<All
Al4lAl/l/Al4l7
Am'mAm"m"Am'm*
Anm4AnnAnn
Aon&AooAooAoo(
Apo2AppApp
Aqp%AqqAqq
Ar/qAAr*r*Ar/r2Ar8r;As sAssAssAs	)F)r   r   TF)FNFFNNr)   )Ngh㈵>)llamaTTTTTTTTTTTTTFTTFTFFTFFFN)__all__typingr   r   r	   r
   r   r   r   r   r>  ra   r  importlib.utilnumpyr  ru   r\   
subprocessr  timer  ry   r  r  utilsr   r   r   r   r   logr   rZ  rt  
peft_utilsr   importlib.metadatar   r  	functoolsr   r   r   temporary_patches.commonr   hf_utilsr   jitr  r  r  getenv_locr   rt   __version__rA  majorminorr   get_device_capabilityr  r  r  	find_specUNSLOTH_STUDIO_ENABLEDr   r   Filterr&   r  r  r  r  r  rS   ro   r   r   	lru_cacher   r   r   r   r   r  r   r   r_  r  r`  cross_entropy_find_1ri   cross_entropy_replacement_1cross_entropy_find_2cross_entropy_replacement_2cross_entropy_find_3cross_entropy_replacement_3rr  rz  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r(  r*  Tensorintfloatr6  rN  rV  r_  r  r  boolr   r   r;   r0   r-   <module>r(     sf  " J I I  	    	       
      . ;  8  ? 1YY__((77N 2  WY.99/6D#' #;  !  E--.1AA  	&::335LE5"aZ8eaiE!E! V//0773CC  >>,-5"ZZ^^,EsKsR B B   & ' #* #J ++ +, + \  8,Z .. .  Q  , 
:v : "VF   Sh  { 	{x 9 v B &@@ G%'>?A D N\ G%'>?] ` &:t G%'>?u z 686868
yt 3	h 	 : !!. *D" 0b (C (# (# (T&  
 0 #s  #7  qs qC qE#tTWY\T\~J]D^ q   >( $:{x )T +X    &*	 || 3i  U\\"  
	  >	~ " 6     $+$($($($($($($($($($($($($($)$($($)$($)$)$($)$)$)$(5o o!o "o "	o
 "o "o "o "o "o "o "o "o "o "o "o  "!o" "#o$ "%o& "'o( ")o* "+o, "-o. "/o0 "1o2 "3o4 "5o` q^Ns   *M9 9N 