
    oi&                     6   S SK r S SKJr  S SKrSSKJrJr  \ R                  S\R                  S\R                  S\R                  S\R                  S\R                  S	\R                  S
\R                  4S j5       r
S\R                  S\R                  S\R                  S\R                  S\R                  S\R                  S	\R                  S\R                  S
\R                  4S jr\ R                  " \5      r\ R                  " SS 05      " \5      r\ R                  S\R                  S\R                  S\R                  S\R                  S\R                  S	\R                  S
\R                  4S j5       r " S S\R                  R                  5      r\R"                  R$                  S&S\R&                  S\4S jj5       rS SKJr   " S S\5      r S SKJr   " S S\5      rS rS rSS \R<                  S!S"S#4S$ jrS% r g!    N#= f)'    N   )calculate_settingstorch_gpu_deviceY_row_strideX_row_strideW_row_strider_row_striden_colseps
BLOCK_SIZEc                 J   [         R                  " S5      n[         R                  " SU
5      nX:  nXU-  -  n X+U-  -  nXkU-  -  n[         R                  " X,-   USS9R	                  [         R
                  5      n[         R                  " XL-   USS9n[         R                  " X-  SS9U-  n[         R                  R                  UU	-   5      n[         R                  " UU5        UU-  nUR	                  UR                  5      nUU-  n[         R                  " X-   UUS9  g)z
Fast RMS Layernorm kernel
Inspiration from a Triton tutorial:
https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
r   maskotheraxisr   N)tl
program_idarangeloadtofloat32summathrsqrtstoredtypeYr   Xr   Wr   rr	   r
   r   r   row_idxcol_offsetsr   X_rowW_rowrow_varinv_varnormedoutputs                       W/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/kernels/rms_layernorm.py_rms_layernorm_forwardr-      s    & mmAG))Az*KD<	A<	A<	AGGAOD!<??

KEGGAOD!<EffU]1-6GggmmGcM*GHHQW_FYYu{{#Fe^FHHQ_fT2    dY_row_stridedX_row_strideGEMMAc                    [         R                  " S5      n[         R                  " SU5      nX:  nXU-  -  n XNU-  -  nXU	-  -  nU(       a  X.U-  -  nOU n[         R                  " X-   USS9R	                  [         R
                  5      n[         R                  " XO-   USS9R	                  [         R
                  5      n[         R                  " Xo-   USS9R	                  [         R
                  5      n[         R                  " U5      R	                  [         R
                  5      nUU-  nU(       a	  UUS-   -  nOUU-  n[         R                  " UU-  SS9nUU
-  U
U-  UU-  -
  -  n[         R                  " X/-   UUS9  g)z
Fast RMS Layernorm kernel for the backward pass
Inspiration from a Triton tutorial:
https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
r   r         ?r   r   N)r   r   r   r   r   r   r   r   )dYr/   dXr0   r!   r   r"   r   r#   r	   r
   r   r1   r   r$   r%   r   dY_rowr&   r'   r)   r*   dY_Wrowsum_dY_normedr+   s                            r,   _rms_layernorm_backwardr9   <   sU   , mmAG))Az*KDM
!!B<	A<	A
%%WWR%dA>AA"**MFGGAOD!<??

KEGGAOD!<??

KE ggajmmBJJ'GW_F%~vvdVmA6v$:J1J!JKFHHRvd3r.   c                     [        U S   5      $ )Nr1   )bool)argss    r,   <lambda>r=   t   s    d4=1r.   c                 T   [         R                  " S5      n[         R                  " SU
5      nX:  nXU-  -  n X+U-  -  nXkU-  -  n[         R                  " X,-   USS9R	                  [         R
                  5      n[         R                  " XL-   USS9R	                  [         R
                  5      n[         R                  " X-  SS9U-  n[         R                  R                  UU	-   5      n[         R                  " UU5        UU-  nUUS-   -  n[         R                  " X-   UUS9  g )Nr   r   r   r3   r   )
r   r   r   r   r   r   r   r   r   r   r   s                       r,   _gemma_rms_layernorm_forwardr?   y   s    " mmAG))Az*KD<	A<	A<	AGGAOD!<??

KEGGAOD!<??

KEffU]1-6GggmmGcM*GHHQW_Fus{#FHHQ_fT2r.   c            
           \ rS rSr\SS\R                  S\R                  S\S\4S jj5       r	\S\R                  4S j5       r
S	rg
)Fast_RMS_Layernorm   r!   r"   r   gemmac                    UR                   nUS   nUR                  SU5      nUR                   u  px[        U5      u  pUR                  n[        R
                  " Xx4UR                  US9n[        R
                  " U[        R                  US9nU(       a  [        O[        n[        U5         X4   " UUR                  S5      UUR                  S5      UUR                  S5      UUR                  S5      UUU	U
S9  S S S 5        X0l        Xl        Xl        X@l        U R!                  XU5        UR"                  " U6 $ ! , (       d  f       NG= f)Nr   devicer   )r   	num_warps)shapereshaper   rG   torchemptyr   r   r?   r-   r   strider   r   rH   r1   save_for_backwardview)ctxr!   r"   r   rC   rI   dimn_rowsr
   r   rH   rG   r    r#   fxs                  r,   forwardFast_RMS_Layernorm.forward   s   9IIb#  !36 :
KK(!''FKKKG-2)8Nf%yM'% & #!	aA&vvu~) &%s   .AE  
Er4   c                 z   UR                   nUS   nUR                  SU5      nU R                  u  pEnUR                   u  pxU R                  (       a  [        R
                  " U5      OUn	[        UR                  5         [        U4   " UUR                  S5      U	U	R                  S5      UUR                  S5      UUR                  S5      UUR                  S5      UU R                  U R                  U R                  U R                  S9  S S S 5        U	R                  " U6 n	U	S S S 4$ ! , (       d  f       N#= f)NrE   r   )r1   r   rH   )rI   rJ   saved_tensorsr1   rK   
empty_liker   rG   r9   rM   r   r   rH   rO   )
rP   r4   rI   rQ   r!   r"   r#   rR   r
   r5   s
             r,   backwardFast_RMS_Layernorm.backward   s   9ZZC ##a %(YYUb!Bbii(#VI.		!		!		 ^^MM! )& WWe_4t##) )(s   ?BD,,
D: NF)__name__
__module____qualname____firstlineno__staticmethodrK   Tensorfloatr;   rT   rY   __static_attributes__r[   r.   r,   rA   rA      sX    $ $ $E $$ $ $L $%,, $ $r.   rA   r!   rC   c                     U R                   n[        U S5      (       a  U R                  OU R                  n[        R                  XXB5      nU$ )Nvariance_epsilon)weighthasattrrf   r   rA   apply)	layernormr!   rC   r"   r   outs         r,   fast_rms_layernormrl      sM    &&A 9011 	""]] 
 
"
"1
4CJr.   LlamaRMSNormc                       \ rS rSrS rSrg)Unsloth_LlamaRMSNorm   c                     [        XSS9$ NF)rC   rl   selfr!   s     r,   rT   Unsloth_LlamaRMSNorm.forward   s    !$599r.   r[   Nr]   r^   r_   r`   rT   rd   r[   r.   r,   rp   rp      s    :r.   rp   )MllamaTextRMSNormc                       \ rS rSrS rSrg)Unsloth_MllamaTextRMSNormi  c                     [        XSS9$ rs   rt   ru   s     r,   rT   !Unsloth_MllamaTextRMSNorm.forward  s    %du==r.   r[   Nrx   r[   r.   r,   r{   r{     s    	>r.   r{   c                      SS K n [        U R                  R                  R                  l         SS Kn [        U R                  R                  R                  l
        g !    g = fNr   )(transformers.models.llama.modeling_llamarp   modelsllamamodeling_llamarn   *transformers.models.mllama.modeling_mllamar{   mllamamodeling_mllamary   transformerss    r,   patch_rms_layernormr   
  sX    3<PL,,99 & 	""22D
 
   -A A!c                      SS K n [        U R                  R                  R                  l         SS Kn [        U R                  R                  R                  l        g !    g = fr   )	r   rn   r   r   r   r   ry   r   r   r   s    r,   unpatch_rms_layernormr     sS    3<HL,,99GX""22D 
r      h㈵>   O    c                    SSK Jn  U" U 4US9R                  S5      n[        R                  R                  U5        [        R
                  " U5        [        R                  R                  R                  UR                  5        [        R                  " X5U 4USS9nUR                  5       n	UR                  S5        U	R                  S5        U" U5      n
[        R                  " X5U 4USSS9nU
R                  U5        UR                  R                  5       n[        Xy5      n
U
R                  U5        [        R                   " XR                  -
  5      R#                  5       S::  d   eg )	Nr   rm   )r   cudarF   T)r   rG   requires_gradg?)r   rn   r   rK   r   manual_seednninituniform_rg   randnclonerequires_grad_rY   gradrl   amaxitem)rQ   r   r   bszrandom_stateseqlenrn   rj   r!   XXr    YYcorrect_grads                r,   test_rms_layernormr   &  s    FcV3/226:I	JJ<(	l#	HHMM9++,S#&GA	
BTd!A	c3'Y]	^BJJrN66<<>L9)AJJrN::lWW,-224<<<r.   c                     S Hh  n [         R                  [         R                  4 HA  n[         R                  " SUS9   S H  nS H  n[	        U SUSUUS9  M     M     S S S 5        MC     Mj     g ! , (       d  f       MY  = f)	N)i   r      r   )device_typer   )r   r   i]  )r   *   r   r   )rQ   r   r   r   r   r   )rK   float16bfloat16autocastr   )rQ   r   r   r   s       r,   testing_suite_layernormr   B  so     mmU^^4EfeD/F(2*"%"&$)"$+7%+ )3 0 ED 5 !DDs   !A00
A?r\   )!tritontriton.languagelanguager   rK   utilsr   r   jit	constexprr-   r9   
heuristicsr?   autogradFunctionrA   compilerdisablerb   r;   rl   r   rn   rp   r   ry   r{   r   r   r   r   r   r[   r.   r,   <module>r      sf      7 #3,,#3 ,,	#3 ,,#3 ,,#3 LL#3 
#3 #3 #3L24<<24 <<	24 ,,24 ,,24 ,,24 LL24 
24 <<24 24j !**%<=  ++1 	  !3,,!3 ,,	!3 ,,!3 ,,!3 LL!3 
!3 !3 !3HH$00 H$X U\\ $   B:< :
		L>$5 >
 
MM
=8y	s   $H H