
    oiK                        S SK r S SKJr  S SKrSSKJrJr  S SKJ	r	  \ R                  S\R                  S\R                  S\R                  4S j5       r\ R                  S\R                  S\R                  S\R                  4S	 j5       r " S
 S\R                  R                  5      rS rSS\R$                  SSS4S jrS rg)    N   )calculate_settingstorch_gpu_device)patch_layernormn_colseps
BLOCK_SIZEc                 P   [         R                  " S5      n[         R                  " SU
5      nX:  nXU-  -  n X+U-  -  nXk-  nX{-  n[         R                  " X,-   USS9R	                  [         R
                  5      n[         R                  " XL-   USS9R	                  [         R
                  5      n[         R                  " X\-   USS9R	                  [         R
                  5      n[         R                  " USS9U-  n[         R                  " XU-
  S5      n[         R                  " UU-  SS9U-  n[         R                  R                  UU	-   5      n[         R                  " UU5        [         R                  " UU5        UU-  U-  U-   n[         R                  " X-   UUS9  g Nr   )maskother)axis)r   )tl
program_idarangeloadtofloat32sumwheremathrsqrtstore)YY_row_strideXX_row_strideWbrmur   r   r	   row_idxcol_offsetsr   X_rowW_rowb_rowmean_XXXrow_varinv_varoutputs                         S/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/kernels/layernorm.pylayernorm_forwardr-      sV    mmAG))Az*KD<	A<	ALAMB GGAOD!<??

KEGGAOD!<??

KEGGAOD!<??

KEVVE!$v-F	$	*BffR"WQ'&0GggmmGcM*GHHQHHR7le#e+FHHQ_fT2    c                    [         R                  " S5      n[         R                  " SU
5      nX:  nXU-  -  n X+U-  -  nXk-  nX{-  n[         R                  " X-   USS9R	                  [         R
                  5      n[         R                  " X,-   USS9R	                  [         R
                  5      n[         R                  " XL-   USS9R	                  [         R
                  5      n[         R                  " X\-   USS9R	                  [         R
                  5      n[         R                  " U5      R	                  [         R
                  5      n[         R                  " U5      R	                  [         R
                  5      nUU-
  U-  nUU-  nU[         R                  " USS9U-  -
  U[         R                  " UU-  SS9-  U-  -
  nUU-  n[         R                  " X-   UUS9  g r   )r   r   r   r   r   r   r   r   )dYdY_row_strider   r   r   r   r    r!   r   r   r	   r"   r#   r   dY_rowr$   r%   r&   r*   meannormeddY_WdX_rows                          r,   layernorm_backwardr7   A   s    mmAG))Az*KDM
!!B<	ALAMB WWR%dA>AA"**MFGGAOD!<??

KEGGAOD!<??

KEGGAOD!<??

KEggajmmBJJ'G772;>>"**%Ddlg%FE>D
&&a
 6
)	*
266$-2
2V
;	< 
 gFHHRvd3r.   c                   4    \ rS rSr\S 5       r\S 5       rSrg)Fast_Layernormm   c                    UR                   nUS   nUR                  SU5      nUR                   u  px[        U5      u  pUR                  n[        R
                  " Xx4UR                  US9n[        R
                  " U[        R                  US9n[        R
                  " U[        R                  US9n[        U5         [        U4   " UUR                  S5      UUR                  S5      UUUUUUU	U
S9  S S S 5        X@l        Xl        Xl        U R                  XX=U5        UR                  " U6 $ ! , (       d  f       NB= f)Ndtypedevicer   r	   	num_warps)shapeviewr   r?   torchemptyr>   r   r   r-   strider   r	   rA   save_for_backward)ctxr   r   r   r   rB   dimn_rowsr   r	   rA   r?   r   r    r!   s                  r,   forwardFast_Layernorm.forwardn   s   BiFF2sO 26 :
KK(!''FKKKG[[Hf%vi('% & #!aA"-vvu~' &%s   ?8D33
Ec                    UR                   nUS   nUR                  SU5      nU R                  u  pEpgnUR                   u  p[        UR                  5         [
        U	4   " UUR                  S5      UUR                  S5      UUUUU
U R                  U R                  U R                  S9  S S S 5        UR                  " U6 nUS S S S 4$ ! , (       d  f       N$= f)Nr<   r   r@   )
rB   rC   saved_tensorsr   r?   r7   rF   r   r	   rA   )rH   r0   rB   rI   r   r   r   r    r!   rJ   r   dXs               r,   backwardFast_Layernorm.backward   s    BiWWR**aBbii(y)		! ^^MM ) WWe_4tT))! )(s   AC
C N)__name__
__module____qualname____firstlineno__staticmethodrK   rP   __static_attributes__rR   r.   r,   r9   r9   m   s)     @ * *r.   r9   c                     U R                   SL d   eU R                  nU R                  n[        U S5      (       a  U R                  OU R
                  n[        R                  XX45      nU$ )NTvariance_epsilon)elementwise_affineweightbiashasattrrZ   r   r9   apply)	layernormr   r   r]   r   outs         r,   fast_layernormrb      sj    ''4///A>>D 9011 	""]] 
 

qT
/CJr.      h㈵>   O    c                 &   SSK Jn  U" U 4USUS9n[        R                  R	                  U5        [        R                  " U5        [        R
                  R                  R                  UR                  5        [        R
                  R                  R                  UR                  5        [        R                  " X5U 4USS9nUR                  5       n	UR                  S5        U	R                  S5        U" U5      n
[        R                  " X5U 4USSS9nU
R                  U5        UR                  R                  5       n[        Xy5      n
U
R                  U5        [        R                   " XR                  5      R#                  5       S::  d   eg )	Nr   )	LayerNormcuda)r   r?   r>   r=   T)r>   r?   requires_gradg?)torch.nnri   rD   rj   manual_seednninituniform_r\   r]   randnclonerequires_grad_rP   gradrb   distitem)rI   r   r>   bszrandom_stateseqlenri   r`   r   r(   r   YYcorrect_grads                r,   test_layernormr|      s#    #3&feLI	JJ<(	l#	HHMM9++,	HHMM9>>*S#&GA	
BTd!A	c3'Y]	^BJJrN66<<>Ly%AJJrN::lGG,113s:::r.   c                     S Hh  n [         R                  [         R                  4 HA  n[         R                  " SUS9   S H  nS H  n[	        U SUSUUS9  M     M     S S S 5        MC     Mj     g ! , (       d  f       MY  = f)	N)i   rc      rj   )device_typer>   )rg   r~   i]  )rf   *   rd   re   )rI   r   r>   rw   rx   ry   )rD   float16bfloat16autocastr|   )rI   r>   ry   rx   s       r,   testing_suite_layernormr      so     mmU^^4EfeD/F(2&"%"&$)"$+7%+ )3 0 ED 5 !DDs   !A00
A?)tritontriton.languagelanguager   rD   utilsr   r   unsloth_zoo.patching_utilsr   jit	constexprr-   r7   autogradFunctionr9   rb   r   r|   r   rR   r.   r,   <module>r      s        7
 $3 LL$3 
$3 $3 $3N (4 LL(4 
(4 (4 (4V:*U^^,, :*z
 
MM
;:r.   