
    oik#                        S SK r S SKJr  S SKrSSKJrJrJr  Sr	Sr
Sr\	\\
-  -
  r\ R                  S\R                  S\R                  4S	 j5       rS
 r\ R                  S\R                  S\R                  4S j5       rS r\ R                  S\R                  S\R                  4S j5       rS r\ R                  S\R                  S\R                  4S j5       rS rg)    N   )calculate_settingstriton_tanhtorch_gpu_devicel           i   
BLOCK_SIZELONG_INDEXINGc                    [         R                  " S5      nU(       a}  UR                  [         R                  5      U-  [         R                  " SU5      R                  [         R                  5      -   n[         R
                  " U[         R                  5      nOXd-  [         R                  " SU5      -   nXs:  n[         R                  " X-   USS9R                  [         R                  5      n	[         R                  " X-   USS9n
SU	-  [         R                  R                  [         R                  R                  S5      U	-  5      S-   -  nUR                  U
R                  5      nX-  n[         R                  " X'-   XS9  g )Nr   maskother      ?       @      ?r   )tl
program_idtoint64arangecastloadfloat32matherfrsqrtdtypestore)egh
n_elementsr   r	   	block_idxoffsetsr   e_rowg_rowf_rowh_rows                O/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/kernels/geglu.py_exact_forward_kernelr*      s    a I,,rxx(:5		!Z8P8S8SHH9
 
 WWZ2
(299Q
+CCD GGAKa8;;BJJGEGGAKa8E%K277;;rww}}S'9E'ABSHIEHHU[[!EME HHQ[%-    c           
      >  ^ U R                   u  p#nU R                  5       mU R                  n[        R                  " X#U4U R
                  US9nU4S jn[        U5         [        U   " U UUT[        T[        ::  a  SOSS9  S S S 5        U$ ! , (       d  f       U$ = f)Nr   devicec                 :   > [         R                  " TU S   5      4$ Nr   tritoncdivmetar"   s    r)   <lambda>,geglu_exact_forward_kernel.<locals>.<lambda>D       Zl1CDFr+   r   r   r   r	   )
shapenumelr.   torchemptyr   r   r*   r   INT32_SAFETY_BUFFER	gateupbatchseq_lenhdr.   outgridr"   s	           @r)   geglu_exact_forward_kernelrG   ?   s    EBJ[[F
++ur*DJJ
PCFD	&	!d##!+/B!BA	
 
" J 
"	! J    #B
Bc                 v   [         R                  " S5      nU(       a}  UR                  [         R                  5      U-  [         R                  " SU5      R                  [         R                  5      -   n[         R
                  " U[         R                  5      nOXd-  [         R                  " SU5      -   nXs:  n[         R                  " X-   USS9n	[         R                  " X-   USS9R                  [         R                  5      n
[         R                  " X'-   USS9nS[         R                  R                  [         R                  R                  S5      U
-  5      S-   -  nX-  nUR                  U	R                  5      nX-  nX-  nX-  nSnUUU
-  [         R                  " SU
-  U
-  5      -  -   nUR                  [         R                  5      U-  nUR                  U	R                  5      n[         R                  " X-   XS9  [         R                  " X-   XS9  [         R                  " X'-   UUS9  g	)
z
f = 1/2 * e * (1 + erf(1/sqrt(2) * e))
h = f * up

df/de (with help of Wolfram :)
df/de = 1/2 * (1 + erf(1/sqrt(2) * e)) + 1/sqrt(2*pi) * e * exp(-1/2 * e^2)

Reuse via
f =        1/2 * (1 + erf(1/sqrt(2) * e)) * e
r   r   r   r   r   gQ63E?g      r   N)r   r   r   r   r   r   r   r   r   r   r   r   expr   )DWr   r    r"   r   r	   r#   r$   r   DW_rowr%   r&   f_partial_rowr'   r(   df_rowdg_rowtdf_dede_rows                       r)   _exact_backward_kernelrS   Q   s   & a I,,rxx(:5		!Z8P8S8SHH9
 
 WWZ2
(299Q
+CCDWWR\$:FGGAKa8;;BJJGEGGAKa8E 277;;rww}}S'9E'ABSHIM!EHHV\\"EME^F^F 	AAIte|e/C(DDDEYYrzz"U*FYYv||$F HHR\5.HHQ[&.HHQ[&.r+   c           
         ^ UR                   u  p4UR                  5       mU4S jn[        UR                  5         [        U   " U UUT[
        T[        ::  a  SOSS9  S S S 5        XU4$ ! , (       d  f       N= f)Nc                 :   > [         R                  " TU S   5      4$ r0   r1   r4   s    r)   r6   -geglu_exact_backward_kernel.<locals>.<lambda>   r8   r+   r   r   r9   )r:   r;   r   r.   rS   r   r>   rK   r   r    batch_seq_lenrD   rF   r"   s         @r)   geglu_exact_backward_kernelrY      sm    MJFD	!((	#t$#!+/B!BA	
 
$ !8O 
$	#   #A**
A8c                    [         R                  " S5      nU(       a}  UR                  [         R                  5      U-  [         R                  " SU5      R                  [         R                  5      -   n[         R
                  " U[         R                  5      nOXd-  [         R                  " SU5      -   nXs:  nSn	[         R                  " X-   USS9R                  [         R                  5      n
[         R                  " X-   USS9nSU
-  [        X-  SSU
-  U
-  -   -  5      S-   -  nUR                  UR                  5      nX-  n[         R                  " X'-   XS9  g )Nr   Q63E?r   r   r   Hm?r   r   r   r   r   r   r   r   r   r   r   r   )r   r    r!   r"   r   r	   r#   r$   r   sr%   r&   r'   r(   s                 r)   _approx_forward_kernelr`      s%    a I,,rxx(:5		!Z8P8S8SHH9
 
 WWZ2
(299Q
+CCD
 	AGGAKa8;;BJJGEGGAKa8E 	e{19h6F6N0N#OPSVVW 
 HHU[[!EME HHQ[%-r+   c           
      >  ^ U R                   u  p#nU R                  5       mU R                  n[        R                  " X#U4U R
                  US9nU4S jn[        U5         [        U   " U UUT[        T[        ::  a  SOSS9  S S S 5        U$ ! , (       d  f       U$ = f)Nr-   c                 :   > [         R                  " TU S   5      4$ r0   r1   r4   s    r)   r6   -geglu_approx_forward_kernel.<locals>.<lambda>   r8   r+   r   r   r9   )
r:   r;   r.   r<   r=   r   r   r`   r   r>   r?   s	           @r)   geglu_approx_forward_kernelrd      s    EBJ[[F
++ur*DJJ
PCFD	&	!t$#!+/B!BA	
 
" J 
"	! JrH   c                 "   [         R                  " S5      nU(       a}  UR                  [         R                  5      U-  [         R                  " SU5      R                  [         R                  5      -   n[         R
                  " U[         R                  5      nOXd-  [         R                  " SU5      -   nXs:  n[         R                  " X-   USS9n	[         R                  " X-   USS9R                  [         R                  5      n
[         R                  " X'-   USS9nSnX-  nUS-  U
-  U
-  nS[        X-   5      -   nSU-  nU* US-
  -  USU-  -   -  nUU-   nUU
-  nUR                  U	R                  5      nUU-  nU	U-  nX-  nUR                  [         R                  5      U-  nUR                  U	R                  5      n[         R                  " X-   UUS	9  [         R                  " X-   UUS	9  [         R                  " X'-   UUS	9  g
)a  
f = 1/2 * e * (1 + tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) ))
h = f * up

df/de (with help from https://arxiv.org/pdf/2305.12073.pdf :))
df/de = 1/2 * [1 + tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) )] +
        1/2 * sech^2 [   sqrt(2/pi) * x * (1 + 0.044715 * x^2 )  ] *                            ( sqrt(2/pi) * x * (1 + 0.044715 * x^2 * 3 ) )

Notice sech^2(x) = 1 - tanh^2(x)
So reuse tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) )

See https://www.desmos.com/calculator/nqprfoni6x
r   r   r\   r]   r   r   r   g      @r   Nr^   )rK   r   r    r"   r   r	   r#   r$   r   rL   r%   r&   r_   abTT2Q2rQ   r'   r(   rN   rO   rR   s                           r)   _approx_backward_kernelrk      s   . a I,,rxx(:5		!Z8P8S8SHH9
 
 WWZ2
(299Q
+CCDWWR\$:FGGAKa8;;BJJGEGGAKa8E 	A		A	Huu$Ak!%  A	qB
CAaK	(BGE JEHHV\\"EEMEe^F^FYYrzz"U*FYYv||$F HHR\5.HHQ[&.HHQ[&.r+   c           
         ^ UR                   u  p4UR                  5       mU4S jn[        UR                  5         [        U   " U UUT[
        T[        ::  a  SOSS9  S S S 5        XU4$ ! , (       d  f       N= f)Nc                 :   > [         R                  " TU S   5      4$ r0   r1   r4   s    r)   r6   .geglu_approx_backward_kernel.<locals>.<lambda>  r8   r+   r   r   r9   )r:   r;   r   r.   rk   r   r>   rW   s         @r)   geglu_approx_backward_kernelro     sm    MJFD	!((	#%#!+/B!BA	
 
$ !8O 
$	#rZ   )r2   triton.languagelanguager   r<   utilsr   r   r   NUM_INT32_ELEMENTSSAFE_INT32_BUFFER_MULTIPLIERr   r>   jit	constexprr*   rG   rS   rY   r`   rd   rk   ro    r+   r)   <module>rx      s%          
(:8T+TT  .
 . <<. .>$ 7/
 7/ <<7/ 7/t  !.
 !. <<!. !.H$ >/
 >/ <<>/ >/Br+   