
    oi                        S SK r S SKJr  S SKrSSKJrJr  SrSr	Sr
\\
\	-  -
  r\ R                  S\R                  S\R                  4S	 j5       rS
 r\ R                  S\R                  S\R                  4S j5       rS rg)    N   )calculate_settingstorch_gpu_devicel           i   
BLOCK_SIZELONG_INDEXINGc                    [         R                  " S5      nU(       a}  UR                  [         R                  5      U-  [         R                  " SU5      R                  [         R                  5      -   n[         R
                  " U[         R                  5      nOXd-  [         R                  " SU5      -   nXs:  n[         R                  " X-   USS9R                  [         R                  5      n	[         R                  " X-   USS9n
U	[         R                  " U	5      -  nUR                  U
R                  5      nX-  n[         R                  " X'-   XS9  g )Nr   maskotherr   tl
program_idtoint64arangecastloadfloat32sigmoiddtypestore)egh
n_elementsr   r   	block_idxoffsetsr   e_rowg_rowf_rowh_rows                P/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/kernels/swiglu.py
_fg_kernelr%      s     a I,,rxx(:5		!Z8P8S8SHH9
 
 WWZ2
(299Q
+CCDGGAKa8;;BJJGEGGAKa8E BJJu%%EHHU[[!EME HHQ[%-    c           
      N  ^ U R                   u  p#nU R                  5       m[        R                  " X#U4U R                  U R
                  S9nU4S jn[        U R
                  5         [        U   " U UUT[        T[        ::  a  SOSS9  S S S 5        U$ ! , (       d  f       U$ = f)N)r   devicec                 :   > [         R                  " TU S   5      4$ Nr   tritoncdivmetar   s    r$   <lambda>"swiglu_fg_kernel.<locals>.<lambda>?       Zl1CDFr&   r   r   r   r   )
shapenumeltorchemptyr   r(   r   r%   r   INT32_SAFETY_BUFFER)r   r   batchseq_lenhdr   gridr   s          @r$   swiglu_fg_kernelr=   ;   s    EBJUR(!''AHHMAFD	!((	#4#!+/B!BA	
 
$ H 
$	# Hs   (#B
B$c                    [         R                  " S5      nU(       a}  UR                  [         R                  5      U-  [         R                  " SU5      R                  [         R                  5      -   n[         R
                  " U[         R                  5      nOXd-  [         R                  " SU5      -   nXs:  n[         R                  " X-   USS9n	[         R                  " X-   USS9R                  [         R                  5      n
[         R                  " X'-   USS9n[         R                  " U
5      nX-  nUR                  U	R                  5      nX-  nX-  nX-  nUR                  [         R                  5      U-  SU
SU-
  -  -   -  nUR                  U	R                  5      n[         R                  " X-   XS9  [         R                  " X-   XS9  [         R                  " X'-   UUS9  g)z
e = e.float()
se = 1.0 / (1.0 + torch.exp(-e))
f = (se * e).to(dtype)
h = f * g
df = DW * f
dg = DW * g
de = (dg.float() * se * (1.0 + e * (1.0 - se))).to(dtype)
r   r
   g      ?r   Nr   )DWr   r   r   r   r   r   r   r   DW_rowr    r!   se_rowr"   r#   df_rowdg_rowde_rows                     r$   _DWf_DW_dfg_kernelrE   L   s   $ a I,,rxx(:5		!Z8P8S8SHH9
 
 WWZ2
(299Q
+CCDWWR\$:FGGAKa8;;BJJGEGGAKa8E ZZFNEHHV\\"EME^F^FYYrzz"V+sUcFl5K/KLFYYv||$F HHR\5.HHQ[&.HHQ[&.r&   c           
         ^ UR                   u  p4UR                  5       mU4S jn[        UR                  5         [        U   " U UUT[
        T[        ::  a  SOSS9  S S S 5        XU4$ ! , (       d  f       N= f)Nc                 :   > [         R                  " TU S   5      4$ r*   r+   r.   s    r$   r0   *swiglu_DWf_DW_dfg_kernel.<locals>.<lambda>   r2   r&   r   r   r3   )r4   r5   r   r(   rE   r   r8   )r?   r   r   batch_seq_lenr;   r<   r   s         @r$   swiglu_DWf_DW_dfg_kernelrJ      sm    MJFD	!((	#4 #!+/B!BA	
 
$ !8O 
$	#s   #A**
A8)r,   triton.languagelanguager   r6   utilsr   r   NUM_INT32_ELEMENTSSAFE_INT32_BUFFER_MULTIPLIERr   r8   jit	constexprr%   r=   rE   rJ    r&   r$   <module>rS      s       7    
(:8T+TT  .
 . <<. .>" 2/
 2/ <<2/ 2/jr&   