
    oi              +       F   S SK r S SKrS SKJr  S SKrS SKrS SKJrJrJ	r	J
r
  S SKJrJr  S SKJrJrJr  \ R$                  " \5      r\ R*                  " S5      r\ R.                  " 5       r\R3                  \5        \R5                  \5        SqSqS r0 rS	 r S4S
\R@                  RB                  S\RD                  4S jjr#                S5S\RH                  S\RH                  S\%S\RH                  S\RH                  S\RH                  S\&S\&S\&S\&S\%S\%S\%S\%S\%S\&S\&S\&S\&S \&S!\RH                  4*S" jjr'               S6S#\RH                  S\RH                  S\RH                  S\RH                  S\%S\%S\%S\%S \&S\&S\&S\&S$\&S\&S\%S\%S\&S%\&S\&S\&S!\RH                  4*S& jjr(               S7S\RH                  S#\RH                  S\RH                  S\RH                  S\%S\%S\%S\%S\&S\&S$\&S\&S\&S%\&S\&S\%S\%S\&S\&S \&S!\RH                  4*S' jjr) " S( S)\RT                  RV                  5      r,S* r-S+ r.S, r/            S8S\RH                  S\RH                  S\RH                  S\%S\RH                  S\&S\&S-\S.\S/\S\&S0\&S1\&S2\&4S3 jjr0g)9    N)asdict)!_autotuned_grouped_gemm_dW_kernel!_autotuned_grouped_gemm_dX_kernel_grouped_gemm_dW_kernel_grouped_gemm_dX_kernel)&_autotuned_grouped_gemm_forward_kernel_grouped_gemm_forward_kernel)KernelConfigBackward_dWKernelConfigBackward_dXKernelConfigForwardz@%(asctime)s::%(levelname)s,%(pathname)s:%(lineno)d:: %(message)sFc                  d    [         c$  [        R                  R                  5       S   S:  q [         $ )Nr   	   )_SUPPORTS_TMAtorchcudaget_device_capability     d/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/kernels/moe/grouped_gemm/interface.pysupports_tmar   *   s*    

88:1=Br   c                 l   ^ ^ T [         ;  a   0 mS[        S[        4UU 4S jjnU[         T '   [         T    $ )Nsize	alignmentc                    > US:X  d   eUT;  d  TU   R                  5       U :  a3  [        R                  " U T[        R                  S9TU'   SS0TU   l        TU   $ )N   devicedtypetypeignore)numelr   emptyint8__hibernate__)r   r   stream_per_stream_tensorsr   s      r   alloc_fn4get_per_device_per_stream_alloc_fn.<locals>.alloc_fn8   sp    ###11&v.446=.3kk65::/#F+ >DX<N#F+9&v..r   )_per_device_alloc_fnsint)r   r'   r&   s   ` @r   "get_per_device_per_stream_alloc_fnr+   4   sB    ** 
	/3 
	/3 
	/ 
	/ )1f% ((r   compiled_kernelbest_configc           	          U R                   nU R                  nU R                  nU R                  n[        R                  U SU SU SU 35        Ub  [        R                  U SU 35        g g )Nz	: n_regs=z
 n_spills=z
 metadata=z autotuned best_config: )namen_regsn_spillsmetadataloggerdebug)r,   r-   kernel_namenregsnspillsr2   s         r   log_kernel_infor8   H   s{     "&&K""E&&G''H
LL-yz'*XJO }$<[MJK r   XWtopkm_sizesgather_indicestopk_weights	permute_x	permute_yfuse_mul_postautotuneBLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_K	num_warps
num_stagesuse_tma_load_wuse_tma_load_xuse_tma_storeflattenr4   returnc                 6
  ^! U R                   R                  S:X  d   S5       eUR                   R                  S:X  d   S5       eU R                  5       n UR                  5       nUR                  5       nU(       a  U(       a   S5       eU(       a  U(       a   S5       eU(       a  U(       a   S5       eU=(       d    U=(       d    Un[        5       (       d#  U(       a  [        R
                  " S5        SnSnSnU(       d  U	(       a-  S	[        S
[        S[        4S jn[        R                  " U5        U R                  SU R                  S   5      n UR                  SUR                  S   5      nU(       d  U(       a  Uc   S5       eUR                  5       (       d   eUR                   R                  S:X  d   eUR                  S:X  d   eUR                  S   nUU-  nU(       a/  U R                  S   U:X  d   SU R                  S    SU S35       eOCU R                  S   U:X  d   SU R                  S    SU S35       eOU R                  S   nUU-  nUR                  S   nU R                  u  nnUR                  S   U-  nUUR                  S   :X  d   SU SUR                  S    S35       eU(       a  [        (       d  [        R
                  " S5        SqU(       d   S5       eUc   eUR                  5       U:X  d   eUR                   R                  S:X  d   eUR                  5       (       d   eUR                  S5      nU(       a-  [        SUR!                  5        SUR!                  5        35        ["        R$                  " UU4U R                   U R&                  S9nUS:X  d  US:X  a  U$ ["        R(                  R+                  S5      R,                  m!U!4S jnU	(       d$  [/        UU5      n[/        UU5      n[/        UU
5      n
U(       a^  [        SU< SU< S U< S!U< S"U< S#U
< S$U< S%U< S&U< 35        [        SUR!                  5        SXB-  R!                  5        35        0 S'U _S(U_S)U_S*U_S+U_S,U_S-U_S.U_S/U_S0U_S1U_S2T!_S3U_S4U_S5U_S6U_nU	(       d  UR1                  UUUU
UUUUS7.5        U	(       a  [2        O[4        nUU   " S80 UD6n U	(       a  [7        U UR8                  5        U$ [7        U 5        U$ )9a	  
Grouped GEMM forward pass for MoE MLPs.

The implementation offers a number of fusions specific to MoE:
- `permute_x`: fuse the permutation of hidden states from token order (original order) to grouped expert order, typically only needed for the first grouped GEMM in an MoE MLP.
    - When `permute_x` is True, `X` is expected to be of shape (num_tokens, K).
    - When `permute_x` is False, `X` is expected to be of shape (total_tokens, K) where `total_tokens = num_tokens * topk` AND already permuted to grouped expert order, i.e., hidden states are sorted such that tokens assigned to each expert are contiguous.
- `permute_y`: fused the permutation of the output from expert grouped order back to original token order, typically only needed for the second grouped GEMM in an MoE MLP.
- `fuse_mul_pre`: fuse the multiplication of the routed input with topk_weights, only done in the first grouped GEMM in an MoE MLP as for Llama4.  Do not use, since results in performance regression as it interrupts the GEMM mainloop.
- `fuse_mul_post`: fuse the multiplication of the routed output with topk_weights, used only when `permute_y` is True. NOTE: this should only be used when using this kernel for inference, not for training.

X: (M, K) hidden states where M is the num_tokens if `permute_x` is True, otherwise `total_tokens` where `total_tokens = num_tokens * topk`.
W: (E, N, K) expert weights, where E is number of experts, N in the intermediate (output) dim, and K is the reduction dim
m_sizes: tokens assigned to each expert which correspond to the size of M in the respective GEMMs in the grouped GEMM.
gather_indices: (total_tokens,) indices of tokens assigned to each expert.  E.g., slicing gather_indices by cumsum of m_sizes gives the indices of tokens assigned to each expert.
topk_weights: (total_tokens,) weights to multiply routed output by in expert MLP calculation, used only when `fuse_mul_post` is True (see note on `fuse_mul_post`).
use_fast_accum: currently unused; trade off faster accumulation dtype in GEMM for less precision.
use_tma_load_x: use TMA for loading activations, incompatible with permute_x.  TODO: add TMA gather / scatter support for Blackwell+.
use_tma_load_w: use TMA for loading weights.  If TMA supported, this should always be enabled as it is faster than global memory load.
use_tma_store: use TMA for storing output, incompatible with permute_y.  TODO: add TMA scatter support for Blackwell+.

Returns:
    y: (total_tokens, N) output of grouped GEMM
r   zX and W must be on CUDAzm_sizes must be on CUDACannot permute both X and Yz'Cannot use both TMA store and permute_yz,Cannot use both use_tma_load_x and permute_x0TMA not supported, tma_load will be set to FalseFr   r   r%   c                 J    [         R                  " U S[         R                  S9$ Nr   r   r   r"   r#   r   r   r%   s      r   r'   &grouped_gemm_forward.<locals>.alloc_fn       ;;tfejjIIr   zCgather_indices must be provided when permute_x or permute_y is True   r   zX.shape[0] (z) must match num_tokens ()z) must match total_tokens (zK (z) must match W.shape[1] (=fused_mul should only be used for inference, not for trainingTzFUSE_MUL requires PERMUTE_YDEBUG::GROUPED_GEMM  r   c                 
   > T4$ Nr   METANUM_SMSs    r   grid"grouped_gemm_forward.<locals>.grid       zr   !DEBUG::GROUPED_GEMM num_tokens =  topk =  num_experts =  N =  K =  BLOCK_SIZE_M =  BLOCK_SIZE_N =  BLOCK_SIZE_K = z permute_x = x_ptrw_ptrm_sizes_ptrgather_indices_ptrtopk_weights_ptry_ptr
NUM_TOKENSNUM_EXPERTSTOPKNKr`   	PERMUTE_X	PERMUTE_YFUSE_MUL_POSTFLATTEN)USE_TMA_LOAD_WUSE_TMA_LOAD_XUSE_TMA_STORErC   rD   rE   rF   rG   r   )r   r   
contiguousr   warningswarnr*   tritonset_allocatorviewshapeis_contiguousndim_FUSED_MUL_WARNr!   printtolistr   r"   r   r   get_device_propertiesmulti_processor_countminupdater   r	   r8   r-   )"r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   r4   use_tmar'   total_tokens
num_tokensnum_experts_rv   ru   yra   kernel_argskernelr,   r`   s"                                    @r   grouped_gemm_forwardr   V   s   h 88==F"=$==">>&(C*CC(	A	A  "G iG*GG(mW.WW,LLL}??-G>>gHI(	J3 	J3 	J 	J 	X&	r1772;A	r1772;AI&	QP	Q&++----$$))V333""a'''%++A.!T)

j(Qaggaj\)B:,aPQ( 
l*Uaggaj\)D\NRSTU* wwqz!T)
--"K77DAq	
k!A
?Kc!$=aggaj\KK?MMO #O777y'''!!#|333""''6111))++++#((,&|':':'<&=Q~?T?T?V>WX 	\1%!''JAqAFjj..v6LLG 1l+1l+<60:/4);K[<LFaVSYUVTZZk\h[ll}nzm~  P  AM  @Q  Q_  S\  R`  a	
 	"7>>#3"4A~7M6U6U6W5XY	
 	 	w	
 	n 	L 	 	j 	{ 	 	Q 	Q 	7" 	Y#$ 	Y%( 	), 	7-K0 "0"0!. , , ,&(		
  	/) 
 7=Tl6Q[6QO););< H 	(Hr   dYuse_tma_load_dyfuse_mul_prec                   ^$ U(       a   S5       eU(       a   S5       eU R                  5       (       d   eUR                  5       (       d   eUR                  5       (       d   eUR                  S:X  d   eU	(       a  U
(       a   S5       eU
(       a  U(       a   S5       eU	(       a  U(       a   S5       eU=(       d    U=(       d    Un[        5       (       d#  U(       a  [        R                  " S5        SnSnSnU(       d  U(       a-  S	[
        S
[
        S[
        4S jn[        R                  " U5        UR                  S   nU R                  SU R                  S   5      n UR                  SUR                  S   5      nU R                  u  nnUR                  u  nnUU-  nUU:X  d   SU SU S35       eUU-  S:X  d   SU SU S35       eUU-  nUR                  S   nUU:X  d   SU SU S35       eUU4n[        R                  " UU R                  U R                  S9n[        R                  R                  S5      R                   m$U$4S jn U(       d$  [#        UU5      n[#        UU5      n[#        UU5      nU(       aO  [%        SU< SU< SU< SU< SU< SU< SU< S U< S!U< S"T$< 35        [%        S#UR'                  5        35        U UUUUUUUUUT$U	U
US$.n!U(       d  U!R)                  UUUUUUUUS%.5        U(       a  [*        O[,        n"U"U    " S&0 U!D6n#U(       a  [/        U#U"R0                  5        U$ [/        U#5        U$ )'a[  
dX backward kernel
grad_output: (M, N)
gather_indices: (total_tokens,), indices of tokens assigned to each expert.  E.g., slicing gather_indices by cumsum of m_sizes gives the indices of tokens assigned to each expert.
m_sizes: tokens assigned to each expert which correspond to the size of M in the respective GEMMs in the grouped GEMM.
topk: number of experts chosen per token.
`permute_x`: whether X was permuted on load in the forward pass, typically only used for the first grouped GEMM in an MoE MLP to group tokens by expert.
- In the forward pass, if we permuted X on load, we need to permute store in the backward pass
- Shapes
    - the forward pass input X shape is [NUM_TOKENS, K], reduce across K, output y is [NUM_TOKENS * TOPK, K]
    - the backward pass input dy shape is [NUM_TOKENS * TOPK, N], reduce across N, output dX is [NUM_TOKENS * TOPK, K]
- Note that in the backward pass, the output size is still [NUM_TOKENS * TOPK, K] since we still need to accumulate gradients for each expert chosen by the token in a post-processing step.
`permute_y`: whether the output was permuted on store in the forward pass, typically only used for the second grouped GEMM in an MoE MLP to restore to the original token order.
- In the forward pass, if we permuted output on store (e.g., in the second grouped GEMM in fused MoE MLP), we need to permute on load to get from token order to expert grouped order
- We still store in contiguous order since we are writing out dX which will be the input to the backwards pass of the first grouped GEMM
`fuse_mul_{pre,post}`: always set to False since this should only be used for inference.
use_tma_load_dy: use TMA for loading dy. use_tma_load_dy is incompatible with permute_y.  TODO: add TMA gather / scatter support for Blackwell+ which will enable permute_y and use_tma_load_dy.
use_tma_load_w: use TMA for loading weights.  If TMA supported, this should always be enabled as it is faster than global memory load.
use_tma_store: use TMA for storing dX.  Incompatible with permute_x.  TODO: add TMA gather / scatter support for Blackwell+ which will enable permute_x and use_tma_store.
z@fuse_mul_pre should only be used for inference, not for trainingzAfuse_mul_post should only be used for inference, not for trainingrW   rN   &Cannot use both TMA load and permute_yz'Cannot use both TMA store and permute_xrO   Fr   r   r%   c                 J    [         R                  " U S[         R                  S9$ rQ   rR   rS   s      r   r'   !grouped_gemm_dX.<locals>.alloc_fnf  s    ;;tfejjIIr   r   rV   zGrad_output N (z) must match weight N (rX   z	M_total (z) must be divisible by topk (zTotal tokens (z) must match M_total (r   r   c                 
   > T4$ r]   r   r^   s    r   ra   grouped_gemm_dX.<locals>.grid  rc   r   rd   re   z output_shape = rf   rg   rh   ri   rj   rk    NUM_SMS = rZ   )dY_ptrrm   ro   rn   dX_ptrrs   rr   rt   ru   rv   r`   rw   rx   rz   )rC   rD   rE   rF   rG   USE_TMA_LOAD_dYr{   r}   r   )r   r   r   r   r   r*   r   r   r   r   r   zerosr   r   r   r   r   r   r   r   r   r   r   r8   r-   )%r   r:   r=   r<   r;   rC   rD   rE   r4   r?   r@   rH   r   rJ   rF   rG   rK   r   rA   rB   r   r'   r   M_totalN_gradN_totalrv   ru   r   r   output_shapedXra   r   r   r,   r`   s%                                       @r   grouped_gemm_dXr   !  s   V JIJ KJK??  """"<<1 iG*GG( oX0XX.mW.WW,@@=G>>gHI(	J3 	J3 	J 	J 	X&--"K	RXXb\	"B	r1772;AhhOGVJGQ;AQ;M/&1H1MM; 	$!A	7)8a@ADJ!''*LG	~%;G9AFG
 !!$L	\BIIrxx	HBjj..  7L16<01l+0:/4);L\<MM]{N^^d`a_eekghfll}nzm~  P  AM  @Q  Qb  S_  Rc  co  el  dp  q	
 	$W^^%5$678 ," %K(  , , ,&(#2"0!.		
 3;.@WF6<Tl6Q[6QO););< I 	(Ir   c                   ^' U(       a   S5       eU(       a   S5       eU(       d)  [         R                  R                  S5      R                  OSm'U R	                  SU R
                  S   5      R                  5       n UR                  5       nUR                  5       nU(       a  U	(       a   S5       eU	(       a  U
(       a   S5       eU(       a  U(       a   S5       eU
=(       d    U=(       d    Un[        5       (       d#  U(       a  [        R                  " S	5        S
nS
n
S
nU(       d  U(       a-  S[        S[        S[        4S jn[        R                  " U5        U(       d  U	(       a  Uc   eUR                  5       (       d   eUR                  R                  S:X  d   eUR                   S:X  d   eUR
                  S   nUU-  nU(       a  U R
                  S   U:X  d   eO*U R
                  S   U:X  d   eOU R
                  S   nUU-  nUR
                  S   nU R
                  u  nnUR
                  u  nnUU:X  d   SU SU S35       e[         R"                  " UUU4U R                  U R$                  S9nU(       d$  ['        UU5      n['        UU5      n['        UU5      nU'4S jnU(       a  [)        SU< SU< SU< SU< SU< SU< ST'< 35        [)        SUR+                  5       < 35        [)        SUR+                  5       < 35        Sn[-        U5       Hd  n UUUUU    -    n!Sn"U"UU    :  aC  U!U"U"U-    n#U(       a  U#U-  n#[)        SU  SU#R+                  5        35        U"U-  n"U"UU    :  a  MC  UUU    -  nMf     U UUUUUUUUUT'UU	US .n$U(       d  U$R/                  UUUU
UUUUS!.5        U(       a  [0        O[2        n%U%U   " S"0 U$D6n&U(       a  [5        U&U%R6                  5        U$ [5        U&5        U$ )#a  
X: (M, K) hidden states where M is the num_tokens if `permute_x` is True, otherwise `total_tokens` where `total_tokens = num_tokens * topk`.
dY: (M, N)
topk: number of experts to choose per token.
m_sizes: tokens assigned to each expert which correspond to the size of M in the respective GEMMs in the grouped GEMM.
gather_indices: (total_tokens,) indices of tokens assigned to each expert.  E.g., slicing gather_indices by cumsum of m_sizes gives the indices of tokens assigned to each expert.
permute_x: whether X was permuted on load in the forward pass, typically only used for the first grouped GEMM in an MoE MLP to group tokens by expert.
- for the first grouped GEMM, we permuted on load -> X was [num_tokens, K] and stored y in expert grouped order [num_tokens * topk, K]
- in the backwards pass, we need to permute on load of X while loading dy in contiguous (expert grouped) order
- since we are writing out dW, there is no need to permute on store
permute_y: whether the output was permuted on store in the forward pass, typically only used for the second grouped GEMM in an MoE MLP to restore to the original token order.
- for the second grouped GEMM, we permuted on store -> y was permuted from expert grouped order to token order while X was loaded in expert grouped order since it was the output of the first grouped GEMM
- in the backwards pass, we need to permute on load of dy to get from token order to expert grouped order to match the order of X
- since we are writing out dW, there is no need to permute on store
use_tma_load_dy: use TMA for loading dy. use_tma_load_dy is incompatible with permute_y.  TODO: add TMA gather / scatter support for Blackwell+ which will enable permute_y and use_tma_load_dy.
use_tma_load_x: use TMA for loading x. use_tma_load_x is incompatible with permute_x.  TODO: add TMA gather / scatter support for Blackwell+ which will enable permute_x and use_tma_load_x.
use_tma_store: use TMA for storing dW.  If TMA supported, this should always be enabled as it is faster than global memory store.
zfuse_mul_pre not supportedzfuse_mul_post not supportedr   rW   rV   rN   r   z&Cannot use both TMA load and permute_xrO   Fr   r   r%   c                 J    [         R                  " U S[         R                  S9$ rQ   rR   rS   s      r   r'   !grouped_gemm_dW.<locals>.alloc_fn  rU   r   r   zdY M (z) != total_tokens (rX   r   c                 
   > T4$ r]   r   r^   s    r   ra   grouped_gemm_dW.<locals>.grid$  rc   r   z)DEBUG::GROUPED_GEMM_DW_TMA num_experts = rg   rh   ri   rj   rk   r   z.DEBUG::GROUPED_GEMM_DW_TMA m_sizes.tolist() = z5DEBUG::GROUPED_GEMM_DW_TMA gather_indices.tolist() = z(DEBUG::GROUPED_GEMM_DW_TMA Token expert z
 indices: )rl   r   rn   ro   dW_ptrrr   rt   rs   ru   rv   r`   rw   rx   rz   )rC   rD   rE   r   r|   r}   rF   rG   r   )r   r   r   r   r   r   r~   r   r   r   r*   r   r   r   r   r   r   r   r   r   r   r   ranger   r   r   r8   r-   )(r9   r   r<   r=   r;   rC   rD   rE   r?   r@   r   rI   rJ   r   rA   rF   rG   rK   rB   r4   r   r'   r   r   r   r   rv   M_gradru   dWra   m_startiexpert_token_idxt_start	token_idxr   r   r,   r`   s(                                          @r   grouped_gemm_dWr     sI   P 999;;;  	

((0FF 
 	
r1772;**,A	B  "G iG*GG(oX0XX.nW/WW-@@=G>>gHI(	J3 	J3 	J 	J 	X&I)))++----$$))V333""a'''%++A.!T)
771:+++771:---wwqz!T)
--"K77DAqIFA\!VVF83F|nTU#VV!	k1a(188QWW	MB<61l+1l+ 8+)91&!GXHYYj[gZkk|myl}  ~J  @G  K  L	
 	?GNN,<+@ABFN,A,A,C+GHI{#A-g'!*8LMGGAJ&,Ww7MN	 )T 1I>qcIL\L\L^K_` <' GAJ& wqz!G $  , "'K,  , , ,#2"0!.&(		
 3;.@WF6<Tl6Q[6QO););< I 	(Ir   c                   4    \ rS rSr\S 5       r\S 5       rSrg)GroupedGemmil  c                    X@l         X`l        Xpl        Xl        Xl        Xl        Xl        Xl        Xl        Xl	        U R                  XX55        0 nU
bx  U
R                  US'   U
R                  US'   U
R                  US'   U
R                  US'   U
R                  US'   U
R                   US'   U
R"                  US'   U
R$                  US'   ['        S
UUUUUUUUU	US	.
UD6$ )NrC   rD   rE   rF   rG   rI   rH   rJ   )
r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   r   )r;   r?   r@   rA   kernel_config_fwdkernel_config_bwd_dXkernel_config_bwd_dWrB   dX_onlydW_onlysave_for_backwardrC   rD   rE   rF   rG   rI   rH   rJ   r   )ctxr9   r:   r<   r;   r=   r?   r@   r>   rA   r   r   r   rB   r   r   
fwd_configs                    r   forwardGroupedGemm.forwardm  s   & !!) 1#7 #7  	aG<
():)G)GJ~&):)G)GJ~&):)G)GJ~&&7&A&AJ{#'8'C'CJ|$+<+K+KJ'(+<+K+KJ'(*;*I*IJ'# 
+'!!)
 
 	
r   c                 h   U R                   u  p#pEU R                  nU R                  nU R                  nU R                  n	U R
                  n
U R                  nU R                  nU R                  nU R                  nU(       d"  U(       d
  U
c   S5       eU(       d
  Uc   S5       eU	(       a   S5       eU(       d  0 nUbx  UR                  US'   UR                  US'   UR                  US'   UR                  US'   UR                  US'   UR                  US	'   UR                   US
'   UR"                  US'   [%        SUUUUUUUUS.UD6nOS nU(       d  0 nU
bx  U
R                  US'   U
R&                  US'   U
R                  US'   U
R                  US'   U
R                  US'   U
R                  US	'   U
R                   US
'   U
R"                  US'   [)        SUUUUUUUUS.UD6nUS:  a4  U(       a-  UR+                  UR,                  S   US5      R/                  SS9nOS nUUS S S S S S S S S S S S S 4$ )Nz:kernel_config_bwd_dX must be provided if autotune is Falsez:kernel_config_bwd_dW must be provided if autotune is FalserY   r   rI   rJ   rC   rD   rE   rF   rG   )r9   r   r<   r=   r;   r?   r@   rB   rH   )r   r:   r<   r=   r;   r?   r@   rB   rW   r   rV   )dimr   )saved_tensorsr;   r?   r@   rA   r   r   rB   r   r   r   rI   rJ   rC   rD   rE   rF   rG   r   rH   r   r   r   sum)r   r   r9   r:   r<   r=   r;   r?   r@   rA   r   r   rB   r   r   bwd_dW_configr   bwd_dX_configr   s                      r   backwardGroupedGemm.backward  s   (+(9(9%gxxMM	MM	))"77"77<<++++(4POP4(4POP4 	KJ	K M#/3G3W3W/02F2U2U./1E1S1So.0D0Q0Qn-0D0Q0Qn-0D0Q0Qn--A-K-Kk*.B.M.Ml+  !!/%%#  B BM#/3G3W3W/02F2U2U./1E1S1So.0D0Q0Qn-0D0Q0Qn-0D0Q0Qn--A-K-Kk*.B.M.Ml+  !!/%%#  B axIWWQWWQZr266Q6?B 
 	
r   r   N)__name__
__module____qualname____firstlineno__staticmethodr   r   __static_attributes__r   r   r   r   r   l  s+    9
 9
v f
 f
r   r   c                     U(       + nU (       a  U(       a   S5       eU(       a  U (       a   S5       eU(       a  U(       a   S5       eU(       a  U(       a   S5       eU(       a  U (       a   S5       eU(       a  U(       a  U(       a   S5       eggg)z;
Check if the configuration is valid for the forward pass.
rN   z,Cannot permute X for the second grouped GEMMz+Cannot permute Y for the first grouped GEMMz*Cannot fuse mul for the first grouped GEMMz?Cannot use TMA load and permute X unless on sm100+ (Blackwell+)z\Cannot use TMA store and permute Y for the second grouped GEMM unless on sm100+ (Blackwell+)Nr   )r?   r@   rI   rH   rJ   rA   is_first_gemmis_second_gemms           r   check_valid_config_fwdr     s     '&NiG*GG(9656  	)545  	-434  	9IHI  	)fef (6)r   c                     U(       + nU(       a   S5       eU(       a  U(       a  U(       a   S5       eU(       a  U (       a  U(       a   S5       eggg)B
Check if the configuration is valid for the backward pass of dW.
2Cannot fuse_mul is not supported for backward pass=Cannot use TMA load and permute Y for the second grouped GEMMz<Cannot use TMA load and permute X for the first grouped GEMMNr   )r?   r@   use_tma_load_dYrI   rJ   rA   r   r   s           r   check_valid_config_bwd_dWr   3  sG     '&NJJJu)UUUu~TTTu (6}r   c                     U(       + nU(       a   S5       eU(       a  U(       a  U(       a   S5       eU(       a  U (       a  U(       a   S5       eggg)r   r   r   z=Cannot use TMA store and permute X for the first grouped GEMMNr   )r?   r@   r   rH   rJ   rA   r   r   s           r   check_valid_config_bwd_dXr   H  sG     '&NJJJu)UUUu}UUUu (5}r   r   r   r   r   r   r   c                 t   U(       d  U	c   S5       e[        UUU	R                  U	R                  U	R                  UUS9  Ub4  U(       d-  [	        UUUR
                  UR                  UR                  UUS9  U
b4  U(       d-  [        UUU
R
                  U
R                  U
R                  UUS9  U(       d  U(       a
  Uc   S5       eU(       a
  Uc   S5       eU R                  SU R                  S   5      n UR                  S5      nUR                  S5      n[        R                  U UUUUUUUUU	U
UUUU5      $ )aF	  
Grouped GEMM for MoE MLPs.

The implementation offers a number of fusions specific to MoE:
- `permute_x`: fuse the permutation of hidden states from token order (original order) to grouped expert order, typically only needed for the first grouped GEMM in an MoE MLP.
    - When `permute_x` is True, `X` is expected to be of shape (num_tokens, K).
    - When `permute_x` is False, `X` is expected to be of shape (total_tokens, K) where `total_tokens = num_tokens * topk` AND already permuted to grouped expert order, i.e., hidden states are sorted such that tokens assigned to each expert are contiguous.
- `permute_y`: fused the permutation of the output from expert grouped order back to original token order, typically only needed for the second grouped GEMM in an MoE MLP.
- `fuse_mul`: fuse the multiplication of the routed output with topk_weights, used only when `permute_y` is True. NOTE: this should only be used when using this kernel for inference, not for training.

X: (M, K) hidden states where M is the num_tokens if `permute_x` is True, otherwise `total_tokens` where `total_tokens = num_tokens * topk`.
W: (E, N, K) expert weights, where E is number of experts, N in the intermediate (output) dim, and K is the reduction dim
m_sizes: tokens assigned to each expert which correspond to the size of M in the respective GEMMs in the grouped GEMM.
gather_indices: (total_tokens,) indices of tokens assigned to each expert.  E.g., slicing gather_indices by cumsum of m_sizes gives the indices of tokens assigned to each expert. Needed when either `permute_x` or `permute_y` is True.
topk_weights: (total_tokens,) weights to multiply routed output by in expert MLP calculation, used only when `fuse_mul` is True (see note on `fuse_mul`).
kernel_config_fwd: KernelConfigForward for forward pass.
kernel_config_bwd_dX: KernelConfigBackward_dX for backward pass of dX.
kernel_config_bwd_dW: KernelConfigBackward_dW for backward pass of dW.
autotune: whether to autotune the kernel, if yes, kernel_config_fwd, kernel_config_bwd_dX, and kernel_config_bwd_dW will be ignored.
is_first_gemm: whether this is the first grouped GEMM in an MoE MLP.  This is needed to check whether kernel configs are valid.  `permute_x` should only be used for first gemm; `permute_y` should only be used for second gemm.
This will impact whether TMA can be used for loading and storing.

z7kernel_config_fwd must be provided if autotune is False)rI   rH   rJ   rA   r   )r   rI   rJ   rA   r   )r   rH   rJ   rA   r   zEgather_indices is required when either permute_x or permute_y is Truez3topk_weights is required when fuse_mul_post is TruerV   )r   rI   rH   rJ   r   r   r   r   r   r   apply)r9   r:   r<   r;   r=   r?   r@   r>   rA   r   r   r   rB   r   r   r   s                   r   grouped_gemmr   ]  sw   T )	ED	E) 	.==.==-;;))	
  +G%"6"F"F!5!D!D 4 B B - -  +G%"6"F"F!5!D!D 4 B B - - I&	SR	S& $	A@	A$ 	
r1772;All2G#((,N		 r   r]   )NNFFFF    r   r         FFFTF)r   r   r   FFFFFFr   r   TFFF)r   r   r   FFFFFFFr   r   TFF)NFFNFNNNFTFF)1loggingr   dataclassesr   r   r   grouped_gemm.kernels.backwardr   r   r   r   grouped_gemm.kernels.forwardr   r	   grouped_gemm.kernels.tuningr
   r   r   	getLoggerr   r3   	Formatter	formatterStreamHandlerchsetFormatter
addHandlerr   r   r   r)   r+   compilerCompiledKernelConfigr8   Tensorr*   boolr   r   r   autogradFunctionr   r   r   r   r   r   r   r   <module>r      sS          
		8	$F	
  	    "   )* SWL__33LBH--L& $(!%  3H||H||H H \\	H
 LLH ,,H H H H H H H  !H" #H$ %H& 'H( )H* +H. /H2 3H4 \\5Hb  !)]]||] LL] \\	]
 ] ] ] ] ] ] ] ] ] ] ]  !]" #]$ %]& ']( )]* \\+]L ! )h||hh \\h LL	h
 h h h h h h h h h h h  !h" #h$ %h& 'h( )h* \\+hVd
%..)) d
Nf@U*V4 $(-14848#k||k||k \\k 	k
 LLk k k +k 2k 2k k k  !k" #kr   