
    oi'              )          S SK Jr  S SKrS SKrS SKJrJrJrJr  S SK	J
r
JrJrJr  S SKJrJrJrJrJr  SSKJrJrJrJrJrJrJrJrJrJrJr  S r  SoS	 jr!               SpS\S\S\"S\"S\"S\"S\"S\"S\"S\"S\#S\#S\#S\#S\#S\"S\#S\"S\"4&S jjr$\RJ                  RM                  S\S  S!9\RJ                  RM                  S\\\/-   S" S!9\RJ                  RM                  S\S# S!9\RJ                  RM                  SS
S/S$ S!9S\S\S\S\"4S% j5       5       5       5       r'\RJ                  RM                  S\S& S!9\RJ                  RM                  S\\\/-   S' S!9\RJ                  RM                  S\S( S!9\RJ                  RM                  SS
S/S) S!9S\S\S\S\"4S* j5       5       5       5       r(\RJ                  RM                  SS+/S, S!9\RJ                  RM                  SS
S/S- S!9\RJ                  RM                  SS
S/S. S!9\RJ                  RM                  S\\/S/ S!9\RJ                  RM                  S\S0 S!9\RJ                  RM                  SS
S/S1 S!9S\S\S\"S\"S\"S\#4S2 j5       5       5       5       5       5       r)\RJ                  RM                  SS+/S3 S!9\RJ                  RM                  SS
S/S4 S!9\RJ                  RM                  SS
S/S5 S!9\RJ                  RM                  S\\/S6 S!9\RJ                  RM                  S\S7 S!9\RJ                  RM                  SS
S/S8 S!9S\S\S\"S\"S\"S\#4S9 j5       5       5       5       5       5       r*                  SqS\S\S\"S\"S:\"S\"S\"S\"S\"S\#S\#S\#S\#S\#S\#S\"S\"S\"S\"4&S; jjr+\RJ                  RM                  S\S< S!9\RJ                  RM                  S\SS \\/-   S= S!9\RJ                  RM                  S\S> S!9\RJ                  RM                  SS
S/S? S!9S\S\S\S\"4S@ j5       5       5       5       r,\RJ                  RM                  S\SA S!9\RJ                  RM                  S\SS \\/-   SB S!9\RJ                  RM                  S\SC S!9\RJ                  RM                  SS
S/SD S!9S\S\S\S\"4SE j5       5       5       5       r-\RJ                  RM                  SSF/SG S!9\RJ                  RM                  SS
S/SH S!9\RJ                  RM                  SS
S/SI S!9\RJ                  RM                  S\\/SJ S!9\RJ                  RM                  S\SK S!9\RJ                  RM                  SS
S/SL S!9S\S\S\"S\"S\"S\#4SM j5       5       5       5       5       5       r.\RJ                  RM                  SSF/SN S!9\RJ                  RM                  SS
S/SO S!9\RJ                  RM                  SS
S/SP S!9\RJ                  RM                  S\\/SQ S!9\RJ                  RM                  S\SR S!9\RJ                  RM                  SS
S/SS S!9S\S\S\"S\"S\"S\#4ST j5       5       5       5       5       5       r/               SrS\S\S\"S\"S\"S:\"S\"S\"S\#S\#S\#S\#S\#S\"S\"S\#S\"SU\"S\"S\"4(SV jjr0\RJ                  RM                  S\SW S!9\RJ                  RM                  S\\\/-   SX S!9\RJ                  RM                  S\SY S!9\RJ                  RM                  SS
S/SZ S!9 SsS\S\S\
S\"SU\"4
S[ jj5       5       5       5       r1\RJ                  RM                  S\S\ S!9\RJ                  RM                  S\\\/-   S] S!9\RJ                  RM                  S\S^ S!9\RJ                  RM                  SS
S/S_ S!9 SsS\S\S\
S\"SU\"4
S` jj5       5       5       5       r2\RJ                  RM                  SSF/Sa S!9\RJ                  RM                  SS
S/Sb S!9\RJ                  RM                  SS
S/Sc S!9\RJ                  RM                  S\\/Sd S!9\RJ                  RM                  S\Se S!9\RJ                  RM                  SS
S/Sf S!9S\S\S\"S\"S\"S\#4Sg j5       5       5       5       5       5       r3\RJ                  RM                  SSF/Sh S!9\RJ                  RM                  SS
S/Si S!9\RJ                  RM                  SS
S/Sj S!9\RJ                  RM                  S\\/Sk S!9\RJ                  RM                  S\Sl S!9\RJ                  RM                  SS
S/Sm S!9S\S\S\"S\"S\"S\#4Sn j5       5       5       5       5       5       r4g)t    )asdictN)grouped_gemmgrouped_gemm_dWgrouped_gemm_dXgrouped_gemm_forward)KernelConfigKernelConfigBackward_dWKernelConfigBackward_dXKernelConfigForward)calculate_topkget_routing_indicespermutetorch_grouped_gemm	unpermute   )DATA_CONFIGSKERNEL_CONFIGS_FWDLLAMA_MODEL_CONFIGQWEN_MODEL_CONFIGSMALL_MODEL_CONFIGS	TOLERANCE
DataConfigKERNEL_CONFIGS_BWD_dWKERNEL_CONFIGS_BWD_dXModelConfigmake_inputsFc                    U(       + nU (       a"  U(       a  U(       a  [        SU < SU< 35        gU(       a"  U (       a  U(       a  [        SU < SU< 35        gU(       a"  U(       a  U(       a  [        SU< SU< 35        gU(       a"  U(       a  U(       a  [        SU< SU< 35        gU(       a"  U(       a  U(       a  [        SU< SU< 35        gg	)
NzSkipping test: permute_x =  permute_y = Fz
 use_W2 = zSkipping test: permute_y = 
 use_W1 = zSkipping test: fuse_mul_post = z is_backward = T)print)	permute_x	permute_yuse_W1fuse_mul_postis_backwardverboseuse_W2s          e/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/kernels/moe/tests/test_grouped_gemm.pycheck_valid_configr)   4   s     ZFY0I>9.AB)0I>6+>?)0I>6+>?4M#5[vkBC}4M#55E{6FGH    Tdata_configmodel_configr!   r"   r#   r$   flattenuse_tma_load_wuse_tma_load_xuse_tma_storeBLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_K	num_warps
num_stagesautotunenum_autotune_configsallow_tma_storeuse_autogradc                    [        X#XES9(       d&  [        R                  " SU< SU< SU< SU< 35        U	(       a  U(       d  [        R                  " S5        [        U R                  U R
                  -  UR                  UR                  UR                  UR                  U R                  S9u  nnnnnUR                  nUR                  nUR                  nU(       a  UOUnU R                  U R
                  -  nUR                  u  nnnUR                  USU-  U4:X  d   eU(       a  UOUn U(       a-  UR                  UU4:X  d   S	UR                   S
U SU 35       eO2UR                  UU-  U4:X  d   S	UR                   S
U SU SU 35       eUU-  n!U(       a  U!SU-  4OU!U4n"[        UUUUS9u  n#n$U#R                  S5      n#U$R                  S5      n$[!        U$US9u  n%n&[#        U&5      U!:X  d   e[#        U%5      U:X  d   e[$        UR                     u  n'n(['        UU&U5      n)U)n*U(       a  U)R                  U!U4:X  d$  O	U!U4(       d   SU)R                   SU! SU 35       e[)        U*U U%S9n+U(       a  Un,OU)n,U(       a  SSKJn-  Ub  U-R.                  S U U-l        U(       a4  SSKJn.  [5        U
UUUUUUUUUU	S9n/U." U,U UU%U&U(       a  U#OS UUUU/UUS9n0OK[7        S/0 SU,_SU _SU_SU%_SU&_SU(       a  U#OS _SU_S U_S!U_S"U_S#U_S$U	_S%U_S&U
_S'U_S(U_S)U_S*U_S+U_6n0U+R                  U":X  d   eU0R                  U":X  d   eU(       a  [9        U+U&5      n+U(       a+  U(       d  [9        U+U&5      n+[9        U0U&5      n0U+U#S S 2S 4   -  n+[:        R<                  " U+U0U'U(S,9(       d8   S-U+U0-
  R?                  5       RA                  5       RC                  5       S. 35       eg )0N)r#   r$   1Skipping test due to invalid config: permute_x = r   r   z fuse_mul_post = @TMA store needs to be debugged due to non-deterministic behavior)MNKEtopkdtype   	X.shape: , num_tokens: , K: , topk: , N: use_sigmoidrenormalizenum_expertszXperm.shape: , total_tokens: XWm_sizesr   &_autotuned_grouped_gemm_forward_kernelr   )r1   r2   r3   r4   r5   r!   r"   r$   r.   r/   r0   )rQ   rR   rA   rS   gather_indicestopk_weightsr!   r"   r$   kernel_config_fwdr6   is_first_gemmrQ   rR   rA   rS   rW   rX   r!   r"   r$   r.   r/   r0   r6   r1   r2   r3   r4   r5   r-   atolrtolzGrouped gemm forward failed: .6f )"r)   pytestskipr   bsseq_lenintermediate_sizehidden_sizerN   rA   rB   rJ   rK   shaper   viewr   lenr   r   r   grouped_gemm.kernels.forwardrU   configsgrouped_gemm.interfacer   r   r   r   torchallcloseabsmaxitem)1r+   r,   r!   r"   r#   r$   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   X1X2W1W2gating_outputrA   rJ   rK   rQ   
num_tokensr@   r?   r>   rR   total_tokensoutput_shaperX   topk_idsexpert_token_countsrW   r\   r]   XpermXref
ref_outputX_testrU   r   rY   test_outputs1                                                    r(   _test_grouped_gemm_forwardr   d   s   0 v 	@I>9.P[RXQ\\n^k]op	
 _VW$/NN[000**$$$$  !!%!BBM D**K**K"A+"5"55JhhGAq!881q5!}$$$"Aww
 
 	C qwwi~j\qcB	C 

 ww
 
 	Q qwwi~j\$uQCP	Q 

 $L,2L!a%(q8IL+t;kL(  $$R(L}}R H*=hVW*X'~,..."#q(((177#JD$A~t,ED -3a((q8IK	u{{m#3L>qcJKI $!?RSJ W+6>>?T@TU 3:
 7/'''!#!!)++)
 #)++8<d!!) 1"
  + 


 
 *	

 ,
 ,9<d
 "
 "
 *
 ,
 ,
 *
  
 (
 (
  (!
" "#
$ $%
& '
* |+++,,,z>:
 ":~>J#K@K,q$w"77
>>KT [	&
[(@'E'E'G'K'K'M'R'R'TUX&YZ[ r*   kernel_configc                 "    U R                  SSS9$ NTinclude_tuning_paramsinclude_tma	to_stringxs    r(   <lambda>r         AKKDKQr*   )idsc                 n    SU R                    SU R                   SU R                   SU R                   3$ Nztopk=z num_experts=z hidden_size=z intermediate_size=rA   rN   re   rd   r   s    r(   r   r     A    eAFF8=}Q]]O[nop  pC  pC  oD  Er*   c                 :    SU R                    SU R                   3$ Nzseq_len=z dtype=rc   rB   r   s    r(   r   r         8AII;gaggY1Wr*   c                     SU  3$ Nzuse_W1=r_   r   s    r(   r   r   !  
    GA3-r*   c                 4    [        SU UUS.[        U5      D6  g )N)r+   r,   r#   r_   r   r   r+   r,   r   r#   s       r(    test_grouped_gemm_forward_manualr     s)    (  !# 
	r*   c                 "    U R                  SSS9$ r   r   r   s    r(   r   r   3  r   r*   c                 n    SU R                    SU R                   SU R                   SU R                   3$ r   r   r   s    r(   r   r   8  r   r*   c                 :    SU R                    SU R                   3$ r   r   r   s    r(   r   r   ;  r   r*   c                     SU  3$ r   r_   r   s    r(   r   r   =  r   r*   c                 6    [        SU UUSS.[        U5      D6  g NTr+   r,   r#   r9   r_   r   r   s       r(   )test_grouped_gemm_forward_manual_autogradr   0  s,    (  !#	
 
r*   
   c                     SU  3$ Nznum_autotune_configs=r_   r   s    r(   r   r   N      4I!2Mr*   c                     U (       a  S$ S$ Nr!    r_   r   s    r(   r   r   Q      q0Hb0Hr*   c                     U (       a  S$ S$ Nr"   r   r_   r   s    r(   r   r   T  r   r*   c                 n    SU R                    SU R                   SU R                   SU R                   3$ r   r   r   s    r(   r   r   Y  r   r*   c                 :    SU R                    SU R                   3$ r   r   r   s    r(   r   r   \  r   r*   c                     SU  3$ r   r_   r   s    r(   r   r   ^  r   r*   c                 $    [        U UUUUUSSS9  g )NTFr+   r,   r!   r"   r#   r7   r6   r9   r   r+   r,   r!   r"   r#   r7   s         r(   "test_grouped_gemm_forward_autotuner   M  s$    4 !#3	r*   c                     SU  3$ r   r_   r   s    r(   r   r   t  r   r*   c                     U (       a  S$ S$ r   r_   r   s    r(   r   r   w  r   r*   c                     U (       a  S$ S$ r   r_   r   s    r(   r   r   z  r   r*   c                 n    SU R                    SU R                   SU R                   SU R                   3$ r   r   r   s    r(   r   r     r   r*   c                 :    SU R                    SU R                   3$ r   r   r   s    r(   r   r     r   r*   c                     SU  3$ r   r_   r   s    r(   r   r     r   r*   c                 $    [        U UUUUUSSS9  g )NTr   r   r   s         r(   +test_grouped_gemm_forward_autotune_autogradr   s  s$    4 !#3	r*   use_tma_load_dyc                    [        X#USS9(       d"  [        R                  " SU< SU< SU< 35        U(       a  U(       d  [        R                  " S5        U(       a6  UR                  S::  a&  UR                  S::  a  [        R                  " S5        UR                  S	:  a  S
Ul        UR                  S	:  a  S
Ul        U(       + n[        U R                  U R                  -  UR                  UR                  UR                  UR                  U R                  SS9u  nnnnnUR                  nUR                  nUR                  nUR                  nU(       a  UOUnU R                  U R                  -  nUU-  nUR                  u  n n!n"UR                  U SU"-  U!4:X  d   eU(       a  UOUn#U(       a-  UR                  UU!4:X  d   SUR                   SU SU! 35       eO,UR                  UU"4:X  d   SUR                   SU SU" 35       eU#R                  5       R                  5       R!                  S5      n$[#        UUUUS9u  n%n&U%R%                  S5      n%U&R%                  S5      n&['        U&U S9u  n'n([)        U(5      U:X  d   e[)        U'5      U:X  d   e[*        UR                     u  n)n*[-        UU(U5      n+UR/                  5         U#R/                  5         U+R/                  5         U(       a  U+R                  UU!4:X  d   eUU"4(       d   eU(       a  USU"-  4OUU!4n,[1        U+U#U'S9n-U-R                  U,:X  d   SU-R                   SU, 35       eU(       a  [3        U-U(5      n-[4        R6                  " U-5      n.U-R9                  U.5        UR:                  c   eU#R:                  c   eU+R:                  n/U(       a  SSKJn0  U	b  U0R@                  S U	 U0l         U(       Ga  SSK!J"n1  U(       d%  [G        5       n2[I        UUUU
UUUUS9n3[K        5       n4O=SSKJ&n5Jn0  SSK'J(n6  U	b(  U0R@                  S U	 U0l         U6R@                  S U	 U6l         S n2S n3U(       a-  UR                  5       R                  5       R!                  S5      O,U+R                  5       R                  5       R!                  S5      n7U1" U7U$U'U(UUUUU2U3USS9n8U8R                  U-R                  :X  d!   SU8R                   S U-R                   35       e[4        RR                  " U8U-U)U*S!9(       d8   S"U8U--
  RU                  5       RW                  5       RY                  5       S# 35       eU8R9                  U.5        U7R:                  c   eU(       Ga  U(       Ga}  [3        U+R:                  U(5      n9U9R%                  UUU!5      R[                  S$S%9n:U:R                  U7R:                  R                  :X  d+   S&U:R                   S'U7R:                  R                   35       e[4        RR                  " U:U7R:                  U)U*S!9(       dB   S"U:U7R:                  -
  RU                  5       RW                  5       RY                  5       S# 35       eU7R:                  U:-
  RU                  5       RW                  5       RY                  5       n;U7R:                  UR:                  -
  RU                  5       RW                  5       RY                  5       n<[]        S(U;S# S)U<S# 35        g [4        RR                  " U7R:                  U/U)U*S!9(       dB   S"U7R:                  U/-
  RU                  5       RW                  5       RY                  5       S# 35       eg [_        SA0 S*U._S+U$_S,U(_S-U'_S.U_S/U_S0U_S1U_S2U_S3U_S4U_S5U
_S6U_S7U_S8U_S9U_S:U_6n=U(       a  U(       a  [3        U/U(5      n/U/R                  U=R                  :X  d!   S;U/R                   S<U=R                   35       eU/U=-
  RU                  5       RW                  5       RY                  5       n>[4        RR                  " U/U=U)U*S!9(       d   S=U>S# 35       eU(       Ga  U(       a  U/R%                  UUU!5      R[                  S$S%9n?U=R%                  UUU!5      R[                  S$S%9n@UR:                  U?-
  RU                  5       RW                  5       RY                  5       nAUR:                  U@-
  RU                  5       RW                  5       RY                  5       nBU?U@-
  RU                  5       RW                  5       RY                  5       nC[]        S>UAS# S?UBS# S@UCS# 35        g g g )BNT)r#   r%   r;   r   r   r<      z+Skipping autotuning for small model configsi   i   r=   r>   r?   r@   rA   rB   requires_gradrC   rD   rE   rF   rO   rH   rI   rL   rM   rP   zref_output.shape: z, output_shape: r   )!_autotuned_grouped_gemm_dX_kernelrV   )r   r.   r0   r1   r2   r3   r4   r5   )!_autotuned_grouped_gemm_dW_kernelr   rT   )rQ   rR   rS   rW   rA   r!   r"   r6   rY   kernel_config_bwd_dXrZ   dX_onlyztest_output.shape: z, ref_output.shape: r[   z3Grouped gemm backward_dX forward outputs mismatch: r^   r   )dimzmanual_grad_check.shape: z, X_.grad.shape: zmanual_diff: z, autograd_diff: dYrR   rW   rS   rA   r!   r"   r.   r   r0   r6   r1   r2   r3   r4   r5   r-   z<Grouped gemm manual backward_dX outputs mismatch: ref_grad: z, dX_test: z2Grouped gemm manual backward_dX outputs mismatch: zdiff_ref_check: z, diff_test_check: z, diff_check_test: r_   )0r)   r`   ra   rd   re   r   rb   rc   rN   rA   rB   rJ   rK   rf   detachclonerequires_grad_r   rg   r   rh   r   r   retain_gradr   r   rl   
randn_likebackwardgradgrouped_gemm.kernels.backwardr   rj   rk   r   r   r
   r	   r   ri   rU   rm   rn   ro   rp   sumr    r   )Dr+   r,   r!   r"   r   r.   r0   r#   r6   r7   r1   r2   r3   r4   r5   r-   r8   r9   r$   r'   rq   rr   rs   rt   ru   rA   rN   rJ   rK   rQ   rv   rw   r@   r?   r>   rR   W_testrX   ry   rz   rW   r\   r]   r{   rx   r}   grad_outputref_gradr   r   rY   r   kernel_config_bwd_dWr   rU   X_r   X_grad_unpermmanual_grad_checkmanual_diffautograd_diffdX_testdiffdX_ref_checkdX_test_checkdiff_ref_checkdiff_test_checkdiff_check_testsD                                                                       r(   _test_grouped_gemm_backward_dXr     s	   * iVSWX@I>9.P[RXQ\]	
 _VW 	**c1$$+AB %%,)-&$&#' ZF$/NN[000**$$$$  !!%!BBM D**K**K**K"A+"5"55J$LhhGAq!881q5!}$$$"Aww
 
 	C qwwi~j\qcB	C 

 ww
 
 	G qwwi/~U1#F	G 

 XXZ..t4F+t;kL(  $$R(L}}R H*=hVW*X'~,..."#{222177#JD$A~t,E MMOMMO	/55;;<++LLL!;LLL,2L!a%(q8IL#1@STJL(M	J,,--=l^LM( z>:
"":.K$6666zzHS+199:O;OP .5 7 3 5#:"1!/ -+++%'	$  $;#<  $/5==>S?ST 29 ;BB-- 7> !%#'   HHJ--d3%%'66t< 	
 #)+!! 1#7"
 !1!11	[ !2!2 33G
HXHXGYZ	[1~~D
 	u@+PZBZA_A_AaAeAeAgAlAlAnor@st	u 
 	[)ww""" %ejj.AM - 2 2:tQ G K KRS K T!''277==8e*+<+B+B*CCTUWU\U\UbUbTcde8>>!2774 |DFWZ\ZaZaFaEfEfEhElElEnEsEsEuvyDz{|  77%66;;=AACHHJKWWqvv-22488:??AMM+c!22CMRUCVWX
 	 >>$t sDbggPXFXE]E]E_EcEcEeEjEjElmpDqrs  	! 


 ,
 *	

 
 "
 "
 ,
 .
 *
  
 (
 (
 (
 "
  $!
" #
. VX~6 	'--'q	EhnnEUU`ahanan`opq'w##%))+002D>>'$t G	;D:FG  V  }}Zq9==A=FZq9==A=F&&</446::<AAC66M1668<<>CCE'-7<<>BBDIIK~c22EoVYEZZmn}  B  nC  D	
 yr*   c                 "    U R                  SSS9$ r   r   r   s    r(   r   r     r   r*   c                 n    SU R                    SU R                   SU R                   SU R                   3$ r   r   r   s    r(   r   r     r   r*   c                 :    SU R                    SU R                   3$ r   r   r   s    r(   r   r     r   r*   c                     SU  3$ r   r_   r   s    r(   r   r     r   r*   c                 6    [        SU UUSS.[        U5      D6  g NFr   r_   r   r   r   s       r(   $test_grouped_gemm_backward_dX_manualr     s,    ( # !#	
 
r*   c                 "    U R                  SSS9$ r   r   r   s    r(   r   r     r   r*   c                 n    SU R                    SU R                   SU R                   SU R                   3$ r   r   r   s    r(   r   r     r   r*   c                 :    SU R                    SU R                   3$ r   r   r   s    r(   r   r     r   r*   c                     SU  3$ r   r_   r   s    r(   r   r     r   r*   c                 6    [        SU UUSS.[        U5      D6  g r   r   r   s       r(   -test_grouped_gemm_backward_dX_manual_autogradr     s,    ( # !#	
 
r*      c                     SU  3$ r   r_   r   s    r(   r   r     r   r*   c                     U (       a  S$ S$ r   r_   r   s    r(   r   r      r   r*   c                     U (       a  S$ S$ r   r_   r   s    r(   r   r     r   r*   c                 n    SU R                    SU R                   SU R                   SU R                   3$ r   r   r   s    r(   r   r     r   r*   c                 :    SU R                    SU R                   3$ r   r   r   s    r(   r   r     r   r*   c                     SU  3$ r   r_   r   s    r(   r   r     r   r*   c                 $    [        U UUUUSSUS9  g )NTFr+   r,   r!   r"   r#   r6   r9   r7   r   r   s         r(   &test_grouped_gemm_backward_dX_autotuner     s$    6 #!#3	r*   c                     SU  3$ r   r_   r   s    r(   r   r   $  r   r*   c                     U (       a  S$ S$ r   r_   r   s    r(   r   r   '  r   r*   c                     U (       a  S$ S$ r   r_   r   s    r(   r   r   *  r   r*   c                 n    SU R                    SU R                   SU R                   SU R                   3$ r   r   r   s    r(   r   r   /  r   r*   c                 :    SU R                    SU R                   3$ r   r   r   s    r(   r   r   2  r   r*   c                     SU  3$ r   r_   r   s    r(   r   r   4  r   r*   c                 $    [        U UUUUSSUS9  g )NTr   r   r   s         r(   /test_grouped_gemm_backward_dX_autotune_autogradr  #  s$    6 #!#3	r*   debugc                    [        UUUUSS9(       d"  [        R                  " SU< SU< SU< 35        U(       a  U(       d  [        R                  " S5        [        U R                  U R
                  -  UR                  UR                  UR                  UR                  U R                  SS9u  nnnnnUR                  nUR                  nUR                  nUR                  nU(       a  UOUnU R                  U R
                  -  nUR                  u  nn n!UR                  USU!-  U 4:X  d   eU(       a  UOUn"U(       a-  UR                  UU 4:X  d   S	UR                   S
U SU  35       eO2UR                  UU-  U!4:X  d   S	UR                   S
U SU SU! 35       eUU-  n#U(       a  U#SU!-  4OU#U 4n$UR                  5       R                  5       R!                  S5      n%U"R                  5       R                  5       R!                  S5      n&[#        UUUUS9u  n'n(U'R%                  S5      n'U(R%                  S5      n(['        U(US9u  n)n*[)        U*5      U#:X  d   e[)        U)5      U:X  d   e[*        UR                     u  n+n,[-        UU*U5      n-U-R                  5       R                  5       R!                  S5      n.UR/                  5         U"R/                  5         U-R/                  5         U(       a  U-R                  U#U 4:X  d   eU#U!4(       d   eU(       a  U#SU!-  4OU#U 4n$[1        U-U"U)S9n/U/R                  U$:X  d   eU(       a  [3        U/U*5      n/[4        R6                  " U/5      n0U/R9                  U05        UR:                  c   eU"R:                  c   eU(       a  U%OU.n1U(       aK  [4        R<                  " SS9  [?        U5       H(  n2[A        SU2 SU"R:                  U2S S2S S24    35        M*     U(       a  SSK!J"n3  Ub  U3RF                  S U U3l#        U(       Ga  SSK$J%n4  U(       d!  [M        SSSUU	U
UUS9n5[O        UUUUU	U
UUS9n6O;SSK!J"n3  SSK(J)n7  Ub(  U7RF                  S U U7l#        U3RF                  S U U3l#        S n5S n6U4" U1U&U)U*UUUU5U6UUSS9n8U8R                  U/R                  :X  d!   SU8R                   S U/R                   35       e[4        RT                  " U8U/U+U,S!9(       d!   S"U8R                   S U/R                   35       eU8R9                  U05        U&R:                  c   eU&R:                  n9O?[W        S>0 S#U0_S$U1_S%U)_S&U*_S'U_S(U_S)U_S*U_S+U_S,U_S-U_S.U	_S/U
_S0U_S1U_S2U_S3U_S4U_6n9U"R:                  R                  U9R                  :X  d+   S5U"R:                  R                   S6U9R                   35       eU(       Ga  [4        RX                  " 5          [4        RT                  " U"R:                  U9U+U,S!9(       d5  [A        S7U"R:                  R[                  5       R]                  5       S8 35        [A        S9U9R[                  5       R]                  5       S8 35        [?        U5       H  n2[A        SU2 SU"R:                  U2S S2S S24    35        [A        SU2 S:U9U2S S2S S24    35        U"R:                  U2S S 2S S 24   U9U2S S 2S S 24   -
  R_                  5       Ra                  5       R]                  5       n:[A        SU2 S;U:S< 35        M     U"R:                  U9-
  R_                  5       Ra                  5       R]                  5       n;  S=U;S< 35       eU"R:                  U9-
  R_                  5       Ra                  5       R]                  5       n;[4        RT                  " U"R:                  U9U+U,S!9(       d   S=U;S< 35       eg ! , (       d  f       g = f)?NT)r$   r#   r%   r;   r   r   r<   r   rC   rD   rE   rF   rG   rH   rI   rL   rM   rP      )	precisionzExpert z weight grad:
   r   )r   rV   F)r.   r/   r0   r1   r2   r3   r4   r5   )r   r/   r0   r1   r2   r3   r4   r5   rT   )rQ   rR   rS   rW   rA   r!   r"   rY   r   r6   rZ   dW_onlyz4Grouped gemm autograd backward_dW outputs mismatch: z != r[   z<Grouped gemm autograd backward_dW forward outputs mismatch: r   rQ   rS   rW   rA   r!   r"   r   r/   r0   r1   r2   r3   r4   r5   r-   r6   r  z:Grouped gemm manual backward_dW outputs mismatch: W.grad: z, dW_test: zRef Wgrad sum: z.4fzTest Wgrad sum: z
 dW_test:
z diff: r^   z2Grouped gemm manual backward_dW outputs mismatch: r_   )1r)   r`   ra   r   rb   rc   rd   re   rN   rA   rB   rJ   rK   rf   r   r   r   r   rg   r   rh   r   r   r   r   r   rl   r   r   r   set_printoptionsranger    r   r   rj   rk   r   r   r	   ri   rU   rm   r   no_gradr   rp   rn   ro   )<r+   r,   r!   r"   r#   r   r/   r0   r1   r2   r3   r4   r5   r-   r6   r7   r8   r  r$   r9   rq   rr   rs   rt   ru   rA   rN   rJ   rK   rQ   rv   r@   r?   r>   rR   rw   rx   r~   r   rX   ry   rz   rW   r\   r]   r{   
Xperm_testr}   r   r   ir   r   rY   r   rU   r   dW_testexpert_diffr   s<                                                               r(   _test_grouped_gemm_backward_dWr  J  s   , % 	@I>9.P[RXQ\]	
 _VW$/NN[000**$$$$  !!%!BBM D**K**K**K"A+"5"55JhhGAq!881q5!}$$$"Aww
 
 	C qwwi~j\qcB	C 

 ww
 
 	Q qwwi~j\$uQCP	Q 

 $L,2L!a%(q8ILXXZ..t4FXXZ..t4F+t;kL(  $$R(L}}R H*=hVW*X'~,..."#{222177#JD$A~t,E%%'66t<J MMOMMO	/55;;<++LLL!;LLL,2L!a%(q8IL#1@STJ|+++ z>:
"":.K$6666 *B1-{#AGA3oaffQBQBY.?-@AB $ S+199:O;OP .5 7 3!&!& %+++%'
! $;"1!/ -+++%'	$  X $/:BB-- 7> 6==>S?ST 29 !%#' ")+!! 1#7"
 !1!11	lA+BSBSATTXYcYiYiXjk	l1~~D
 	tI+J[J[I\\`akaqaq`rs	t 
 	[){{&&&++! 


 *
 ,	

 
 "
 "
 .
 ,
 *
 (
 (
 (
 "
 $
  !
"  #
$ %
* 	
%m	CAFFLL>Q\]d]j]j\klm% ]]_>>!&&'$tL

(9(9(;C'@AB$W[[]%7%7%9#$>?@;'s/!&&BQB2C1DEFs+ga!RaRi.@-ABC vvaAgAq1AAFFHLLNSSUs'+c):;<	 ( FFW$))+//1668DOCD:NO  %%'++-224~~FFGD
 	K?SzJ	K 
# _s   5E=_
_+c                 "    U R                  SSS9$ NFTr   r   r   s    r(   r   r   ;      AKKTKRr*   c                 n    SU R                    SU R                   SU R                   SU R                   3$ r   r   r   s    r(   r   r   @  r   r*   c                 :    SU R                    SU R                   3$ r   r   r   s    r(   r   r   C  r   r*   c                     SU  3$ r   r_   r   s    r(   r   r   E  r   r*   c                 6    [        SU UUSS.[        U5      D6  g r   r  r   r+   r,   r   r#   r  s        r(   $test_grouped_gemm_backward_dW_manualr  8  s,    * # !#	
 
r*   c                 "    U R                  SSS9$ r  r   r   s    r(   r   r   Y  r  r*   c                 n    SU R                    SU R                   SU R                   SU R                   3$ r   r   r   s    r(   r   r   ^  r   r*   c                 :    SU R                    SU R                   3$ r   r   r   s    r(   r   r   a  r   r*   c                     SU  3$ r   r_   r   s    r(   r   r   c  r   r*   c                 6    [        SU UUSS.[        U5      D6  g r   r  r  s        r(   -test_grouped_gemm_backward_dW_manual_autogradr"  V  s,    * # !#	
 
r*   c                     SU  3$ r   r_   r   s    r(   r   r   u  r   r*   c                     U (       a  S$ S$ r   r_   r   s    r(   r   r   x  r   r*   c                     U (       a  S$ S$ r   r_   r   s    r(   r   r   {  r   r*   c                 n    SU R                    SU R                   SU R                   SU R                   3$ r   r   r   s    r(   r   r     r   r*   c                 :    SU R                    SU R                   3$ r   r   r   s    r(   r   r     r   r*   c                     SU  3$ r   r_   r   s    r(   r   r     r   r*   c                 $    [        U UUUUSSUS9  g )NTFr+   r,   r#   r!   r"   r6   r9   r7   r  r   s         r(   &test_grouped_gemm_backward_dW_autotuner,  t  s$    4 #!#3	r*   c                     SU  3$ r   r_   r   s    r(   r   r     r   r*   c                     U (       a  S$ S$ r   r_   r   s    r(   r   r     r   r*   c                     U (       a  S$ S$ r   r_   r   s    r(   r   r     r   r*   c                 n    SU R                    SU R                   SU R                   SU R                   3$ r   r   r   s    r(   r   r     r   r*   c                 :    SU R                    SU R                   3$ r   r   r   s    r(   r   r     r   r*   c                     SU  3$ r   r_   r   s    r(   r   r     r   r*   c                 $    [        U UUUUSSUS9  g )NTr*  r+  r   s         r(   /test_grouped_gemm_backward_dW_autotune_autogradr4    s$    4 #!#3	r*   )FFF)FTFFFNNNNNFNFF)FFFFFTFNNNNNNTFFF)FFFNNNNNTFNFFFF)F)5dataclassesr   r`   rl   rk   r   r   r   r   grouped_gemm.kernels.tuningr   r	   r
   r   grouped_gemm.reference.moe_opsr   r   r   r   r   commonr   r   r   r   r   r   r   r   r   r   r   SEEDr)   boolintr   markparametrizer   r   r   r   r   r   r   r   r  r  r  r"  r,  r4  r_   r*   r(   <module>r>     s2            	 Y^:2     $!-l[l[l[ l[ 	l[
 l[ l[ l[ l[ l[ l[ l[ l[ l[ l[  !l[$ %l[& 'l[* +l[, -l[` 
Q  
 ,.@AA E  
 <'W   D%=8OP ' 	 Q 
Q  
 ,.@AA E  
 <'W   D%=8OP ' 	 Q RD(M   $&H   $&H   *+ E  
 <'W   D%=8OP  	
   Q$( RD(M   $&H   $&H   *+ E  
 <'W   D%=8OP  	
   Q$(J !  $!'B
B
B
 B
 	B

 B
 B
 B
 B
 B
 B
 B
 B
 B
 B
 B
  !B
" #B
$ %B
& 'B
N 
Q  
 13DEE E  
 <'W   D%=8OP + 	 Q 
Q  
 13DEE E  
 <'W   D%=8OP + 	 Q RD(M   $&H   $&H   *+ E  
 <'W   D%=8OP  	
   Q$* RD(M   $&H   $&H   *+ E  
 <'W   D%=8OP  	
   Q$6 "  $!)kKkKkK kK 	kK
 kK kK kK kK kK kK kK kK kK kK kK  !kK" #kK$ %kK& 'kK( )kK\ 
R  
 -/@AA E  
 <'W   D%=8OP    	
  Q  
R  
 -/@AA E  
 <'W   D%=8OP    	
  Q  RD(M   $&H   $&H   *+ E  
 <'W   D%=8OP  	
   Q$( RD(M   $&H   $&H   *+ E  
 <'W   D%=8OP  	
   Q$r*   