
    oi                         % S SK r S SKrS SKJr  S SKJr  S SKrS SKrS SKJ	r	  S SK
JrJr  S SKJr  S SKJrJrJr  S SKJrJr  \R,                  S	\R.                  S
\R0                  S0rSrSrS/r\R,                  /rSr\S;S j5       rS r S r!S\RD                  4S jr#\RH                  " 5          S<S\RD                  S\RD                  4S jj5       r%S\RD                  S\RD                  S\RL                  RN                  4S jr(   S<S\RL                  RN                  S\RL                  RN                  4S jjr)\RT                  S  5       r+\RX                  R[                  S!SS"/S# S$9\RX                  R[                  S%SS"/S& S$9\RX                  R[                  S'S/S( S$9\RX                  R[                  S)S"/S* S$9\RX                  R[                  S+\S, S$9\RX                  R[                  S-\\.S$9    S=S-\R^                  S)\0S'\0S%\0S!\0S.\S/\14S0 jj5       5       5       5       5       5       r2\3S1:X  a  \ Rh                  " 5       r5\5Rm                  S2\1SS39  \5Rm                  S4\.S5S6/S5S79  \5Ro                  5       r8\9" \\8R^                  5      \8l/        \:" \85      r;\r<\ " \<5      r=\\>S8'   S9 H$  r?\2" \8R                  \=\8R^                  S"SS"\?S"S:9  M&     gg)>    N)contextmanager)partial)
AutoConfig)Llama4ConfigLlama4TextConfig)Llama4TextMoe)KernelConfigBackward_dWKernelConfigBackward_dXKernelConfigForward)Llama4GroupedGemmTextMoeLlama4TritonTextMoe){Gz?r   )MbP?r   )h㈵>r   z meta-llama/Llama-4-Scout-17B-16E*   i   2   c              #   t   #    [        X#-  5        [        U 5        S v   [        U5        [        X#-  5        g 7fN)print)preludeepiloguechar	num_charss       c/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/kernels/moe/tests/test_llama4_moe.pyannotated_contextr   '   s,     	$
	'N		(O	$
s   68c                 F    [         R                  " U 5      nUR                  $ r   )r   from_pretrainedtext_config)model_idconfigs     r   get_text_configr!   0   s    %55h?F    c                    U (       d  [        5       n[        5       n[        5       nO\SSKJnJn  SSKJn  UR                  S [         Ul        UR                  S [         Ul        UR                  S [         Ul        S nS nS nXU4$ )Nr   )!_autotuned_grouped_gemm_dW_kernel!_autotuned_grouped_gemm_dX_kernel)&_autotuned_grouped_gemm_forward_kernel)
r   r	   r
   grouped_gemm.kernels.backwardr$   r%   grouped_gemm.kernels.forwardr&   configsNUM_AUTOTUNE_CONFIGS)autotunekernel_config_fwdkernel_config_bwd_dWkernel_config_bwd_dXr$   r%   r&   s          r   prep_triton_kernel_traitsr/   5   s    /16868	
 	X 3::;P<PQ 	/6 .556K7KL 	*1 .556K7KL 	*1 !##4HHHr"   tc                 B    U R                  SS9R                  S5      n U $ )Nr   )dim)sumview)r0   s    r   sparse_to_denser6   S   s!    	ABAHr"   Ft1t2c                    UR                  U 5      nU R                  U5      R                  5       R                  5       R	                  5       nU(       a  US:X  a  Sn[        U SXt  35        [        R                  " XX#S9(       d   eg )N diffz: )atolrtol)view_assubabsmaxitemr   torchallclose)r7   r8   r<   r=   	precisionverbosemsgr;   s           r   _check_diffrH   X   sp     
BB66":>>!&&(D"9CR[)*+>>";;;r"   ygrad_outputmodulec                     U R                  U5        UR                  5        H  u  p4UR                  b  M   U S35       e   g )Nz missing grad!)backwardnamed_parametersgrad)rI   rJ   rK   nameparams        r   run_backwardsrR   k   s?    JJ{..0zz%>$~'>>% 1r"   m1m2c                     U R                  5        H=  u  px[        UR                  UR                  U5      R                  UUUUU SU S3S9  M?     g )N:z.grad)r<   r=   rE   rF   rG   )rN   rH   rO   get_parameter)	rS   rT   r<   r=   rE   rF   rG   rP   rQ   s	            r   _check_gradsrX   q   sX     **,JJT"''!E4&&	
 -r"   c                  J    [         R                  " [        5      R                  $ r   )r   r   LLAMA4_SCOUT_IDr    r"   r   model_configr\      s    %%o6BBBr"   overlap_router_sharedTc                     U (       a  S$ S$ )Nr]   
no_overlapr[   xs    r   <lambda>rb      s    q+BlBr"   )ids	permute_yc                     U (       a  S$ S$ )Nrd   no_permute_yr[   r`   s    r   rb   rb      s    q0Tn0Tr"   	permute_xc                     U (       a  S$ S$ )Nrg   no_permute_xr[   r`   s    r   rb   rb      s    +*N*Nr"   r+   c                     U (       a  S$ S$ )Nr+   manualr[   r`   s    r   rb   rb      s    a
(EX(Er"   seqlenc                     SU  3$ )Nzseqlen=r[   r`   s    r   rb   rb      s
    wqc]r"   dtyper\   bsc                    [         R                  " [        5        SnUR                  n[        U    u  p[        [        XXS9n[        [        XXS9n[        U5      R                  XS9n[        XeS9R                  XS9nUR                  U5        UR                  U5        [         R                  " XqXUSS9nUR                  5       R                  5       R!                  5       nUR                  5       R                  5       R!                  5       nU" U5      u  nnU" U5      u  nnUR"                  UR"                  :X  d    UR"                   SUR"                   35       e[%        S5         U" UUS	S
9  U" ['        U5      USS
9  S S S 5        [)        U5      u  nnn[+        UUUUUUUUS9R                  XS9nUR                  U5        UR                  U5        U" U5      u  nn[%        S5         U" UUSS
9  U" ['        U5      USS
9  S S S 5        [         R,                  " U5      n[/        UUU5        [/        UUU5        [%        S5         U" UUSS
9  S S S 5        [/        UUU5        [%        S5         U" UUSS
9  S S S 5        g ! , (       d  f       GN= f! , (       d  f       N= f! , (       d  f       N[= f! , (       d  f       g = f)Ncuda)r<   r=   rE   rF   )rn   device)r]   T)rn   rr   requires_gradz != z(Testing torch grouped gemm Llama4TextMoe
y_torch_gg)rG   routing_torch_gg)r]   rg   rd   r+   r,   r-   r.   )rr   rn   z1Testing triton grouped gemm Llama4TextMoe forwardy_tritonrouting_tritonz/Testing torch group gemm Llama4TextMoe backwardtorch_ggz0Testing triton group gemm Llama4TextMoe backwardtriton)rC   manual_seedSEEDhidden_size
TOLERANCESr   rH   rX   r   tor   copy_weightscheck_weightsrandndetachclonerequires_grad_shaper   r6   r/   r   
randn_likerR   ) rn   rl   r+   rg   rd   r]   r\   ro   rr   rE   rF   
hidden_dimr<   r=   
check_diffcheck_grads
llama4_refllama4_gg_refx_ref
x_torch_ggx_tritony_refrouting_refrt   ru   r,   r-   r.   llama4_tritonrv   rw   ref_grads                                    r   test_llama4_refr      s   : 
 F))JE"JDD9J TIK
 |,///OJ -bb(  z*
+KK
JPTE %%'668J||~##%446H#E*E;#0#< J ;;****Ru{{m4
@P@P?Q,RR*	E	F5*L9K(*:BT	
 
G 	"(+ B+-A ( 5-33	 	b&b(  z*
+,X6Hn	N	O5(*5?;/GWX 
P &H%:.*h6	L	MJZ@ 
N (Hm4	M	NJX> 
O	NG 
G	F0 
P	O 
N	M 
O	Ns0   ,J
2J	J-8	J>

J
J*-
J;>
K__main__z--seqlen)typedefaultz--dtypebfloat16float16)r   choicesr   r   )FT)rl   r\   rn   r+   rg   rd   r]   rF   )zPassed!-P   ).6fFr:   )   rq   r   F)Aargparsesys
contextlibr   	functoolsr   pytestrC   transformersr   transformers.models.llama4r   r   *transformers.models.llama4.modeling_llama4r   grouped_gemm.kernels.tuningr	   r
   r   (grouped_gemm.reference.layers.llama4_moer   r   r   r   floatr}   rZ   r{   SEQ_LENSDTYPESr*   r   r!   r/   Tensorr6   no_gradrH   nnModulerR   rX   fixturer\   markparametrizestrrn   boolintr   __name__ArgumentParserparseradd_argument
parse_argsargsgetattrvars	args_dictr   r   __annotations__overlaprl   r[   r"   r   <module>r      sn    
 %    # E D 
 
NNL	MM<	KK
 5	6
..	   
I<u|| 
  
<<< <$?U\\ ? ?ehhoo ? 



* C C 
DM
B  
 %&T   % N   E   83JK&4 Q?;;Q? Q? 	Q?
 Q?  Q? #Q? 	Q? 5 L Q?h z$$&F

3$?
#*i)@J   D

+DJT
IH$3H$=K!= [[&JJ$+		
 ! r"   