
    oi7                     &   S SK r S SKrS SKJr  S SKrS SKJr  S SKJr  S SK	J
r
  S SKJr  S SKJr  S SKJr  S S	KJrJrJrJrJr  S S
KJrJrJrJrJr  S SKJrJrJ r J!r!J"r"  S SK#J$r$  S SK%J&r&  Sr'Sr(Sr)  STS\RT                  RV                  S\RT                  RV                  S\S\,S\RZ                  S\.S\ S\,4S jjr/ SUS\RT                  RV                  S\RT                  RV                  S\S\,S\RZ                  4
S jjr0    SVS\\-  4S jjr1     SWS\2S\\-  S\,S\RZ                  S\.S\.S\.S\ S \S!\S"\.S#\24S$ jjr3\4S%:X  Ga  \ Rj                  " 5       r6\6Ro                  S&\2S'S(9  \6Ro                  S)\2S*S+/S,S-9  \6Ro                  S.\,S/S(9  \6Ro                  S0\2S1S2/S1S39  \6Ro                  S4S5S69  \6Ro                  S7S5S69  \6Ro                  S8S5S69  \6Ro                  S9S5S69  \6Ro                  S:S;\,\S    \S<   /S=9  \6Ro                  S>S;\,\S    \S<   /S=9  \6Ro                  S?S;\,\S    \S<   /S=9  \6Ro                  S@S;\,\S    \S<   /S=9  \6Ro                  SAS;\,\S    \S<   /S=9  \6Ro                  SBS5S69  \6Ro                  SCS5S69  \6Ro                  SDS5S69  \6Ro                  SE\2/ SFQSGS39  \6Rq                  5       r9\:" \\9RZ                  5      \9l-        \9Rv                  S+:X  a  \)O\(r<\Rz                  " \<5      r>\9Rv                  S*:X  a  \>R~                  O\>r>\9R                  r@\9R                  (       a  \B" SH\< SI\@ SJ\9R                   SK\9RZ                   SL\9R                   SM\9R                   SN35        \R                  " 5       rF\3" \9R                  \>\9R                  \9RZ                  \9R                  \9R                  \9R                  \9R                  \9R                  SO9	u  rIrJ\R                  " 5       rK\B" SP\K\F-
  SQ SR35        g SS5       eg)X    N)nullcontext)
AutoConfig)Llama4TextConfig)Llama4TextMoe)Qwen3MoeConfig)Qwen3MoeSparseMoeBlock)do_bench)create_kernel_configsget_autotunerpost_process_resultspostprocess_autotune_resultssave_results)DEFAULT_K_BLOCK_SIZESDEFAULT_M_BLOCK_SIZESDEFAULT_N_BLOCK_SIZESDEFAULT_NUM_STAGESDEFAULT_NUM_WARPS)KernelConfigBackward_dWKernelConfigBackward_dXKernelConfigForwardKernelResultTritonTuningContext)Llama4TritonTextMoe)Qwen3MoeFusedGroupedGEMMBlock*   z meta-llama/Llama-4-Scout-17B-16EzQwen/Qwen3-30B-A3B	ref_modeltt_modelconfigseqlendtypeautotunekernel_config_fwdbsc           	        ^ ^^ [         R                  " [        5        SnUR                  n	[         R                  " XsXUSS9mUU 4S jn
UU4S jn[        U
5      nU(       d  Uc   e[        U5      nO
[        5       nU   [        U5      nS S S 5        U(       d  UR                  (       d  g[        SUS S	WS S
X-  S S35        X4$ ! , (       d  f       ND= f)NcudaTr    devicerequires_gradc                     > T" T 5      $ N )Xr   s   k/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/kernels/moe/benchmark/benchmark_fused_moe.py<lambda>'run_benchmark_forward.<locals>.<lambda>?   s	    	!    c                     > T" T 5      $ r*   r+   )r,   r   s   r-   r.   r/   @   s	    (1+r0   )r      zForward: ref .4f, fused 
, speedup .1fx)
torchmanual_seedSEEDhidden_sizerandnr	   r   r   successprint)r   r   r   r   r    r!   r"   r#   r'   r;   bench_forward_refbench_forward_fusedref_forward_timetuning_contextfused_forward_timer,   s   ``             @r-   run_benchmark_forwardrD   *   s     
 F$$K
KQU	A
 -- 12 ,,,,->?$	%&9: 
 ~55	
(-X6H5MZXhX}  B  XC  CD  	E // 
s    C


Cc           	      *  ^^^ [         R                  " [        5        SnUR                  n[         R                  " XSXtUSS9nUR                  5       R                  5       R                  S5      n	U " U5      u  mn
SSKJ	n  UR                  S S Ul
        U" U	5      u  mn
[         R                  " T5      mUU4S jnUU4S jn[        X/U R                  5       QS	9n[        X/UR                  5       QS	9n[        S
US SUS SX-  S S35        X4$ )Nr%   Tr&   r   )&_autotuned_grouped_gemm_forward_kernel   c                  $   > TR                  T SS9$ NT)retain_graphbackward)grad_outputoutputs   r-   r.   (run_benchmark_backward.<locals>.<lambda>u   s    T!Rr0   c                  $   > TR                  T SS9$ rI   rK   )rM   test_outputs   r-   r.   rO   v   s    ;#7#7TX#7#Yr0   )grad_to_nonezBackward: ref r3   r4   r5   r6   r7   )r8   r9   r:   r;   r<   detachclonerequires_grad_grouped_gemm.kernels.forwardrF   configs
randn_liker	   
parametersr>   )r   r   r   r   r    r#   r'   r;   r,   X_test_rF   bench_backward_refbench_backward_fusedref_backward_timefused_backward_timerM   rN   rQ   s                   @@@r-   run_benchmark_backwardr`   V   sN    
 F$$K
KQU	A XXZ..t4F!IFA T 	/66s; +2 f%NK ""6*KRY +G	0D0D0F+G #-Mx7J7J7L-M 

*3/x8KC7PPZ[l  \C  DG  [H  HI  	J 11r0   c                    [        U [        5      (       aH  [        U 5      R                  X5      n[        R
                  " UUUUUUUUU	S9	R                  X5      nX4$ [        U [        5      (       a>  [        U 5      R                  X5      n[        U U
UUUUUUUU	S9
R                  X5      nX4$ [        S[        U 5      R                   35      e)N)	permute_x	permute_yr!   r"   kernel_config_bwd_dWkernel_config_bwd_dXdX_onlydW_only)	overlap_router_sharedrb   rc   r!   r"   rd   re   rf   rg   zUnrecognized config )
isinstancer   r   tor   from_hfr   r   r   
ValueErrortype__name__)r   r    rb   rc   r!   r"   rd   re   rf   rg   rh   r'   r   r   s                 r-   setup_modelro      s     &.))*6255fD	 188!! 1#7#7

 "V
 	< % 
F,	-	-!&),,V;	&$9!! 1#7#7
 "V
 	   /V0E0E/FGHHr0   modemodel_configrb   rc   rd   re   rh   results_dirc                 Z   U(       a  [        U 5      nU S:X  a  SnOU S:X  a  SnOS=p[        UUUUUUUU	WWU
S9u  nnU S:X  a  [        UUUUUUUS9u  nnO[        UUXUS9u  nnU(       a<  U S	:X  a'  Wu  nn[	        USUUU5        [	        USUUU5        UU4$ [	        WU UUU5        UU4$ )
NdWTdXF)
r    rb   rc   r!   r"   rd   re   rf   rg   rh   forward)r   r   r    r!   r"   )r   r   r    rL   )r   ro   rD   r`   r   )rp   rq   r   r    rb   rc   r!   r"   rd   re   rh   rr   	autotunerrg   rf   r   r   ref_time
fused_timeautotuner_dWautotuner_dXs                        r-   run_benchmarkr|      s    !$'	t|	!!%-33 5Ix y4! 1 
*  6x,QV 
* :)2&L,(dHj+ )dHj+ Z	 )4:{ Zr0   __main__z--results_dirbenchmark_results)rm   defaultz--modelllama4qwen3T)rm   choicesrequiredz--seqleni   z--dtypebfloat16float16)rm   r   r   z--permute_x
store_true)actionz--permute_yz
--autotunez--overlap_router_sharedz--BLOCK_SIZE_M   )nargsrm   r   z--BLOCK_SIZE_Nz--BLOCK_SIZE_Kz--num_warpsz--num_stagesz--use_tma_load_wz--use_tma_load_xz--use_tma_load_dyz--mode)rv   rL   rt   ru   rv   zBenchmarking  z	: seqlen=z, dtype=z, permute_x=z, permute_y=z
, autotune)r   r    rb   rc   r!   rh   rr   zTotal time: r3   z secondszUse autotune for now)Nr2   )r2   )FFFr%   )NNNFN)Yargparsetime
contextlibr   r8   transformersr   transformers.models.llama4r   *transformers.models.llama4.modeling_llama4r   transformers.models.qwen3_moer   0transformers.models.qwen3_moe.modeling_qwen3_moer   triton.testingr	   utilsr
   r   r   r   r   grouped_gemm.kernels.autotuningr   r   r   r   r   grouped_gemm.kernels.tuningr   r   r   r   r   (grouped_gemm.reference.layers.llama4_moer   'grouped_gemm.reference.layers.qwen3_moer   r:   	LLAMA4_IDQWEN3_MODEL_IDnnModuleintr    boolrD   r`   ro   strr|   rn   ArgumentParserparseradd_argument
parse_argsargsgetattrmodelmodel_idfrom_pretrainedrq   text_configrp   r!   r>   r   rb   rc   
start_timerh   rr   rx   ry   end_timekernel_configslendefault_kernel_config_fwddefault_kernel_config_bwd_dWdefault_kernel_config_bwd_dXresultskernel_configr"   rd   re   rl   appenddfr+   r0   r-   <module>r      sn     "  # 7 D 8 S #    I Q	.	% .2)0xx)0hhoo)0 )0 	)0
 ;;)0 )0 +)0 	)0d 
+2xx+2hhoo+2 +2 	+2
 ;;+2n !0--0v .24848"'B 
B  #33B  B  ;;	B 
 B  B  B  +B  2B  2B   B  B J z$$&F
?RS
	#(G9LY]^

3$?
#*i)@J   =
=
|<
1LI
(+-B2-FG	   (+-B2-FG	   (+-B2-FG	   $Q'):2)>?	   %a(*<R*@A	   \   \   l   5	   D

+DJ!%w!6~IH--h7L/3zzX/E<++<L99D}}H:QtfIdkk](4::,Vbcgcqcqbrr~  @D  @N  @N  O  OY  Z	
 YY[
,II[[JJ}}$($>$>**
 
* 99;X
237x@A 	-,,uu r0   