
    oiu                        S SK r S SKrS SKrS SKrS SKrS SKrS SKJr  S SKr	S SK
r
S SKJrJrJrJr  SrS\	R"                  S\S\S\
R(                  S	\4
S
 jrS\\   S\S\S\
R(                  S	\4
S jrS\	R"                  S\S\S\S\
R(                  S	\4S jrS\ R4                  S\S\4S jrS rSS jrS rS rS r S r!g)    N)product)KernelConfigBackward_dWKernelConfigBackward_dXKernelConfigForwardKernelResult*   dfmodeseqlendtypeautotunec                     U R                   R                  5       nUUUUS.n[        UR                  5       5      nU H	  nXh   X'   M     XU-      n U $ )N)r
   r   r   r   )columnsto_listlistkeys)	r	   r
   r   r   r   kernel_result_colstest_config_dicttest_config_colscols	            ]/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/kernels/moe/benchmark/utils.pycreate_merged_resultsr      sk     ++-	 ,1134"'   
11	2BI    resultsc                 J    [         R                  " U SS9n[        XQX#U5      nU$ )Nspeedup)sort_by)r   to_dataframer   )r   r
   r   r   r   r	   s         r   post_process_resultsr   (   s)     
	"	"7i	@B	r	ABIr   results_dirc           	      n   [         R                   R                  5       R                  S5      nU SU 3nU SU SU S[        U5      R	                  S5      S    S3n[
        R                  R                  U5      (       d  [
        R                  " U5        [        SU 35        U R                  USS	9  g )
N%Y%m%d_%H%M/_.z.csvzSaving results to F)index)datetimenowstrftimestrsplitospathexistsmakedirsprintto_csv)	r	   r    r
   r   r   r   dtsave_dir	save_paths	            r   save_resultsr6   4   s     
					 	)	)-	8Bav&H*AbT6(!CJ,<,<S,A",E+FdKI77>>(##
H	yk
*+IIiI'r   args	permute_x	permute_yc                 l   [        U R                  S   U R                  S   5      n[        U R                  S   U R                  S   5      n[        U R                  S   U R                  S   5      n[	        U R
                  S   U R
                  S   SS9n[	        U R                  S   U R                  S   SS9nU R                  n/ n	[        UUUUUSS/SS/5       Hr  u  n
nnnnnnUS:X  a  [        U
UUUUUUUUS9	nO>US	:X  a  [        U
UUUUUUUUS
9	nO&US:X  a  [        U
UUUUUUUUS9	nO[        SU 35      eU	R                  U5        Mt     [        R                  " S[!        U	5       S35        / nU	 H  nUS:X  a  U(       a  UR"                  (       a  M#  O[US	:X  a5  U(       a  UR"                  (       a  MD  U(       a  UR$                  (       a  M^  O US:X  a  U(       a  UR$                  (       a  M  UR                  U5        M     [        R                  " S[!        U5       S35        U$ )Nr         )stepTFforward)	BLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_K	num_warps
num_stagesuse_tma_load_wuse_tma_load_xr8   r9   dW)	r?   r@   rA   rB   rC   use_tma_load_dyrE   r8   r9   dX)	r?   r@   rA   rB   rC   rG   rD   r8   r9   Invalid mode: zPruning z kernel configszAfter pruning, )power_of_two_ranger?   r@   rA   multiples_of_rangerB   rC   r
   r   r   r   r   
ValueErrorappendlogginginfolenrE   rG   )r7   r8   r9   block_m_rangeblock_n_rangeblock_k_rangenum_warps_rangenum_stages_ranger
   kernel_configsblock_mblock_nblock_krB   rC   
tma_load_a
tma_load_bkernel_configpruned_configsconfigs                       r   create_kernel_configsr_   E   s_   &t'8'8';T=N=Nq=QRM&t'8'8';T=N=Nq=QRM&t'8'8';T=N=Nq=QRM():DNN1<MVWXO)DOOA.q 99DN 
	u	u
	 9/&&&%'!+!+%%
M T\3&&&%'",!+%%
M T\3&&&%'",!+%%
M ~dV455m,_
b LL8C/0@AN 9V22T\V22V33T\V33f% ! LL?3~#6"7GHr   c                     [         R                  " U 5      n [         R                  " U5      n[        [        U 5      [        U5      S-   5       Vs/ s H  nSU-  PM
     sn$ s  snf )Nr;   r<   )mathlog2rangeint)startendis      r   rJ   rJ      sN    IIeE
))C.CE
CHqL9:9QAqD9:::s   A"c                 2    [        [        XU-   U5      5      $ N)r   rc   )re   rf   r=   s      r   rK   rK      s    e4Z.//r   c                     g ri    )keyr
   s     r   map_key_to_argsrm      s    r   c           	         [         R                  R                  5       R                  SS5      n[        R                  R                  5       R                  S5      nU SU SU SU 3n[        R                  R                  U5      (       d  [        R                  " U5        U R                  5        H  u  pU V
s/ s H:  n
S[        U
5      ;  a  [        U
5      O[        U
R                  S5      S   5      PM<     nn
SR                  U5      nU SU S	3n[        S
U 35        [!        US5       n0 U	R#                  5       EUUS.En[$        R&                  " X5        S S S 5        M     g s  sn
f ! , (       d  f       M  = f)N r$   r"   r#   z
/autotune/torchztorch.r&   z.jsonzSaving autotune results to w)ref_time
fused_time)rp   cudaget_device_namereplacer(   r)   r*   r-   r.   r/   r0   itemsr+   r,   joinr1   open
all_kwargsjsondump)autotune_cacher
   rr   rs   r    device_namer3   r4   rl   r^   kfilenamer5   fresults                  r   save_autotune_resultsr      sY   **,,.66sC@K						 	)	)-	8BavZt1[MBH77>>(##
H%++-UX
UXPQ'SV+CFQWWX5Fr5J1KKUX 	 
 88C=j(51	+I;78)S!Q##%$(F
 IIf  "! .
 "!s   <AE-2.E22
F	c                     U S:X  a  SSK Jn  U$ U S:X  a  SSKJn  U$ U S:X  a  SSKJn  U$ U S:X  a  SS	KJnJn  X#4$ [        S
U  35      e)Nr>   r   )&_autotuned_grouped_gemm_forward_kernelrF   )!_autotuned_grouped_gemm_dW_kernelrH   )!_autotuned_grouped_gemm_dX_kernelbackward)r   r   rI   )grouped_gemm.kernels.forwardr   grouped_gemm.kernels.backwardr   r   rL   )r
   r   r   r   s       r   get_autotunerr      s]    yW55	S00	S00			

 1SS>$011r   c           	          U R                   R                  5        H&  u  pV[        U SU SUR                  5        35        M(     [	        U R                   UUUUS9  g )Nro   z: )r
   rr   rs   r    )cacherw   r1   rz   r   )	autotunerr
   rr   rs   r    rl   values          r   postprocess_autotune_resultsr      sY    oo++-
auBu//1234 .!r   )r;   )"argparser(   r{   rN   ra   r-   	itertoolsr   pandaspdrp   grouped_gemm.kernels.tuningr   r   r   r   SEED	DataFramer+   rd   r   boolr   r   r   r6   	Namespacer_   rJ   rK   rm   r   r   r   rk   r   r   <module>r      s+        	     

),5:[[LP$	,	
	 	 ;;		
 	(
(( ( 	(
 ;;( ("V 2 2 Vt VPT Vr;0	!.20	r   