
    h/;                        % S SK r S SKrS SKrS SKJr  S SKrS SKrS SKJrJr  S SK	J
r
Jr  S SKJrJrJrJr  \R"                  " SS5      S:g  r\\S'   \ R*                  " 5       S	 5       rS
 rS rS rSSS.S\S\S-  4S jjrS\4S jrS rS\\4S jrS r S r!S\S\S\
RD                  4   4S jr#S\4S jr$S\S\
RJ                  \
RD                  -  4   4S jr&S\4S jr'S r(S\4S  jr)S\S\
RJ                  \
RD                  -  4   4S! jr*S\4S" jr+S\,\   4S# jr-S\S\
RJ                  \
RD                  -  4   4S$ jr.g)%    N)Callable)Configcdiv)	autotunerdriver)get_dram_gbpsget_max_simd_tflopsget_max_tensorcore_tflopsnvsmiCCE_AUTOTUNE0	_AUTOTUNEc                       [        S/5      S   S-  $ ! [         aF    SS Kn U R                  5         U R	                  S5      nU R                  XR                  5      S-  s $ f = f)Nzclocks.max.smr   g     @@)r   FileNotFoundErrorpynvmlnvmlInitnvmlDeviceGetHandleByIndexnvmlDeviceGetMaxClockInfoNVML_CLOCK_SM)r   handles     W/home/james-whalen/.local/lib/python3.13/site-packages/cut_cross_entropy/tl_autotune.pyget_clock_rate_in_khzr      sk    To&'*S00 T2215//8L8LMPSSSTs    AA$#A$c                     U[        US5      -  n[        R                  R                  R	                  U 5      S   S-  n[        XT5      U-  [        U[        5       U 5      -  nU$ z!return compute throughput in TOPS   multiprocessor_count)minr   activeutilsget_device_propertiesr
   r   devicenum_ctas	num_warpsdtypetotal_warpsnum_subcorestflopss          r   get_tensorcore_tflopsr)   !   sp    SA..K11&9:PQTUU  	L&
	
#E+@+BF
K	L 
 M    c                     U[        US5      -  n[        R                  R                  R	                  U 5      S   S-  n[        XT5      U-  [        U[        5       U 5      -  nU$ r   )r   r   r   r   r    r	   r   r!   s          r   get_simd_tflopsr,   /   sp    SA..K11&9:PQTUU  	L&
	
e%:%<f
E	F 
 Mr*   c                     [         R                  R                  U 5      nUS   S:  a   U[         R                  :X  a  [	        XX#5      $ [        XX#5      $ )Nr      )torchcudaget_device_capabilityfloat32r,   r)   )r"   r#   r$   r%   
capabilitys        r   
get_tflopsr4   =   sG    11&9J!}qUemm3vBB 9DDr*   g      ?)shared_memory_factormax_num_warpsr5   r6   c                0  ^ [         R                  R                  5       n[         R                  R                  5       nUS   R	                  5       nUb#  U  Vs/ s H  oR
                  U::  d  M  UPM     n n/ n	U  H  nUR                  n
U
S   U
S   U
S   UR                  4u  pp[        R                  R                  R                  U5      S   nX+U-   -  U-  U-  U-  nUU:  a  Mo  U	R                  U5        M     U	n 0 nU  Ha  nUR                  n
U
S   U
S   U
S   UR
                  UR                  4u  pnnnXUU4nUU;   a  UU   R                  X45        MZ  X4/UU'   Mc     / n	UR                  5        H  u  nnUu  pnnUS   S:  a\  X-  U-  S-  nU[        S	U5      -  S-  nS
nUU-  m[        R                   " SUU4S jS9nU H  nU	R                  US   5        M     Mq  US   S   nSUl        U	R                  U5        M     U	$ s  snf )NEBLOCK_BBLOCK_VBLOCK_Dmax_shared_memr   r.   i   r   i,     c                 T   > U S   T-
  S:  a  S[        U S   T-
  5      -   $ U S   T-
  $ )N   r   
   )abs)xoptimal_num_stagess    r   <lambda>$early_config_prune.<locals>.<lambda>   sE    aD--2 !3qt.@'@#AA /qT../r*   )key)r/   r0   current_devicer1   element_sizer$   kwargs
num_stagesr   r   r   r    appenditemsr   heapq	nsmallest)configs
named_argsr5   r6   rI   r"   r3   dtsizeconfigpruned_configskwr9   r:   r;   rJ   max_shared_memoryrequired_shared_memoryconfigs_mapr$   rF   kvmmas
mma_cyclesldgsts_latencynearestnrandom_configrC   s                               @r   early_config_pruner`   D   sr    ZZ&&(F113J_))+F (/Uf3C3C}3T6U N]]yMyMyM	1
-' #MM//EEfMN^_ g$56@:MPVV 	 "$55f%! $ G K]]yMyMyM<
8'9j )4+##V$89!' 45K   N!!#1/0,'9a=A$w.+>DAy 11A5J N!/*!< oo/G %%ad+  aDGM'(M$!!-01 $2 I Vs   H/Hreturnc                 *    SU -  U-  U-  SU -  U-  -   $ )Nr=   r@    BVDs      r   _total_ops_fnrh      s!    q519q=26A:%%r*   c                 
    X-  $ Nrc   re   rf   rg   rQ   	num_cta_b	num_cta_vs         r   _total_store_fnrn      s
    :r*   Fc                    [         R                  R                  5       nUR                  nUR	                  5       n[        X65      n[        XG5      nUU-  n[        X65      [        XG5      pCU
" X4U5      nUS-  n[        UUX5      nUU-  n[        R                  R                  R                  U5      S   n[        SUU-  5      n[        SUS-  5      n[        [        SUS-
  S-  5      S5      n[        U5      US-  US-  -   -  nUS	-  nX5-  U-  SS
US-
  -  -   -  nX5-  U-  S-  US-
  -  nXE-  U-  SS
US-
  -  -   -  nXE-  U-  S-  US-
  -  nUU-   S-  n UU-   S-  n!U U-  U!U-  -   n"US-  n#U" X4X_UU5      S-  n$U$U#-  n%[        UU"5      U%-   n&U	(       a2  [        SU< SU< SU< SU < SU< SU& SU SU" SU% SUS-   S35        U&$ )zCreturn estimated running time in ms
= max(compute, loading) + storei   @r   r?       L   r   gffffff?g?r   皙?g?i   g?zBLOCK_B=z
, BLOCK_V=z
, BLOCK_D=z, num_warps=z, num_stages=z, Total time: zms, compute time: zms, loading time: zms, store time: zms, Activate CTAs: d   %)r/   r0   rG   r%   rH   r   maxr4   r   r   r   r    r   r   print)'r$   rJ   r8   re   rf   rg   r9   r:   r;   debugtotal_ops_fntotal_store_fnrI   r"   r%   rQ   rl   rm   r#   	total_opstput
compute_msnum_smactive_cta_ratioactive_cta_ratio_bw1active_cta_ratio_bw2dram_bwl2_bwload_a_dram	load_a_l2load_b_dram	load_b_l2
total_dramtotal_l2load_msstore_bw
store_dramstore_mstotal_time_mss'                                          r   estimate_matmul_timer      sc   $ ZZ&&(FGGE^^FQ IQ I9$H q?COq Q1%I/0Ifh	9DT!J ]]  66v>?UVF1h/0q(R-0s1x"}&BCQGF#t#&:T&AAG aKE%&.Ay1}(=$=>K$	A6I%&.Ay1}(=$=>K$	A6I+<JI%+6H7"X%55G }HaIF+VJH$H
G,x7Mwj7*Kwj9,n N(/);J< H$I%5hZ @.s2316	
 r*   c                      / n S HF  nS H=  nS H4  nS H+  nUS::  a  SOSnU R                  [        UUUS.UUS	95        M-     M6     M?     MH     U $ )
N)r=      r         )   rp   )rp   @   )rp   r         r   r=   r   r9   r:   r;   rJ   r$   )rK   r   )rO   rJ   block_mblock_kblock_nr$   s         r   get_configs_io_boundr      sm    G%
G#1G%,]INN+2+2+2
 (2&/
  2 $   &  Nr*   c                     [        SSSS.SSS9[        SSSS.SS	S9[        SSSS.SS	S9[        SS
SS.SSS9[        S
SSS.SSS9[        SSSS.SSS9[        SSSS.SS	S9[        SSSS.SS	S9[        SS
SS.SSS9[        S
SSS.SSS9[        SSSS.SSS9[        S
SSS.SSS9[        SSSS.SS	S9[        SSSS.SSS9[        SSSS.SS	S9[        SSSS.SSS9[        SS
SS.SSS9[        S
SSS.SSS9[        SSSS.SSS9[        SS
S
S.SSS9[        S
SS
S.SSS9[        SSS
S.SSS9[        S
SS
S.SSS9/[        5       -   $ )Nr   r   r=   r   r   r   rp   r   r.   r   r   r   )r   r   rc   r*   r   get_autotune_configr      sk    	<	

 	;	

 	;	

 	r:	

 	sr:	

 	;	

 	;	

 	;	

 	r:	

 	sr:	

 	r:	

 	2"<VWX<	

 	<	

 	<	

 	<	

 	s;	

 	ss;	

 	<	

 	r:	

 	sr:	

 	r:	

 	2"<VWX[n\ 	]n nr*   rR   .c           	          [         R                  " U R                  5       R                  5        VVs0 s H  u  pX4S j_M     snn5      $ s  snnf )Nc                     U$ rj   rc   )args_vs     r   rD   )_heuristics_from_config.<locals>.<lambda>p  s    Rr*   )triton
heuristics
all_kwargsrL   )rR   rX   rY   s      r   _heuristics_from_configr   o  sA    VEVEVEXE^E^E`aE`TQa"77E`abbas   A
c                  *    [        [        SSSS9SSS9$ )Nr   r   rp   r   r.   r   r$   rJ   r   dictrc   r*   r   _cce_forward_best_configr   s      $sC<VWXXr*   c                      [         (       a.  [        R                  " [        5       / SQ[        [
        SS.S/S9$ [        [        5       5      $ )Nrf   rg   B_BINr@   r`   
perf_modeltop_kLSE)rO   rF   prune_configs_byrestore_value)r   r   autotuner   r`   r   r   r   rc   r*   r   cce_forward_autotuner   w  sG    y')#&82
 !'	
 		
 ''?'ABBr*   c                 `    SU -  U-  U-  SU -  U-  -   SSU -  U-  U-  SU -  U-  U-  -   -  -   $ )Nr=   r   rr   rc   rd   s      r   _bw_total_ops_fnr     sH    q519q=1q519$sa!eai!ma!eai!m.K'LLLr*   c                 2    SXP-  U-  U-  XB-  U-  U-  -   -  $ )Nrr   rc   rk   s         r   _bw_total_store_fnr     s*    )-!#f,y}q/@6/IIJJr*   c                  *    [        [        SSSS9SSS9$ )Nr   rp   r   r   r   r   rc   r*   r   _cce_backward_best_configr     r   r*   c                      [         (       a^  [        R                  " [        5       / SQ[        R
                  " [        SS9[        R
                  " [        [        [        S9SS.SS/S	9$ [        [        5       5      $ )
Nr   g       @)r5   )rx   ry   r   r   dEdC)rO   rF   r   reset_to_zero)r   r   r   r   	functoolspartialr`   r   r   r   r   r   rc   r*   r   cce_backward_autotuner     sq    y')#&/&7&7&S' (//(!1#5
 
  ,
 	
" ''@'BCCr*   c                  (    [        [        SSS9SSS9$ )Nr   r   r9   r;   r   r   r   r   rc   r*   r   _indexed_dot_best_configr     s    $sC0B1MMr*   c                      [        [        SSS9SSS9[        [        SSS9SSS9[        [        SSS9SSS9[        [        SSS9SSS9[        [        SSS9SSS9/$ )Nr   r   r   r   r.   r   r   r   rc   r*   r   _indexed_dot_all_configsr     s     	
 	 	
 	 	
 	 	
 	 	
C) )r*   c                      [         (       a!  [        R                  " [        5       SS/S/S9$ [	        [        5       5      $ )Nrg   r   Out)rO   rF   r   )r   r   r   r   r   r   rc   r*   r   indexed_dot_autotuner     s;    y,.g '
 	
 ''?'ABBr*   )/r   rM   ostypingr   r/   r   r   r   triton.runtimer   r   triton.testingr   r	   r
   r   getenvr   bool__annotations__	lru_cacher   r)   r,   r4   floatintr`   rh   rn   r   r   r   
Heuristicsr   r   	Autotunerr   r   r   r   r   r   listr   r   rc   r*   r   <module>r      s     	     ,  ))NC0C7	4 7 T TE #& $R  	R
 :Rj&e & "DN*odcF cxY=Q=Q8Q/R cY& YChsI,?,?)BVBV,V'VW C M MKY6 YDxY-@-@9CWCW-W(WX D,N& N*$v, *ZChsI,?,?)BVBV,V'VW Cr*   