
    h6                         S SK r S SKrS SKrS SKJr  S SKJr  S SKJrJ	r	J
r
Jr  \ R                  S 5       rS rS rS r SS	 jrS
 rg)    N)cdiv)driver)get_dram_gbpsget_max_simd_tflopsget_max_tensorcore_tflopsnvsmic                       [        S/5      S   S-  $ ! [         aF    SS Kn U R                  5         U R	                  S5      nU R                  XR                  5      S-  s $ f = f)Nzclocks.max.smr   g     @@)r   FileNotFoundErrorpynvmlnvmlInitnvmlDeviceGetHandleByIndexnvmlDeviceGetMaxClockInfoNVML_CLOCK_SM)r   handles     _/home/james-whalen/.local/lib/python3.13/site-packages/bitsandbytes/triton/matmul_perf_model.pyget_clock_rate_in_khzr      sk    To&'*S00 T2215//8L8LMPSSSTs    AA$#A$c                     U[        US5      -  n[        R                  R                  R	                  U 5      S   S-  n[        XT5      U-  [        U[        5       U 5      -  nU$ z!return compute throughput in TOPS   multiprocessor_count)minr   activeutilsget_device_propertiesr   r   devicenum_ctas	num_warpsdtypetotal_warpsnum_subcorestflopss          r   get_tensorcore_tflopsr#      sk    SA..K==&&<<VDE[\_``LL&
	
#E+@+BF
K	L 
 M    c                     U[        US5      -  n[        R                  R                  R	                  U 5      S   S-  n[        XT5      U-  [        U[        5       U 5      -  nU$ r   )r   r   r   r   r   r   r   r   s          r   get_simd_tflopsr&   +   sf    SA..K==&&<<VDE[\_``LL&58KEShSjlr8ss  Mr$   c                     [         R                  R                  U 5      nUS   S:  a   U[         R                  :X  a  [	        XX#5      $ [        XX#5      $ )Nr      )torchcudaget_device_capabilityfloat32r&   r#   )r   r   r   r   
capabilitys        r   
get_tflopsr.   5   sG    11&9J!}qUemm3vBB 9DDr$   c                    [         R                  R                  5       nUR                  nUR	                  5       n[        XX5      n[        Xi5      nUnUU-  U-  n[        XX5      [        Xi5      peSU-  U-  U-  S-  n[        UUX5      nUU-  n[        R                  R                  R                  U5      S   n[        SUU-  5      n[        SUS-  5      n[        [        SUS-
  S-  5      S5      n[        U5      US-  US	-  -   -  nUS
-  nXW-  U-  SSUS-
  -  -   -  nXW-  U-  S-  US-
  -  nXg-  U-  SSUS-
  -  -   -  n Xg-  U-  S-  US-
  -  n!UU -   S-  n"UU!-   S-  n#U"U-  U#U-  -   n$US-  n%XV-  U-  U-  S-  n&US:X  a  U&U%-  n'OU%n(U&U(-  n'XV-  S-  S-  U%-  n)U'U)-  n'[        UU$5      U'-   n*U(       a  [        SU* SU SU$ SU' SUS-   S35        U*$ )zCreturn estimated running time in ms
= max(compute, loading) + store   i   @r          L   r   gffffff?g?r   g?g?i   g333333?zTotal time: zms, compute time: zms, loading time: zms, store time: zms, Activate CTAs: d   %)r)   r*   current_devicer   element_sizer   maxr.   r   r   r   r   r   r   print)+r   
num_stagesABCMNKBLOCK_MBLOCK_NBLOCK_KSPLIT_Kdebugkwargsr   r   dtsize	num_cta_m	num_cta_n	num_cta_kr   	total_opstput
compute_msnum_smactive_cta_ratioactive_cta_ratio_bw1active_cta_ratio_bw2dram_bwl2_bwload_a_dram	load_a_l2load_b_dram	load_b_l2
total_dramtotal_l2load_msstore_bwstore_c_dramstore_ms	reduce_bwzero_mstotal_time_mss+                                              r   estimate_matmul_timera   <   s|   & ZZ&&(FGGE^^FQ IQ II9$y0H q?COq A	A!34Ifh	9DT!J ]]  66v>?UVF1h/0q(R-0s1x"}&BCQGF#';d'BEY\`E`'`aGaKE%&.Ay1}(=$=>K$	A6I%&.Ay1}(=$=>K$	A6I+<JI%+6H7"X%55G }H56>G+{;L!|(*	)+%!){+h6G
G,x7M=/);J< H$I%5hZ @.45Q8	

 r$   c                   ^ [         R                  R                  5       n[         R                  R                  5       nUS   R	                  5       nUS   R
                  n/ nU  H|  nUR                  n	U	S   U	S   U	S   UR                  4u  pp[        R                  R                  R                  U5      S   nX-   U-  U-  U-  nX::  d  Mk  UR                  U5        M~     Un U[         R                  [         R                  4;  a&  U  Vs/ s H  oR                  S   S:X  d  M  UPM     n n0 nU  Hg  nUR                  n	U	S   U	S   U	S   U	S   UR                  UR                  4u  pnnnnXUUU4nUU;   a  UU   R                  X45        M`  X4/UU'   Mi     / nUR!                  5        H  u  nnUu  pnnnUS   S	:  a\  X-  U-  S
-  nU[#        SU5      -  S	-  nSnUU-  m[$        R&                  " SUU4S jS9nU H  nUR                  US   5        M     Mr  US   S   nSUl        UR                  U5        M     U$ s  snf )Nr;   rA   rB   rC   max_shared_memrD   r1   r   r(   i   r   i,  r0   c                 T   > U S   T-
  S:  a  S[        U S   T-
  5      -   $ U S   T-
  $ )Nr1   r   
   )abs)xoptimal_num_stagess    r   <lambda>$early_config_prune.<locals>.<lambda>   sE    !11Q6 QqT$6677 31 223r$   )key)r)   r*   r6   r+   r7   r   rF   r:   r   r   r   r   appendfloat16r,   r   itemsr   heapq	nsmallest)configs
named_argsrF   r   r-   rG   r   pruned_configsconfigkwrA   rB   rC   r:   max_shared_memoryrequired_shared_memoryconfigs_maprD   r   rk   kvmmas
mma_cyclesldgsts_latencynearestnrandom_configrh   s                               @r   early_config_pruner      s   ZZ&&(F113J_))+FsO!!E N]]yMyMyM	1
-' #MM//EEfMN^_")"3w!>!Kf!T!6!!&)  G U]]EMM22(/Qf==3Kq3P6Q K]]yMyMyMyME
A'7Iz '9=+##V$89!' 45K " N!!#1895'7Ia=A$w.+>DAy 11A5J N!/*!< ooG %%ad+  aDGM'(M$!!-05 $6 c Rs   I	(I	)F)	functoolsro   r)   tritonr   triton.runtimer   triton.testingr   r   r   r   	lru_cacher   r#   r&   r.   ra   r    r$   r   <module>r      s^        !  T T	E* IXKr$   