
    h//                        S SK r S SKrS SKrS SKJr  S SKJr  \" \ R                  " / SQSS9 V VVs/ s HS  u  pnXUSS4XUSS4XUSS4XUSS4XUSS4XUSS4XUSS4XUSS	4XUSS	4XUSS	4XUSS	4XUS
S	4XUSS	4XUSS	4XUS	S	4/PMU     snnn / 5      r
\R                  R                  R                  S:X  aF  \ R                  " / SQSS9 VVVVVs/ s H  u  p4nS  H  nS  H  nX4XVU4PM
     M     M!     snnnnnr
\
 V VVVV	s/ s H  u  pp(n	\R                  XUS	S.UU	S9PM     sn	nnnn r
\R                  S\R                   S\R                   S\R                   S\R                   4S j5       r\R                  \R$                  4S\R                   S\R                   S\R                   S\R                   S\R                   S\R                   4S jj5       rS rS r\R,                  R/                  SS5      r\R3                  S5        \R3                  S 5        \R,                  R5                  \S!S"5      S# 5       r\R,                  R5                  \S!S$5      S% 5       r\R,                  R5                  \S&S"5      S' 5       r\R,                  R5                  \S&S$5      S( 5       rgs  snnn f s  snnnnnf s  sn	nnnn f ))    N)get_best_config_fn)    @            )repeat                     
EXHAUSTIVE)   r   r   r   r   )r
   r   r   r   r   r   r   r   )r   r   r   )BLOCK_MBLOCK_NBLOCK_KGROUP_M)
num_stages	num_warpsr   r   r   r   c           	      6   [         R                  " SS9n[         R                  " X<5      n[         R                  " XM5      nUU-  nUU-  nUU-  n[        UU-
  U5      nUUU-  -   nUU-  U-  n[         R                  " U X54Xg4UU-  S4X4SS9n[         R                  " UXT4X4SUU-  4X4SS9n[         R
                  " X4[         R                  S9n[        SX^5       Hy  n[         R                  " USS9n[         R                  " USS9nU[         R                  " UU5      -  n[         R                  " USU45      n[         R                  " UUS45      nM{     Un[         R                  " UX44X4UU-  UU-  4X4SS9n[         R                  " UUSS9  g)	ziKernel for computing the matmul C = A x B.
A has shape (M, K), B has shape (K, N) and C has shape (M, N)
r   )axis)r
   r   )baseshapestridesoffsetsblock_shapeorderdtype)r   r
   )boundary_checkN)tl
program_idcdivminmake_block_ptrzerosint32rangeloaddotadvancestore) a_ptrb_ptrc_ptrMNK	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnr   r   r   r   pid	num_pid_m	num_pid_nnum_pid_in_groupgroup_idfirst_pid_mpid_mpid_na_block_ptrb_block_ptraccumulatorkabcc_block_ptrs                                    U/home/james-whalen/.local/lib/python3.13/site-packages/torchao/kernel/intmm_triton.py!matmul_kernel_with_block_pointersrM   M   s   @ --Q
C#I#I*&&HW$K)k)73G3=)E##/E ##f&!$&K ##f&EGO$&K ((G-RXX>K1a! GGK7GGK7rvva|# jjq'l;jjwl; " 	A
 ##f&%'/2&K HH[!F3    EVEN_KACC_TYPEc                    [         R                  " S5      nXO-   S-
  U-  nUU-   S-
  U-  nUU-  nUU-  n[        UUU-  -
  U5      nUU-  UU-  -   nUU-  U-  nUU-  [         R                  " SU5      -   nUU-  [         R                  " SU5      -   n[         R                  " [         R
                  " UU-  U5      U5      n[         R                  " [         R
                  " UU-  U5      U5      n [         R                  " SU5      n!U US S 2S 4   U-  U!S S S 24   U-  -   -   n"UU!S S 2S 4   U	-  U S S S 24   U
-  -   -   n#[         R                  " UU4US9n$[        USU* 5       H  n%U(       a-  [         R                  " U"5      n&[         R                  " U#5      n'O@[         R                  " U"U!S S S 24   U%:  SS9n&[         R                  " U#U!S S 2S 4   U%:  SS9n'U$[         R                  " U&U'5      -  n$U"UU-  -  n"U#UU	-  -  n#M     UU-  [         R                  " SU5      -   nUU-  [         R                  " SU5      -   nUS S 2S 4   n(US S S 24   n)U(U:  U)U:  -  n*U)UU(-  -   n+[         R                  " U[         R                  " U(U*R                  5      -   U*SS9n,[         R                  " U[         R                  " U+U*R                  5      -   U$U,-  U*5        g )Nr   r
   r!   g        )maskother
evict_last)eviction_policy)r$   r%   r'   arangemax_contiguousmultiple_ofr)   r+   r,   r-   broadcast_tor   r/   )-r0   r1   r2   s1_ptrr3   r4   r5   r6   r7   r8   r9   r:   r;   
stride_s1m
stride_s1nr   r   r   r   rO   rP   r<   grid_mgrid_nwidthr@   
group_sizerB   rC   rmrnramrbnrkABaccrG   rH   rI   idx_midx_nrR   xindextmp0s-                                                rL   (scaled_matmul_kernel_with_block_pointersrm      s   > --
CkAo')F'kAo')F fEe|HVh00':Jw#
"23E5[j)E	299Q0	0B	299Q0	0B


BNN267;W
EC


BNN267;W
EC	1g	BQW	)BtQwK),CCDAAtGy(3tQw<)+CCDA
((GW%X
6C1a'"
A
A47as;A1d7as;Arvva|	Wy  	Wy   # 
299Q0	0B	299Q0	0Bq$wKEtQwKEAI%!)$D a%i F77"//%45$D
 HHUboofdjj9:C$JMrN   c                   ^^ U R                   u  mnUR                   u  nmUU4S jn[        U   " U UUTTUU R                  S5      U R                  S5      UR                  S5      UR                  S5      UR                  S5      UR                  S5      4UR                  UR                  UR
                  S.UR                  D6  U$ )Nc                 p   > [         R                  " TU S   5      [         R                  " TU S   5      -  4$ Nr   r   tritonr&   METAr3   r4   s    rL   <lambda>#int_matmul_kernel.<locals>.<lambda>  .    AtI'&++ai*IIrN   r   r
   )r   r   num_ctas)r   rM   strider   r   rx   kwargs)rH   rI   rJ   configr5   gridr3   r4   s         @@rL   int_matmul_kernelr}      s    77DAq77DAqD &d+												 ""$$  --!$ HrN   c                   ^^ U R                   u  mnUR                   u  nmUU4S jn[        U   " U UUUTTUU R                  S5      U R                  S5      UR                  S5      UR                  S5      UR                  S5      UR                  S5      UR                  S5      UR                  S5      4UR                  UR                  UR
                  US-  S:H  S.UR                  D6  U$ )Nc                 p   > [         R                  " TU S   5      [         R                  " TU S   5      -  4$ rp   rq   rs   s    rL   ru   *int_scaled_matmul_kernel.<locals>.<lambda>!  rw   rN   r   r
   r   )r   r   rx   rO   )r   rm   ry   r   r   rx   rz   )	rH   rI   scales1rJ   r{   r5   r|   r3   r4   s	          @@rL   int_scaled_matmul_kernelr     s    77DAq77DAq
D -T2												qq  ""$$A
'( --), HrN   torchaoFRAGMENTz(int_matmul(Tensor a, Tensor b) -> Tensorz?int_scaled_matmul(Tensor a, Tensor b, Tensor scales1) -> Tensor
int_matmulMetac                     U R                   u  p#UR                   u  p4[        R                  " X$4U R                  [        R                  S9$ Ndevicer"   )r   torchemptyr   r*   )rH   rI   r3   r5   r4   s        rL   int_matmul_metar   B  s5    77DA77DA;;vahhekkBBrN   CUDAc                 ^   U R                   S   UR                   S   :X  d   S5       eU R                   u  p#UR                   u  p4[        R                  " X$4U R                  [        R                  S9n[        [        XU/[        5      nUc  [        R                  " / 5      $ [        XXV5      $ Nr
   r   zIncompatible dimensionsr   )	r   r   r   r   r*   r   r}   int8_mm_kernel_configstensor)rH   rI   r3   r5   r4   rJ   best_configs          rL   int_matmul_cudar   I  s     771:#>%>># 77DA77DAQF1885;;?A$A!9&<K ||BQ122rN   int_scaled_matmulc                     U R                   u  p4UR                   u  pE[        R                  " X54U R                  UR                  S9$ r   )r   r   r   r   r"   )rH   rI   r   r3   r5   r4   s         rL   int_scaled_matmul_metar   ]  s5    77DA77DA;;vahhgmmDDrN   c                 &   U R                   S   UR                   S   :X  d   S5       eU R                   u  p4UR                   u  pE[        R                  " X54U R                  UR                  S9n[        [        XX&/[        5      n[        XX&U5      $ r   )r   r   r   r   r"   r   r   r   )rH   rI   r   r3   r5   r4   rJ   r   s           rL   int_scaled_matmul_cudar   d  s     771:#>%>># 77DA77DAQF1887==AA$ 1"46LK $A'kBBrN   )	itertoolsr   rr   triton.languagelanguager$   torchao.kernel.autotunerr   sumproductr   	_inductorr{   max_autotune_gemm_search_spaceConfigjit	constexprrM   r*   rm   r}   r   libraryLibrarylibdefineimplr   r   r   r   )
ijrG   r   r   r   r   r   sws
   0000000000rL   <module>r      s       7 ( #**+=aH'& IIQ1! 1aO1aO1aO1aO1aO1aO1aO1aO1aO1aO1aO1aO1aO1aO1aO	
" I'* - 2 	??88LH *3):):"1*
*
%Gg 3J"I 
7	:
 # 	; 3	 	;*
@ 2 2qQ MMa@  
 2  _4& \\'_4( \\)_4* \\+_4, \\-_4 _4D 8  XX7KN, \\-KN. \\/KN0 \\1KN2 \\3KN4 LL5KN6 ll7KN KN\6 F mmIz2 

5 6 

L M Cv.C /C Cv.3 /3& C,f5E 6E C,f5C 6Ck
24s   AJ0&J7#K 