
    hT              !          S SK r S SKJr  S SKrS SKJr  S SKJrJrJ	r	J
r
Jr  S SKJrJrJr  S SKJr  \ R$                  " SS5      rS	S	SSSSSSSS
.	S jr        S)S jrS	S	SSSSSSS.S\R,                  S\R,                  S\R,                  S\\R,                     S\\R,                     S\\R,                     S\S\\\\   \\   \\   4      S\\   4S jjr\" 5       (       a  S SKrS SKJr  \R@                  S\RB                  S\RB                  S\RB                  S\RB                  S\RB                  S\RB                  S\RB                  S\RB                  S \RB                  S!\RB                  S"\RB                  S#\RB                  S$\RB                  S%\RB                  S&\RB                  S'\RB                  4 S( j5       r"gSr"g)*    N)Optional)	warn_once)broadcast_batch_dimslaunch_kernelprepare_inputsptr_stride_extractortile_to_blocksize)get_metaminimizeupdate)
has_tritonBSR_AUTOTUNEF   )	betaalpha
left_alpharight_alphaoutstoreverboseforceopnamec       	           ^^^^#^$ SSK m$Uc  SnUR                  S   nUR                  5       nUR                  5       nUR	                  5       S-
  nUR                  XS-    u  nnUR                  US-   US-    u  nn[        SSS[        UU-  S5      S	9n[        SUR                  5       U-  U-  UU-  -  -
  S5      nUR                  nUc  UnOUR                  nUUL a  UnOUU4nSUU4nUUUUUTS:H  TS:H  US:H  4n[        UUUS
S9nUc  Sn[        UUSUS4S
S9nUc  UnOU
(       d  U$ S
nXX$U4UUUU$4S jjnUUUUU4S jn[        UUUUSU	S9u  m#nn n!U	(       a  [        SU! SUS SU S S35        U(       a[  U(       a  T#U:X  a  UULdI  [        R                  R                  5       n"[!        UU"UU[#        U#4S j[%        T#5       5       5      5        T#$ )zTune bsr_dense_addmm kernel parameters against the given inputs.

When store is True, the tuning results will be stored in the
database of kernel parameters.
r   Nbsr_dense_addmmr            )GROUP_SIZE_ROW
num_stages	num_warpsSPLIT_NT)versionexactF      ?c           	      b   >^ ^^^^^ UUUUUUU UU	4	S jnT
R                   R                  USSS9$ )Nc                  &   >	 [        TTTTT TTTTS9	$ )N)r   r   r   r   metar   )r   )	r   r   bsrdenseinputr   r(   r   r   s	   W/home/james-whalen/.local/lib/python3.13/site-packages/torchao/kernel/bsr_triton_ops.py	test_func6tune_bsr_dense_addmm.<locals>.bench.<locals>.test_func_   s+    "%'
 
    i  d   )warmuprep)testingdo_bench)r(   r+   r)   r*   r   r   r-   r   r   r   tritons   `````` r,   bench#tune_bsr_dense_addmm.<locals>.bench^   s+    	 	 ~~&&y#&FFr/   c	                 D   U S;   n	[        SSSSS9U    n
[        [        XW-  S5      S9R                  U 5      n[        SSSSS9U    nU	(       a  US:  a  XU-  -  OX[        U5      -  -  nOXU-  -   nU
b  [        X5      nUb  [	        X5      nU S:X  a
  X]-  S:w  a  U$ U$ )N>   r"   r!   r   )r"   r!   r    r   )r"   r   r   r"   )dictmaxgetabsmin)namevalue	directionr(   MNKBMBKis_log	min_value	max_value
value_step
next_values                 r,   step_meta_parameter1tune_bsr_dense_addmm.<locals>.step_meta_parametero   s     11aAaPQUV	QWa155d;	!qQqQRVW
 q= I--S^;<  i!77J Z3J Z3J91!4Lr/   )max_stepr   z-> z
, speedup=z.1fz %, timing=z.3fz msc              3   .   >#    U  H
  nTU   v   M     g 7fN ).0kr(   s     r,   	<genexpr>'tune_bsr_dense_addmm.<locals>.<genexpr>   s     0<a$q'<s   )r5   shapevaluescrow_indicesdimr9   r:   round_nnzdtyper
   r   printtorchcudaget_device_namer   tuplesorted)%r+   r)   r*   r   r   r   r   r   r   r   r   r   rB   rV   rW   
batch_ndimrA   rC   rD   rE   reference_metasparsityr[   	out_dtypeversion_dtyper#   keyinitial_metamay_skip_updater6   rK   speeduptimingsensitivity_messagedevice_namer(   r5   s%      ` ``                            @@r,   tune_bsr_dense_addmmrn      s@   ( ~"BAZZ\F##%L!!#a'J99Zq.1DAq\\*q.:>:FB Q!Sb!_N
 Qb2-Q77;HIIE
{	II	E	*-*GaBDAItqy%1*
=C FCELa_DQ)L   G G" =>aBSU 2 2:2.D'6. '(7C.VSMMNDL0\5Wjj00206$<00	
 Kr/   c                    Uc  [         R                  nUc  UnUc  SnXyX1S 1:X  GaQ  [         R                  R                  5       nXX#XES:H  US:H  US:H  4nXL a  UnOX4n[	        SUUUUU4S9nUc  US:w  a  [	        SUUUUS4S9nUc  XLa  [	        SUUXS4S9nUc  [	        S/ US S QSPUSS  Q7UUUS4S9nUc  XLa  [	        S/ US S QSPUSS  Q7UXS4S9n[        U=(       d    0 5       H=  nUU   nUS   nUS	   nUU-  nUU-  S:X  d  M"  UU::  d  M*  [        U5      nUU-  US	'   M?     Ub  UR                  " S0 UD6  U$ [        S
U < SU< SU< SU< SU< SU< SU< SU< SU< S35        U=(       d    [        X#-  S5      nU=(       d    SnU
=(       d    Sn
U	=(       d    Sn	[        SUUU
U	S.UD6$ )Nr%   r   r   r   )r#   r   *r   r"   z@bsr_dense_addmm uses non-optimal triton kernel parameters for M=z K=z N=z Ms=z, Ks=z beta=z alpha=z dtype=z out_dtype=zC. To find optimal triton kernel parameters, run with BSR_AUTOTUNE=1r   )r"   r   r    r!   rP   )
r]   float16r^   r_   r
   ra   r9   r   r   r:   )rA   rC   rB   MsKsr   r   r"   r   r!   r    rd   r[   re   _versionextrarm   rg   rf   r(   matching_metamkeymeta_nsplit_ncs                             r,   bsr_dense_addmm_metar|      s   * }	J7D6Ajj002QB	419eqjA!M!,M}h7	
 <HO!!=#6	D <E2!3hs=SD <$!)#bq')3)QR)!=#6	M $)? (%-c"1g-s-SW-%c2	! }23%d+G	*Lq5A:!q&;D&'1fDO 4 KK % Kt4QD!Ure6bU'D7(E88UHLi\ ZTT (QWaG#(qNqJQI %	
  r/   )r   r   r   r   r   skip_checksmax_gridr(   r+   r)   r*   r   r   r   r}   r~   r(   c                d	  ^^^
^ ^!^"^#^$^% SnUR                  5       nUR                  5       nUR                  5       nUR                  5       S-
  nUR                  XS-    u  nnUR                  US-   US-    nUR                  S   n[        XU5      nUc  UR                  UUU4-   5      nUR                  5       S:X  d  TS:X  d  US:X  d  US:X  d  US:X  aB  TS:X  a  UR                  5         U$ UR                  U 5        TS:w  a  UR                  T5        U$ T
c{  [        SUR                  5       US   -  US   -  UU-  -  -
  S5      n[        (       a  [        U UUTTUUUSS	SSS
9m
O,[        UUUUS   US   TTUUR                  UR                  S9
m
S	m$S	m%Uc(  Sm$UR                  S5      R                   " / UQUPUP76 nO*UR"                  " / UQUPSP76 R                   " / UQUPUP76 nUc(  Sm%UR                  S5      R                   " / UQUPUP76 nO*UR"                  " / UQSPUP76 R                   " / UQUPUP76 nUR%                  5       S   S:X  d   eUR%                  5       S   S:X  d   eUn['        XX%Xg5      u  nnnn nnnnUu  m!m T
R)                  S[+        UT!-  S5      5      nUU-  m"Un[-        UT!T"45      n[-        UT T"45      n[-        U T!T"45      n [-        UT!T"45      n[-        UT!T"45      n[.        R0                  [2        R4                  [.        R6                  [2        R4                  [.        R4                  [2        R8                  [.        R8                  [2        R8                  [.        R:                  [2        R<                  [.        R<                  [2        R<                  0UR                     m#UR?                  S5      nUR?                  S5      S-
  nUR?                  S5      nUUU4nU	b*  [A        U	SS SSS2   5      SS[C        U	SS 5      -
  -  -   nOSnUSUSUSU SUSUSUSUS0nTS:w  d   eU U!U"UUU#U$U
U%4	S jn[E        UUUU5        URG                  5       URG                  5       :w  a*  UR                  UR#                  UR                  5      5        U$ )zCompute

  out = beta * input + left_alpha.reshape(-1, 1) * (alpha * (bsr @ dense)) * right_alpha.reshape(1, -1)

where left_alpha, right_alpha are (* + 1)-D tensors when
specified, otherwise, these are treated as tensors filled with
ones.
r   r   r   r   r   Nr   TF)	r   r   r   r   r   r   r   r   r   )rd   r[   re   rP   r"   rO   )r   NN)r   Nr   )r   r   )r   r   Nc                    >	 [         U    " / [        U6 QTPTP7TS:H  TS:g  TS:H  TT
TTTT[        R                  :H  TS.
T	D6  g )Nr   r   )
beta_is_onebeta_is_nonzeroalpha_is_oneleft_alpha_is_oneright_alpha_is_oneBLOCKSIZE_ROWBLOCKSIZE_INNERBLOCKSIZE_COL
allow_tf32	acc_dtype)_bsr_strided_addmm_kernelr   tlfloat32)gridsliced_tensorsrE   rD   BNr   r   dot_out_dtyper   r(   r   s     r,   kernelbsr_dense_addmm.<locals>.kernel  ss    !$' 	
!>2	
	
 	
 	 AI!/1$

2#	
 	
r/   )$rV   rW   col_indicesrX   rU   r   	new_emptyrZ   zero_copy_mul_rY   AUTOTUNErn   r|   r[   expandviewstrider   r;   r:   r	   r]   rq   r   r   bfloat16float64int8int32sizer`   lenr   data_ptr)&r+   r)   r*   r   r   r   r   r   r}   r~   r(   f_namerV   rW   r   rb   rA   rC   	blocksizerB   original_batch_dims_broadcastedrd   
out_backupr"   out_untiled	n_batchesn_block_rowsn_block_cols	full_gridgrid_blockstensor_dims_mapr   rE   rD   r   r   r   r   s&      ``     `                     @@@@@@r,   r   r     s    , FZZ\F##%L//#K!!#a'J99Zq.1DAqZ!^j1n=IBA&:6&N#
{oo=AFG
xxzQ%1*Q!q&AF19IIK
 
 IIeqy
|SXXZ)A,61EQOOQRS8'%'(D (!!!kk))D  __R(// 
,
./
12

  __L&ELqL!LSS 
,
./
12

 !oob)00 
,
./
12
 "&&N(GNNANUU 
,
./
12
 r"a'''#q(((J 	s5kG	 FBhhy#a2gq/2G	
gBK
C"b
*Ceb"X.Eeb"X.E":Bx8J#K"b:K 	rzz

rzzrzz

BHHRXX 
iiM 

1I$$R(1,L::b>LL,7IHRaL2./'QXbq\AR=R2SS 	m_{}K[[	O A::
 
$ &/9kB
||~,,.. 	))**:*:;<r/   left_alpha_tiled_col_strideleft_alpha_col_block_strideright_alpha_tiled_row_strideright_alpha_row_block_strider   r   r   r   r   r   r   r   r   r   r   r"   c7           	         US:X  d   eUS:X  d   eUS:X  d   eU!S:X  d   e[         R                  " SS9n7[         R                  " SS9n8[         R                  " SS9n9[         R                  " SS9n:[         R                  " SS9n;[         R                  " U8U9U:U;U55      u  n8n9UUU7-  -   UU8-  -   n<[         R                  " U<5      n=[         R                  " U<U-   5      n>U>U=-
  n?[         R
                  " SU05      n@[         R
                  " SU25      nAU1S:  d	  U1S-  S:w  a  SnBOU1nB[         R
                  " SWB5      nCU UU7-  -   UU=-  -   UW@S S 2S 4   -  -   UWAS S S 24   -  -   nDUUU7-  -   UU9-  -   UUAS S 2S 4   -  -   UUCS S S 24   -  -   nEU#U$U7-  -   U%U8-  -   U&U9-  -   U'U@S S 2S 4   -  -   U(UCS S S 24   -  -   nFUU	U7-  -   U
U=-  -   nG[         R                  " U0UB4U3S9nH[        U?5       Hx  nI[         R                  " WD5      nJ[         R                  " WG5      nK[         R                  " WEUUK-  -   WCS S S 24   U1:  S9nLWH[         R                  " UJULU4U3S9-  nHUDU-  nDUGU
-  nGMz     U-(       d  WHU*-  nHU.(       dG  UUU7-  -   UU8-  -   UU9-  -   UW@S S 2S 4   -  -   UWCS S S 24   -  -   nMWH[         R                  " UM5      -  nHU/(       dG  UUU7-  -   UU8-  -   U U9-  -   U!W@S S 2S 4   -  -   U"WCS S S 24   -  -   nNWH[         R                  " UN5      -  nHU,(       ak  UUU7-  -   UU8-  -   UU9-  -   UW@S S 2S 4   -  -   UWCS S S 24   -  -   nOU+(       a  WH[         R                  " WO5      -  nHOWHU)[         R                  " WO5      -  -  nH[         R                  " WFWHR                  U#R                  R                  5      WCS S S 24   U1:  S9  g )	Nr   r   )axisr      )r[   )mask)r   re   )r   
program_idnum_programs	swizzle2dloadarangezerosrangedotr   tor[   
element_ty)P
values_ptrvalues_batch_stridevalues_nnz_stridevalues_row_block_stridevalues_col_block_stridecrow_indices_ptrcrow_indices_batch_stridecrow_indices_stridecol_indices_ptrcol_indices_batch_stridecol_indices_stride	input_ptrinput_batch_strideinput_tiled_row_strideinput_tiled_col_strideinput_row_block_strideinput_col_block_stride	dense_ptrdense_batch_stridedense_tiled_row_stridedense_tiled_col_stridedense_row_block_stridedense_col_block_strideleft_alpha_ptrleft_alpha_batch_strideleft_alpha_tiled_row_strider   left_alpha_row_block_strider   right_alpha_ptrright_alpha_batch_strider   right_alpha_tiled_col_strider   right_alpha_col_block_stride
output_ptroutput_batch_strideoutput_tiled_row_strideoutput_tiled_col_strideoutput_row_block_strideoutput_col_block_strider   r   r   r   r   r   r   r   r   r   r   r   r   r"   	batch_pidrow_block_pidcol_block_pidr   r   crow_indices_offset_ptr
nnz_offsetnnz_offset_nextrow_nnzrow_block_arangeinner_block_arangePADDED_BLOCKSIZE_COLcol_block_arangevalues_block_ptrsdense_block_ptrsoutput_ptrscol_index_nnz_ptroutput_acc_block_values_blockdense_row_idxdense_blockleft_alpha_ptrsright_alpha_ptrs
input_ptrssP                                                                                   r,   r   r     s   V +a///*a///+q000+q000MMq)	1-1-A.A.')||=,n(
$}
 ')34!M12 	 
 WW45
''"9<O"OP "J.99Q6YYq/:2!3q!813 1> 99Q(<= !I-.*,- &(8D(AAB &(:47(CC	D 	  9,-$}45 %'9!T''BBC %'7a'@@	A 	 !I-.%56 &56 &(8D(AA	B
 &(8q(AAB 	 &23 :-. 	 8801
 wA77#45L GG$56M'' #9M#II%dAg.>K kjI! 
 !22!33%  ( % )I56-=> .=> .0@D0II	J
 .0@q0IIJ   88!*Y67.>? />? /1A!T'1JJ	K
 /1A$'1JJK  (8 99$y01(=89 )=89 )+;AtG+DD	E
 )+;D!G+DDE   BGGJ$77  D277:+>$>>  	
 0 0 ; ;<!$'*]:	
r/   )NNNNNNNr   )#ostypingr   r]   torch._dynamo.utilsr   torch.sparse._triton_opsr   r   r   r   r	   torch.sparse._triton_ops_metar
   r   r   torch.utils._tritonr   getenvr   rn   r|   Tensorboolr`   intr9   r   r5   triton.languagelanguager   jit	constexprr   rP   r/   r,   <module>r     s   
   )  E D *99^U+ 



E` 
^L 

)-*."&MQu<<u	u <<u &u %,,'u 
%,,	u u uXc]HSM8C=HIJu 4.up << ZZU
L &(\\MU
P &(\\QU
Z ')ll[U
^ ')ll_U
x \\yU
z {U
| ll}U
~ <<U
@ LLAU
B ||CU
D ||EU
F GU
H <<IU
J LLKU
L MU
N OU
 U
p !%r/   