
    ΅id             !          S SK r S SKrS SKrS SKJr  S SKrS SKJr  S SKJ	r	  SSK
Jr  \" \R                  " SS5      5      rS	 rS
 rS rS rS rS rS rS rS rS rS rS rSqS jrS rS rS rS rSS.S jr       SrS jr!        SsS jr" " S S5      r#\" \S 9S! 5       r$ StS" jr%SuS# jr&SSSSSS$SSS%.S&\RN                  S'\RN                  S(\RN                  S)\RN                  S-  S*\RN                  S-  S+\RN                  S-  S,\(S-\)\S-  \S-  \S-  4   S-  S.\*S-  4S/ jjr+SSSSSS$SSS%.S&\RN                  S'\RN                  S(\RN                  S)\RN                  S-  S*\RN                  S-  S+\RN                  S-  S,\(S-\)\S-  \S-  \S-  4   S-  S.\*S-  4S0 jjr,\	" 5       (       Ga&  S SK-r-S SK.J/r0  \-Rb                  S1\0Rd                  S2\0Rd                  S3\0Rd                  S4\0Rd                  S5\0Rd                  S6\0Rd                  4S7 j5       r3\-Rb                  S2\0Rd                  S3\0Rd                  S5\0Rd                  S6\0Rd                  S8\0Rd                  4
S9 j5       r4S: r5S;S;SS$SS<.S&\RN                  S=\RN                  S>\RN                  S+\RN                  S-  S,\(S-\)\S-  \S-  \S-  4   S-  4S? jjr6SS$SSS@.S'\RN                  S(\RN                  S+\RN                  S-  S,\(S-\)\S-  \S-  \S-  4   S-  S.\*S-  4SA jjr7\-Rb                  SB\0Rd                  SC\0Rd                  4SD j5       r8SqSE jr9   SvSF\RN                  SG\RN                  SH\RN                  SI\RN                  S-  SJ\:SK\(SL\:S-  4SM jjr;\-Rb                  SN\0Rd                  SO\0Rd                  SP\0Rd                  SQ\0Rd                  SR\0Rd                  SS\0Rd                  S6\0Rd                  4ST j5       r<SU\RN                  SV\RN                  SW\RN                  SX\RN                  SY\RN                  4
SZ jr=\-Rb                  S[\0Rd                  S\\0Rd                  SQ\0Rd                  S]\0Rd                  SR\0Rd                  SS\0Rd                  S^\0Rd                  S6\0Rd                  4S_ j5       r> SwSU\RN                  SV\RN                  S`\RN                  Sa\RN                  Sb\RN                  Sc\RN                  S.\*SY\RN                  Sd\(4Se jjr?\-Rb                  Sf\0Rd                  Sg\0Rd                  Sh\0Rd                  Si\0Rd                  Sj\0Rd                  Sk\0Rd                  Sl\0Rd                  Sm\0Rd                  Sn\0Rd                  S2\0Rd                  S3\0Rd                  So\0Rd                  S5\0Rd                  S6\0Rd                  S8\0Rd                  S]\0Rd                  4 Sp j5       r@gSr9Sr7Sr6Sr;Sr=Sr?Sr@g)x    N)	lru_cache)	warn_once)
has_triton   )get_meta*TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE   c                 (    U (       d  [        U5      eg N)
ValueError)condmsgs     R/home/james-whalen/.local/lib/python3.13/site-packages/torch/sparse/_triton_ops.pycheckr      s    o     c                 X    [        UR                  [        R                  :H  U  S35        g )Nz@(): only BSR sparse format is supported for the sparse argument.)r   layouttorch
sparse_bsr)f_namets     r   check_bsr_layoutr      s'    		E$$$(RSr   c                 |    [        UR                  U:H  =(       a    UR                  R                  S:H  U  S35        g )Ncudaz9(): all inputs are expected to be on the same GPU device.)r   devicetype)r   r   r   s      r   check_devicer       s3    		F6qxx}}6(KLr   c           	      .   [        UR                  5       S:  =(       a    UR                  5       S:  U  SUR                  5        SUR                  5        S35        UR                  SS  u  p4UR                  SS  u  pV[        XE:H  U  SU SU S35        g )Nr	   zc(): all inputs involved in the matrix product are expected to be at least 2D, but got lhs.dim() == z and rhs.dim() == .zw(): arguments' sizes involved in the matrix product are not compatible for matrix multiplication, got lhs.shape[-1] == z( which is not equal to rhs.shape[-2] == )r   dimshape)r   lhsrhs_mklkr_ns          r   check_mm_compatible_shapesr)   '   s    		Q)3779>(   #	{*<SWWYKq	J YYrs^FBYYrs^FB	
(   "t#KB4q	Rr   c           	          [        UR                  U:H  =(       aE    UR                  [        R                  [        R                  [        R
                  4[        U6 -   ;   U  SU SUR                   S35        g )Nz\(): all inputs are expected to be of the same dtype and one of (half, bfloat16, float32) or z, but got dtype == r   )r   dtyper   halfbfloat16floattuple)r   r   r+   additional_dtypess       r   check_dtyper1   8   sp    		5 	SGGZZ5?P8QQS( 33D2E FGG9A	'	r   c           	         ^ [        U5      S:w  a  [        S[        U5       35      eS mU4S jn[        U" U5      U  SUS    SUS    S	35        g )
Nr	   z"blocksize must have length 2, got c                     X S-
  -  (       + $ Nr    )vs    r   is_power_of_two(check_blocksize.<locals>.is_power_of_twoG   s    QK  r   c                 X   > SnU  H   nUS:  =(       a    T" U5      =(       a    UnM"     U$ )NT   r5   )bres	blocksizer7   s      r   is_compatible_blocksize0check_blocksize.<locals>.is_compatible_blocksizeJ   s1    I?Ay'AJsC  
r   z(): sparse inputs' blocksize (r   z, r   z;) should be at least 16 and a power of 2 in each dimension.)lenAssertionErrorr   )r   r=   r>   r7   s      @r   check_blocksizerB   C   sj    
9~A#i.AQRSS! 
	*(01b1 OD 	Dr   c                 `    [        U R                  5       5      S:  a  U R                  5       $ U $ )a  Return input as a triton-contiguous tensor.

A triton-contiguous tensor is defined as a tensor that has strides
with minimal value smaller than or equal to 1.

While triton kernels support triton-non-contiguous tensors (all
strides being greater than 1) arguments, a considerable slow-down
occurs because tensor data is copied element-wise rather than
chunk-wise. Zero strides is assumed to not have this defect.
r   )minstride
contiguous)r   s    r   make_triton_contiguousrG   X   s)     188: ||~r   c                 v     [         R                  " S U 5       6 $ ! [         a    [        SU  S35         g f = f)Nc              3   >   #    U  H  oR                   S S v   M     g 7fNr    r"   .0r   s     r   	<genexpr>'broadcast_batch_dims.<locals>.<genexpr>m   s     'Fgg   Fz3(): inputs' batch dimensions are not broadcastable!)r   broadcast_shapes	Exceptionr   )r   tensorss     r   broadcast_batch_dimsrT   k   sB    U%%'Fg'FGG UexRSTUs    88c              '   l   #    U H*  n[        S 5      /UR                  5       -  nXU '   X4   v   M,     g 7fr   )slicer!   )r!   slice_rangerS   r   slicess        r   slicerrY   r   s3     +(!si s   24c              '      #    U HL  n[        S 5      /UR                  5       -  n[        XSS9 H  u  pVUc  M
  XdU'   M     U[        U5         v   MN     g 7f)NFstrict)rV   r!   zipr/   )dimsrX   rS   r   sdd_slices          r   multidim_slicerrb   y   sT     4[MAEEG#d59JA}! : ak s
   3AAc               '   Z   #    U  H  nUv   UR                  5        S h  vN   M!     g  N	7fr   )rE   )rS   r   s     r   ptr_stride_extractorrd      s&     88: s   +)
+c           
   #   2  ^ ^^#    [        T 5      S:  d  [        T 5      S:  a  [        S[        T 5       35      e[        T5      S:  d  [        T5      S:  a  [        S[        T5       35      eSS KnU U4S jnU4S jnUR                  " U" 5       6  Hp  n[	        T UTSS9 VVV	s/ s H  u  pxn	[        Xx-
  U	5      PM     n
nnn	[	        XjSS9 VVs/ s H  u  p[        XU-   5      PM     nnnU
S S S	2   /U" U5      Q7v   Mr     g s  sn	nnf s  snnf 7f)
Nr      z"full_grid length must be 0-3, got z$grid_blocks length must be 0-3, got c               3   R   >#    [        TTSS9 H  u  p[        SX5      v   M     g 7f)NFr[   r   )r]   range)fgmg	full_gridgrid_blockss     r   generate_grid_points.grid_partitioner.<locals>.generate_grid_points   s)     )[?FB2"" @s   $'c              3   n   >#    TR                  5        H  u  p[        [        X U5      5      v   M     g 7fr   )itemsnextrb   )rX   r   t_dimstensor_dims_maps      r   generate_sliced_tensors1grid_partitioner.<locals>.generate_sliced_tensors   s-     (..0IAvq9:: 1s   25Fr[   )r@   rA   	itertoolsproductr]   rD   rV   )rk   rl   rs   rw   rm   rt   
grid_pointri   gprj   gridgrX   s   ```          r   grid_partitionerr}      s    
9~S^a/A#i.AQRSS
;!s;/!3CCDTCUVWW#;  '')=)?@
 ")ZUS
S
 S 	 
 25Ze1TU1T%F#1TU 4R4j:26::: A
 Vs   B#D(D

DD.)Dc           	         ^ SS S S2   nUc  UnO S m[        U4S j[        X4SS9 5       5      n[        X#U5       H  tpVU " U/UQ76   M     g )N)i  r   rv   c                 6    U c  U$ [        S[        X5      5      $ r4   )maxrD   )r|   rj   s     r   valid_grid_dim%launch_kernel.<locals>.valid_grid_dim   s    y	 1c!j))r   c              3   8   >#    U  H  u  pT" X5      v   M     g 7fr   r5   )rM   r|   rj   r   s      r   rN    launch_kernel.<locals>.<genexpr>   s"      
F 1!!Fs   Fr[   )r/   r]   r}   )kernelrs   rk   rl   cuda_max_gridr{   sliced_tensorsr   s          @r   launch_kernelr      si    .tt4M#	*  
[F
 

 "2" 	t%n%"r   c           
      <   U R                  5       R                  S5      nU R                  5       R                  S5      n[        U R	                  5       R                  S5      5      nU Vs/ s H  n[        UR                  S5      5      PM     nn[
        R                  " UR                  S S /S U 5       Q76 nS nU" X'S5      nU" X7S5      nU" XGUR                  SS  5      nU Vs/ s H  nU" XWUR                  SS  5      PM     nnX#U/UQ7$ s  snf s  snf )Nr   c              3   >   #    U  H  oR                   S S v   M     g 7frJ   rK   rL   s     r   rN   !prepare_inputs.<locals>.<genexpr>   s     ;7aWWSb\7rP   c                 `    U R                  X-   5      R                  S[        U5      S-
  5      $ )Nr   r   )broadcast_toflattenr@   )r   
batch_dimsinvariant_dimss      r   batch_broadcast_and_squash2prepare_inputs.<locals>.batch_broadcast_and_squash   s/    ~~j9:BBs:"
 	
r   rv   r    )crow_indices	unsqueezecol_indicesrG   valuesr   rQ   r"   )	bsrdense_tensorsr   r   r   r   rS   batch_dims_broadcastedr   s	            r   prepare_inputsr      s,   ##%//2L//#--a0K#CJJL$:$:1$=>F?LM}!%akk!n5}GM #33Sb;7;

 .eL -[RWXK'RS(9F
 A 	#1aggbclK  
 f6w667 N,s   +$D+ Dc                 |   [        X/UQ76 nUR                  5       R                  US-   5      nUR                  5       R                  US-   5      nUR	                  5       R                  X1R	                  5       R
                  SS  -   5      nX1R
                  SS  -   n[        R                  " XEXgUR                  S9$ )Nr   r   r    sizer   )	rT   r   r   r   r   r"   r   sparse_compressed_tensorr   )r   r   rS   batch_shaper   r   r   r   s           r   broadcast_batch_dims_bsrr      s    &v=W=K##%22;3FGL//#00u1DEKZZ\&&{ZZ\5G5G5L'LMF23'D))6SZZ r   c                     U R                   Gt p#nUX1S   -  US   XAS   -  US   /-   nU R                  U5      R                  SS5      $ )Nr   r   r   r    )r"   view	transpose)r   r=   restmn	new_shapes         r   tile_to_blocksizer      s^    ''KTa	q\!	q\!	 I 66)&&r2..r   c                    U R                   S:  a#  U R                  S5      n U R                   S:  a  M#  U R                   S:  a  U R                  SU R                   S-
  5      n U R                   S:w  a  [        SU R                   35      eU $ )zReturn tensor as 3D tensor by either prepending new dimensions to
the tensor shape (when ``tensor.ndim < 3``), or by collapsing
starting dimensions into the first dimension (when ``tensor.ndim >
3``).
rf   r   z3tensor should have 3 dimensions after reshape, got )ndimr   r   rA   r"   )tensors    r   	as1Dbatchr      s}     ++/!!!$ ++/{{Q6;;?3{{aA&,,P
 	
 Mr   accumulatorsc                   US   nU R                   S:w  a  [        SU R                    S35      eU R                  u  pVnUS:X  Gak  USS u  pUR                   S:w  a  [        SUR                    S35      eUR                  u  pnX{:w  a  [        S	U S
U S35      eUc?  UR                  S   S-
  n[        R                  " XU4U R
                  U R                  S9nO=UR                  u  pnX:w  a  [        SU SU S35      eX:w  a  [        SU SU S35      eUS-  (       d  US-  (       d  US-  (       d  [        cb  [        UR                  S   S-
  5       HA  nUU   nUUS-      n[        UU5       H!  nU	U   u  nnUU==   U U   UU   -  -  ss'   M#     MC     U$ [        XXU5        U$ US:X  Ga3  UR                  n[        U5      nUR                  u  nnnUU-  S:w  a  [        SU SU S35      eUSS u  nnnnnUS   nUc\  UUR                  5       R                  5       S-   U-  -   n [        R                  " / USS QU PUP7U R
                  U R                  S9nO*UR                  SS u  n n!U!U:w  a  [        SU! SU S35      eUR                  n"[        U5      nUU-  nUS-  (       d  US-  (       d  US-  (       d  [        c  UR                  5         [        U5       H  n#[        UR                  S   5       H  nUU   R                  5       n$UU   R                  5       nUUS-      R                  5       n[        U$U5      u  n%n&UU#U%U%U-   2U&U&U-   24   n'[        UU5       HE  nUU   UU   nn[        UR                  5       U5      u  n(n)U'U U   UU#U(U(U-   2U)U)U-   24   -  -  n'MG     M     M     O[        U UUUUUUU5        UR                  U"5      $ US:X  Gaf  UR                  n[        U5      nUR                  u  nnnUU-  S:w  a  [        SU SU S35      eUSS u  nnnnUS   nUc\  UUR                  5       R                  5       S-   U-  -   n [        R                  " / USS QU PUP7U R
                  U R                  S9nO*UR                  SS u  n n!U!U:w  a  [        SU! SU S35      eUR                  n"[        U5      nUU-  nUS-  (       d  US-  (       d  US-  (       d  [        c  [        U5       H  n#[        [!        U5      5       H  n*[        UU*   R                  5       U5      u  n%n&U%U-  n+U&U-  n,UU+   R                  5       n-UU+S-      R                  5       n.UU#U%U%U-   2U&U&U-   24   n'[#        [        U-U.5      5       HR  u  n/nUU,U.-  UU,-
  U--  -   U/-      R                  5       n[        UU5      u  n(n)U'U U   UU#U(U(U-   2U)U)U-   24   -  -  n'MT     M     M     O<[        R$                  " SUR
                  UR                  S9n[        U UUUUUUU5        UR                  U"5      $ ['        U5      e)aU  Scattered matrix multiplication of tensors.

A scattered matrix multiplication is defined as a series of matrix
multiplications applied to input tensors according to the input
and output mappings specified by indices data.

The following indices data formats are supported for defining a
scattered matrix multiplication operation (:attr:`indices_data[0]`
holds the name of the indices data format as specified below):

- ``"scatter_mm"`` - matrix multiplications scattered in batches
  of tensors.

  If :attr:`blocks` is a :math:`(*  imes M  imes K) tensor,
  :attr:`others` is a :math:`(*     imes K  imes N)` tensor,
  :attr:`accumulators` is a :math:`(*       imes M  imes N)` tensor,
  and :attr:`indices = indices_data['indices']` is a :math:`(*
    imes 3)` tensor, then the operation is equivalent to the
  following code::

    c_offsets, pq = indices_data[1:]
    for r in range(len(c_offsets) - 1):
        for g in range(c_offsets[r], c_offsets[r + 1]):
            p, q = pq[g]
            accumulators[r] += blocks[p] @ others[q]

- ``"bsr_strided_mm"`` - matrix multiplications scattered in
  batches of tensors and a tensor.

  If :attr:`blocks` is a :math:`(Ms         imes Ks) tensor,
  :attr:`others` is a :math:`(*     imes K  imes N)` tensor,
  :attr:`accumulators` is a :math:`(*       imes M  imes N)` tensor, then
  the operation is equivalent to the following code::

    c_indices, r_offsets, p_offsets, q_offsets, meta = indices_data[1:]
    for b in range(nbatches):
        for i, r in enumerate(r_offsets):
            r0, r1 = divmod(r, N)
            acc = accumulators[b, r0 : r0 + Ms, r1 : r1 + Ns]
            for g in range(c_indices[i], c_indices[i + 1]):
                p = p_offsets[g]
                q0, q1 = divmod(q_offsets[g], N)
                acc += blocks[p] @ others[b, q0 : q0 + Ks, q1 : q1 + Ns]

  where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are
  integer multiples of ``Ms`` and ``Ks``, respectively.

- ``"bsr_strided_mm_compressed"`` - matrix multiplications
  scattered in batches of tensors and a tensor. A memory and
  processor efficient version of ``"bsr_strided_mm"`` format.  If
  :attr:`blocks` is a :math:`(Ms    imes Ks) tensor, :attr:`others`
  is a :math:`(*    imes K  imes N)` tensor, :attr:`accumulators`
  is a :math:`(*    imes M  imes N)` tensor, then the operation is
  equivalent to the following code::

    c_indices, r_offsets, q_offsets, meta = indices_data[1:]
    for b in range(nbatches):
        for r in r_offsets:
            m = (r // N) // Ms
            n = (r % N) // Ns
            r0, r1 = divmod(r, N)
            c0, c1 = c_indices[m], c_indices[m + 1]
            acc = accumulators[b, r0 : r0 + Ms, r1 : r1 + Ns]
            for i, p in enumerate(range(c0, c1)):
                q = q_offsets[n * c1 + (SPLIT_N - n) * c0 + i]
                q0, q1 = divmod(q, N)
                acc += blocks[p] @ others[b, q0 : q0 + Ks, q1 : q1 + Ns]

  where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are
  integer multiples of ``Ms`` and ``Ks``, respectively.

  Notice that the order of ``r_offsets`` items can be arbitrary;
  this property enables defining swizzle operators via
  rearrangements of ``r_offsets`` items..

Auxiliary functions are provided for pre-computing
:attr:`indices_data`. For example,
:func:`bsr_scatter_mm_indices_data` is used to define indices data
for matrix multiplication of BSR and strided tensors.

Parameters
----------
blocks (Tensor): a 3-D tensor of first matrices to be multiplied

others (Tensor): a tensor of second matrices to be multiplied. If
  ``indices_data[0]=="scatter_mm"``, the tensor is a 1-D batch
  tensor of second input matrices to be multiplied. Otherwise, the
  second input matrices are slices of the :attr:`others` tensor.
indices_data (tuple): a format data that defines the inputs and
  outputs of scattered matrix multiplications.

Keyword arguments
-----------------

accumulators (Tensor, optional): a tensor of matrix product
  accumulators. If ``indices_data[0]=="scatter_mm"``, the tensor
  is a 1-D batch tensor of output matrices. Otherwise, output
  matrices are slices of the :attr:`accumulators` tensor.
r   rf   zblocks must be 3D, got D
scatter_mmr   Nzothers must be 3D, got z
blocks K (z) != others K ()r+   r   zaccumulators Ms (z) != blocks Ms (zaccumulators Ns (z) != others Ns (r:   bsr_strided_mmzK (z) must be divisible by Ks (SPLIT_Nr    accumulators N () != others N (bsr_strided_mm_compressed)r   )r   rA   r"   r   zerosr+   r   _scatter_mm2rh   r   r   item_scatter_mm6zero_divmodr   r@   	enumerateemptyNotImplementedError)0blocksothersindices_datar   indices_format_PMsKs	c_offsetspq_QKs_NsRMs_Ns_rg0g1r|   pqothers_shapeBKN	c_indices	r_offsets	p_offsets	q_offsetsmetar   MN_accumulators_shaper;   r_r0r1accq0q1jr   r   c0c1is0                                                   r   r   r     s   H "!_N{{a6v{{m1EFFJBB%$QR(	;;! #:6;;-q!IJJll9 :bTQ!GHH"Q&A ;;6<<L ',,KACy$'8=MbTQR%STTy$'8=MbTQR%STT7b2gbL,@9??1-12q\q1u%r2Aa5DAq Ovay6!9'<<O ' 3  E	+	+||6",,1ar6Q; 3qc)DRD!JKK;G;K8	9iDy/immo**,q0Q66A ;;*,s#*Q**&,,v}}L !&&rs+EArQw$'7t?1#Q%OPP)// .'\7b2gbL,@ 1Xyq12A"1**,B"1**,B"1q5)..0B#B]FB&q"rBw,R"W'DEC"2r](|Yq\1!'!!4Bvay6!R"r'\2R<2O+PPP + 3  	   !344	6	6||6",,1ar6Q; 3qc)DRD!JKK0<QR0@-	9iy/immo**,q0Q66A ;;*,s#*Q**&,,v}}L !&&rs+EArQw$'7t?1#Q%OPP)// .'\7b2gbL,@1Xs9~.A#IaL$5$5$7;FBbAbA"1**,B"1q5)..0B&q"rBw,R"W'DEC )%B- 81%a"f!r/A&AA&EFKKM!'1Bvay6!R"r'\2R<2O+PPP !9 /  IOOI4D4DI 	   !344 ".11r   c           
      n   XgXX1S 1:X  Ga(  [         R                  R                  5       n[        SXX#U4US[         R                  S4S9nUb  UR
                  " S*0 UD6  U$ XU4S:X  aT  X44S:X  a  SnSnSnS	nSn
S	n	GOX44S
:X  a  SnSnSnS	nSn
S	n	GOX44S:X  a  SnSnSnS	nSn
S	n	GOX44S:X  a  SnSnSnSnSn
S	n	GOwXU4S:X  aT  X44S:X  a  SnSnSnSnSn
Sn	GOZX44S
:X  a  SnSnSnS	nSn
Sn	GOEX44S:X  a  S	nSnSnS	nSn
S	n	GO0X44S:X  a  SnSnSnS	nSn
S	n	GOXU4S:X  ad  X44S:X  a  S	nSnSnSnSn
Sn	OX44S
:X  a  SnSnSnSnSn
Sn	OX44S:X  a  SnSnSnS	nSn
Sn	OX44S:X  a  SnSnSnS	nSn
S	n	OX44S:X  a  SnSnSnSnSn
S	n	OXU4S:X  ad  X44S:X  a  S	nSnSnSnSn
Sn	OX44S
:X  a  S	nSnSnS	nSn
Sn	OX44S:X  a  S	nSnSnS	nSn
S	n	OkX44S:X  a  SnSnSnS	nSn
S	n	OWX44S:X  a  S	nSnSnSnSn
S	n	OCXU4S:X  a;  X44S:X  a  SnSnSnSnSn
Sn	O'X44S
:X  a  SnSnSnSnSn
Sn	OX44S:X  a  SnSnSnSnSn
S	n	Uc*  SSS	SSSSSSS.	R                  US5      nUS:  a  US:  a  SnX(-  nUc  [        US:  a  SOSU5      nUc  [        US:  a  SOSU5      nU
=(       d    Sn
U	c  [        X5      S:  a  SSSS.R                  US	5      n	O`[        X5      S:X  a  SSSS.R                  US	5      n	O:[        X5      S:X  a  SS	S.R                  US	5      n	OSSS.R                  US	5      n	U=(       d    S	nXc:  a  [        SU SU S 35      eX~:  a  [        S!U S"U S 35      eX0:  a  [        S#U S$U  S 35      eX:  a  [        S%U S&U S 35      eXA:  a  [        S'U S(U S 35      e[        S*UUUU
U	US).UD6$ )+Nr   r         ?version)   r   r   )r:   r:   r   r:      )    r   r	   r   )@   r   )   r   )   r   r      r   r   )   r   r   )r   r   )   r   r   )   r   r   r   )	r:   r   r   r   r   r   r   r   i    r   r   r   )r:   r   r   )r:   r   zTILE_M (z) must be <= Ms (r   zTILE_N (z) must be <= Ns (zMs (z) must be <= M (zNs (z) must be <= N (zKs (z) must be <= K ()TILE_MTILE_N
GROUP_SIZE
num_stages	num_warpsr   r5   )
r   r   get_device_namer   float16updategetrD   rA   dict)r   r   r   r   r   r   r   r   r   r  r   extradevice_namer   r   s                  r   scatter_mm_metar	    s    	JCvMjj0021"s+	
 KK % K !9
"x8#

	X%

	X%

	Z'

	AY*$x8#

	X%

	X%

	Z'

	AY+%x8#

	X%

	X%

	Z'

	Z'

	AY+%x8#

	X%

	X%

	Z'

	Z'

	AY+%x8#

	X%

	X%

	 

 #a* 	 9dG	
B~28RR0~28RR0qJq9tA1-11"a8IY$A1-11"a8IY#A**2q1IA**2q1IqJ{xx/@AFGG{xx/@AFGG	vtB4'7s!<==	vtB4'7s!<==	vtB4'7s!<==   r   c                    Uc  [         R                  nUc  UnUc  SnXyX1S 1:X  GaP  [         R                  R                  5       nXX#XES:H  US:H  US:H  4nXL a  UnOX4n[	        SUUUUU4S9nUc  US:w  a  [	        SUUUUS4S9nUc  XLa  [	        SUUXS4S9nUc  [	        S/ US S QSPUSS  Q7UUUS4S9nUc  XLa  [	        S/ US S QSPUSS  Q7UXS4S9n[        U=(       d    0 5       H=  nUU   nUS   nUS	   nUU-  nUU-  S:X  d  M"  UU::  d  M*  [        U5      nUU-  US	'   M?     Ub  UR                  " S0 UD6  U$ [        S
U < SU< SU< SU< SU< SU< SU< SU< SU< 35        U=(       d    [        X#-  S5      nU=(       d    SnU
=(       d    Sn
U	=(       d    Sn	[        SUUU
U	S.UD6$ )Nr   r   r   bsr_dense_addmmr   r	   *rf   r   z@bsr_dense_addmm uses non-optimal triton kernel parameters for M=z K=z N=z Ms=z, Ks=z beta=z alpha=z dtype=z out_dtype=r   )r   GROUP_SIZE_ROWr   r  r5   )
r   r  r   r  r   sortedr  r  r   r   )r   r   r   r   r   betaalphar   r  r  r   sparsityr+   	out_dtype_versionr  r  keyversion_dtyper   matching_metamkeymeta_r   split_ncs                             r   bsr_dense_addmm_metar    s   * }	J7D6Ajj002QB	419eqjA!M!,M}h7	
 <HO!!=#6	D <E2!3hs=SD <$!)#bq')3)QR)!=#6	M $)? (%-c"1g-s-SW-%c2	! }23%d+G	*Lq5A:!q&;D&'1fDO 4 KK % K
 t4QD!Ure6bU'D7(E88UHLi\[
 (QWaG#(qNqJQI %	
  r   c                   :    \ rS rSrSrS rS rS r\S 5       r	Sr
g)	TensorAsKeyi[  a
  A light-weight wrapper of a tensor that enables storing tensors as
keys with efficient memory reference based comparison as an
approximation to data equality based keys.

Motivation: the hash value of a torch tensor is tensor instance
based that does not use data equality and makes the usage of
tensors as keys less useful. For instance, the result of
``len({a.crow_indices(), a.crow_indices()})`` is `2`, although,
the tensor results from `crow_indices` method call are equal, in
fact, these share the same data storage.
On the other hand, for efficient caching of tensors we want to
avoid calling torch.equal that compares tensors item-wise.

TensorAsKey offers a compromise in that it guarantees key equality
of tensors that references data in the same storage in the same
manner and without accessing underlying data. However, this
approach does not always guarantee correctness. For instance, for
a complex tensor ``x``, we have ``TensorAsKey(x) ==
TensorAsKey(x.conj())`` while ``torch.equal(x, x.conj())`` would
return False.
c                 t   S n[         R                  " U5      U l        UR                  [        R
                  L a  U" U5      U l        OUR                  [        R                  [        R                  1;   a2  U" UR                  5       5      U" UR                  5       5      4U l        OuUR                  [        R                  [        R                  1;   a2  U" UR                  5       5      U" UR                  5       5      4U l        O[        UR                  5      e[!        U R                  5      U l        g )Nc                 (   U R                   R                  (       d  U R                   R                  (       a  [        SU R                    35      eU R	                  5       U R                  5       U R                  U R                  5       U R                   4$ )Nz>TensorAsKey does not support floating point or complex dtype: )r+   is_floating_point
is_complexrA   data_ptrstorage_offsetr"   rE   )objs    r   get_tensor_key,TensorAsKey.__init__.<locals>.get_tensor_keys  st     yy**cii.B.B$TUXU^U^T_`  ""$		

		 r   )weakrefref_obj_refr   r   stridedr  
sparse_csrr   r   r   
sparse_csc
sparse_bscccol_indicesrow_indicesr   hash_hash)selfr$  r%  s      r   __init__TensorAsKey.__init__r  s    	,  C(::&%c*DHZZE,,e.>.>??s//12s01DH ZZE,,e.>.>??s//12s01DH
 &cjj11$((^
r   c                     U R                   $ r   )r1  r2  s    r   __hash__TensorAsKey.__hash__  s    zzr   c                     [        U[        5      (       d  gU R                  b  UR                  c  XL $ U R                  UR                  :H  $ )NF)
isinstancer  r$  r  )r2  others     r   __eq__TensorAsKey.__eq__  sB    %--88uyy0 = xx599$$r   c                 "    U R                  5       $ )z'Return object if alive, otherwise None.)r)  r6  s    r   r$  TensorAsKey.obj  s     }}r   )r1  r)  r  N)__name__
__module____qualname____firstlineno____doc__r3  r7  r<  propertyr$  __static_attributes__r5   r   r   r  r  [  s+    ,&$P%  r   r  )maxsizec	           
      H   UR                   n	U	c  [        S5      eU	R                  5       U	R                  5       pU
R                  n[
        R                  nU S:X  Ga%  X7-  n/ n[
        R                  " X}US9U-  n[        X-  5       Hq  nU
U   R                  5       nU
US-      R                  5       nUU:X  a  M4  UR                  UUU XS-  -  R                  U5      UR                  UU-
  5      -   5        Ms     [
        R                  " U5      nU
R                  5       nUR                  5       nUXC-  -  nUU-   R!                  S5      nU
nUU   R                  U5      nUR#                  SSS9u  nnUU   nU UUU4$ U S:X  Ga  X7-  n/ n/ n[
        R                  " X}US9U-  n[        X-  5       H  nU
U   R                  5       nU
US-      R                  5       nUU:X  a  M4  UR                  [
        R                  " UUXS9R                  U5      5        UR                  UUU XS-  -  R                  U5      UR                  UU-
  5      -   5        M     [
        R                  " U5      nU
R                  5       nUR                  5       nUXC-  -  nUU-   R!                  S5      n[
        R                  " U
S S [
        R$                  " UU   R                  U5      S	5      45      n[
        R                  " U5      nU UUUU4$ U S
:X  a  UnS	/n/ n[        U5       H  n[        X-  5       H  nU
U   R                  5       nU
US-      R                  5       n[        X>-  5       Hm  nUR                  US   U-   U-
  5        [        UU-
  5       H>  n UU -   n!UU!   R                  5       UX%-  -  -   X>-  -  U-   n"UR                  U!U"/5        M@     Mo     M     M     U [
        R&                  " UXS9[
        R&                  " UXS94$ [)        SU < S35      e)Nz+compressed_sparse_tensor_as_key.obj is Noner   r   r   rv   T)
descendingstabler   r   r   zInvalid indices_format=z>. Expected bsr_strided_mm_compressed|bsr_strided_mm|scatter_mm)r$  rA   r   r   r   r   int32arangerh   r   appendrepeatrepeat_interleavecatdiffnonzeror   sortcumsumr   r   )#r   r   r   r   r   r   nbatchesr   compressed_sparse_tensor_as_keyr   r   r   r   indices_dtyper   q_offsets_lstr;   r   r   r   r   crow_indices_diffnon_zero_row_indicesar   r   nnz_per_rowindicesp_offsets_lstr   
pq_offsetsr   r   r   r   s#                                      r   _bsr_scatter_mm_indices_datar`    sK    *
-
-C
{JKK # 0 0 2COO4E+  FKKM44\LLfEJqwAa%%'Ba!e$))+BRx  R#rv.66w?%%b2g./   IIm,	(--/088: BF+ULL$	 	'(<=OOPWX*//4/MWg&		9i@@	+	+\LLfEJqwAa%%'Ba!e$))+BRx  R=HOOPWX   R#rv.66w?%%b2g./   IIm,	(--/088: BF+ULL$	IIRa %&:;MMgV
	 IIm,		9iKK	<	'C	
xA17^!!_))+!!a%(--/qwA$$Yr]R%7"%<="27^F(^002Q!']BqwORSS"))1a&1 , ( $ ! LL-GLL=H
 	
 &~''ef
 	
r   c                    U R                  5       S:w  a  [        SU R                  5        35      eU R                  S:w  a  [        SU R                   S35      eU R                  5       R                  SS nU R                  u  pVUu  pxUR                  SS u  pX:w  a  [        SU	 S	U S
35      eUR                  SS R                  5       n[        XVXU40 UD6nSU;  a:  UR                  U R                  [        R                  [        R                  1;   S9  US   n[        X%XjXxX[        U 5      5	      nUS:X  a  UR                  SS9  X4-   $ US:X  a  UR                  SS9  X4-   $ U$ )zcComputes indices data for :func:`scatter_mm` used in BSR and
strided tensor matrix multiplication.
r   zbsr.dense_dim() must be 0, got r	   z$bsr must be 2D (no batch dims), got r   r    Nz	other K (z) != bsr K (r   
allow_tf32rb  r   r   T)is_compressedr   F)	dense_dimrA   r   r   r"   numelr	  r  r+   r   r  r-   r`  r  )r   r;  r   
meta_inputr=   r   r   r   r   K_r   rU  r   r   r   s                  r   bsr_scatter_mm_indices_datari    sr    }}!>s}}>OPQQ
xx1}CCHH:QOPP

""23'I99DAFBKKEB	wyL1=>>{{3B%%'H19j9D:%syyU]]ENN,KKL9oG/1K<LL 44$'g%%	+	+%(g%%r   c           
      n   U R                   S:w  a  [        SU R                    S35      eUR                   S:  a  [        SUR                    S35      eU R                  S   U R                  S   UR                  S   penU R                  5       R                  SS nUc
  [	        XSS	9nUS
   nUc>  [
        R                  " / UR                  SS QUPUP7U R                  U R                  S9nUR                  n	[        U5      nU R                  5       S
:X  a  UR                  5         GOzUS;   a+  UR                  5         [        U R                  5       XUS9  GOIUS:X  Ga7  UR                  SS R                  5       n
[
        R                  " X-  US
   -  U-  US
   -  US
   US
   4U R                  U R                  S9n[        U5      R                  SS5      R!                  U
XgS
   -  US
   XWS   -  US   5      R#                  SS5      R%                  S
S5      n[        U R                  5       XUS9  UR'                  UR)                  S
XUS
   -  XgS
   -  45      R#                  SS5      R+                  XU5      R                  SS5      5        O[-        U5      eUR!                  U	5      $ )zBSR @ strided -> stridedr	   zbsr must be 2D, got r   z+other must have at least 2 dimensions, got r    rv   Nr   )r   r   r   >   r   r   r   r   r   )rf   r   r   r	   )r   r	   rf   r   )r   rA   r"   r   ri  r   r   r+   r   r   _nnzr   r   rf  r   r   r   movedimr   copy_	unflattenreshaper   )r   r;  r   outr   r   r   r=   r   	out_shaperU  r   r   s                r   bsr_scatter_mmrr  2  s    xx1}3CHH:Q?@@zzA~9%**QG
 	
 2		"u{{2BB

""23'I2'B
 "!_N
{kk'ekk#2''B'syy
 		I
C.C
xxzQ			J	J		3::<3G	<	';;s#))+{{1-2ilB!!
 ))::
 eYr2Tl"!l"! Wl WQ] 	 	3::<LQ		""HIaL0"!2DE Wl WX2&Yr2		
 ".1188Ir   Fr  r  
left_alpharight_alpharp  skip_checksmax_gridr   inputr   densert  ru  rp  rv  rw  r   c                j   Uc  UR                   [        R                  L a  SnUR                  5       nUR	                  5       S-
  nUR
                  U   nUR
                  S   n[        XU5      n[        R                  " UX4-   [        R                  UR                  S9n[        U UUUUUUUUU	U
S9$ )N_int_bsr_dense_addmmr   rv   r   rs  )r+   r   int8r   r!   r"   rT   r   rK  r   r  )rx  r   ry  r  r  rt  ru  rp  rv  rw  r   r   r   
batch_ndimr   r   original_batch_dims_broadcasteds                    r   r{  r{  }  s     {u{{ejj0''')!%%'!+
IIj!KKO*>vE*R'kk+qf4++<<

  r   c                	  ^^^
^ ^!^"^#^$^% SnUR                  5       nUR                  5       nUR                  5       nUR                  5       S-
  nUR                  XS-    u  nnUR                  US-   US-    nUR                  S   n[        XU5      nUc  UR                  UUU4-   5      nUR                  5       S:X  d  TS:X  d  US:X  d  US:X  d  US:X  aB  TS:X  a  UR                  5         U$ UR                  U 5        TS:w  a  UR                  T5        U$ Sm$Sm%Uc(  S	m$UR                  S
5      R                  " / UQUPUP76 nO*UR                  " / UQUPSP76 R                  " / UQUPUP76 nUc(  S	m%UR                  S
5      R                  " / UQUPUP76 nO*UR                  " / UQSPUP76 R                  " / UQUPUP76 nUR                  5       S   S:w  a  [        SUR                  5       S    35      eUR                  5       S   S:w  a  [        SUR                  5       S    35      eT
c[  [        SUR                  5       US   -  US   -  UU-  -  -
  S5      n[!        UUUUS   US   TTUUR"                  UR"                  S9
m
Un[%        XX%Xg5      u  nnnn nnnnUu  m!m T
R'                  SUT!-  5      nUU-  m"Un[)        UT!T"45      n[)        UT T"45      n[)        U T!T"45      n [)        UT!T"45      n[)        UT!T"45      n[*        R,                  [.        R0                  [*        R2                  [.        R0                  [*        R0                  [.        R4                  [*        R4                  [.        R4                  [*        R6                  [.        R8                  [*        R8                  [.        R8                  0UR"                     m#UR;                  S5      nUR;                  S5      S-
  nUR;                  S5      nUUU4nU	b*  [=        U	SS SSS2   5      SS[?        U	SS 5      -
  -  -   nOSnUSUSUSU SUSUSUSUS0nTS:X  a  [        S5      eU U!U"UUU#U$U
U%4	S jn[A        UUUU5        URC                  5       URC                  5       :w  a*  UR                  UR                  UR                  5      5        U$ )zCompute

  out = beta * input + left_alpha.reshape(-1, 1) * (alpha * (bsr @ dense)) * right_alpha.reshape(1, -1)

where left_alpha, right_alpha are (* + 1)-D tensors when
specified, otherwise, these are treated as tensors filled with
ones.
r  r   r	   rf   rv   Nr   FTr5   z'left_alpha.stride()[-1] must be 0, got r    z(right_alpha.stride()[-2] must be 0, got )r  r+   r  r   r   r   r   NNr   Nrv   )r   r   )r   r   Nzalpha must not be 0c                    >	 [         U    " / [        U6 QTPTP7TS:H  TS:g  TS:H  TT
TTTT[        R                  :H  TS.
T	D6  g )Nr   r   )
beta_is_onebeta_is_nonzeroalpha_is_oneleft_alpha_is_oneright_alpha_is_oneBLOCKSIZE_ROWBLOCKSIZE_INNERBLOCKSIZE_COLrb  	acc_dtype)_bsr_strided_addmm_kernelrd   tlfloat32)r{   r   BKBMBNr  r  dot_out_dtyper  r   r  s     r   r   bsr_dense_addmm.<locals>.kernel:  su    !$' 	
!>2	
 	
 		
 	 AI!/1$

2#1	
2 3	
r   )"r   r   r   r!   r"   rT   	new_emptyrk  r   rm  mul_expandr   rE   rA   roundr  r+   r   r  r   r   r  r  r  r-   float64r|  rK  r   r/   r@   r   r"  )&rx  r   ry  r  r  rt  ru  rp  rv  rw  r   r   r   r   r   r}  r   r   r=   r   r~  r  
out_backupr   out_untiled	n_batchesn_block_rowsn_block_colsrk   rl   rs   r   r  r  r  r  r  r  s&      ``     `                     @@@@@@r   r  r    s2   , FZZ\F##%L//#K!!#a'J99Zq.1DAqZ!^j1n=IBA ';6&N#
{oo=AFG
xxzQ%1*Q!q&AF19IIK
 
 IIeqy
 __R(// 
,
./
12

  __L&ELqL!LSS 
,
./
12

 !oob)00 
,
./
12
 "&&N(GNNANUU 
,
./
12
 2!#5j6G6G6I"6M5NO
 	
 B1$6{7I7I7KB7O6PQ
 	
 |SXXZ)A,61EQOOQRS#aLaL++ii
 J 	s5kG	 FBhhy!r'*G	
gBK
C"b
*Ceb"X.Eeb"X.E":Bx8J#K"b:K 	rzz

rzzrzz

BHHRXX 
iiM 

1I$$R(1,L::b>LL,7IHRaL2./'QXbq\AR=R2SS 	m_{}K[[	O z233
 
< &/9kB
||~,,.. 	))**:*:;<r   IS_BETA_ZEROr  r  TILE_Kr  rb  c            
         [         R                  " SS9n [         R                  " SS9n!UUU -  -   UU!-  -   n"[         R                  " U"5      n#[         R                  " U"U-   5      n$U$U#-
  n%U%S:X  a  g [         R                  " SU5      n&[         R                  " SU5      n'UUU -  -   U	U#-  -   U
U&S S 2S 4   -  -   UU'S S S 24   -  -   n(UUU -  -   UU#-  -   n)UUU -  -   UU!-  -   UU&S S 2S 4   -  -   n*UUU -  -   UU'S S S 24   -  -   n+[         R                  " SU5      n,[	        U%5       GH1  n-[         R
                  " X44US9n.[         R                  " U)5      n/[	        SXV5       H  n0U0U,-   n1U1U:  n2[         R                  " U*UU1S S S 24   -  -   U2S S S 24   SS9n3[         R                  " U+UU/-  -   UU1S S 2S 4   -  -   U2S S 2S 4   SS9n4U.[         R                  " U3U4UUS9-  n.M     U(       a  U.U -  n.OU U.-  U[         R                  " U(5      -  -   n.[         R                  " U(U.R                  UR                  R                  5      5        U(U	-  n(U)U-  n)GM4     g )Nr   axisr   r+           maskr;  rb  r  )r  
program_idloadrL  rh   r   dotstoretor+   
element_ty)5r  r  r  r  r  kr  
values_ptrvalues_batch_stridevalues_nnz_stridevalues_row_block_stridevalues_col_block_stridecrow_indices_ptrcrow_indices_batch_stridecrow_indices_stridecol_indices_ptrcol_indices_batch_stridecol_indices_stridemat1_ptrmat1_batch_stridemat1_tiled_row_stridemat1_tiled_col_stridemat1_row_block_stridemat1_col_block_stridemat2_ptrmat2_batch_stridemat2_tiled_row_stridemat2_tiled_col_stridemat2_row_block_stridemat2_col_block_strider  rb  	batch_pidrow_block_pidcrow_indices_offset_ptr
nnz_offsetnnz_offset_nextrow_nnzrow_block_arangecol_block_arangevalues_block_ptrscol_index_nnz_ptrmat1_block_ptrsmat2_block_ptrsk_tile_arange_	acc_block	col_blockk_tile	k_offsetsmask_k
mat1_block
mat2_blocks5                                                        r   _sampled_addmm_kernelr  f  s   F MMq)	1- ')34!M12 	 
 WW45
''"9<O"OP "J.a<99Q699Q6 !I-.*,- &(8D(AAB &(8q(AA	B 	 &23 :-. 	 )+,#m34 $&6q$w&??@ 	 )+,#&6tQw&??@ 	 		!V,wA-!?yQI  12I1-"]2	"QWW#&;ia>P&PPa	
  WW#+i78+i4.@@A  4
 RVV
zY 	) .0 U"	!I-rww?P7Q0QQ	 HH&	Z5E5E5P5P(QR !22!33Q  r   r  c           	         [         R                  " SS9n[         R                  " SS9n[         R                  " SS9n[         R                  " SS9n[         R                  " SS9n [         R                  " UUUU U5      u  nnUUU-  -   UU-  -   n![         R                  " U!5      n"[         R                  " U!U-   5      n#U#U"-
  n$U$S:X  a  g [         R
                  " SU5      n%[         R
                  " SU5      n&U UU-  -   UU"-  -   UU%S S 2S 4   -  -   UU&S S S 24   -  -   n'UUU-  -   UU-  -   UU&S S 2S 4   -  -   UU%S S S 24   -  -   n(UUU-  -   UU-  -   UU-  -   UU%S S 2S 4   -  -   UU%S S S 24   -  -   n)UU	U-  -   U
U"-  -   n*[         R                  " UU4US9n+[        U$5       Ho  n,[         R                  " U'5      n-[         R                  " U*5      n.[         R                  " U(UU.-  -   5      n/U+[         R                  " U-U/UUS9-  n+U'U-  n'U*U
-  n*Mq     [         R                  " U)U+R                  UR                  R                  5      5        g )Nr	   r  r   r   r  r  )r  r  num_programs	swizzle2dr  rL  r   rh   r  r  r  r+   r  )0r  r  r  r  r  r  r  r  r  r  r  	dense_ptrdense_batch_stridedense_tiled_row_stridedense_tiled_col_stridedense_row_block_stridedense_col_block_stride
output_ptroutput_batch_strideoutput_tiled_row_strideoutput_tiled_col_strideoutput_row_block_strideoutput_col_block_strider  r  r  rb  r  r  r  col_block_pidr  r  r  r  r  r  r  r  r  dense_block_ptrsoutput_ptrsr  output_acc_blockr  values_blockdense_row_idxdense_blocks0                                                   r   "_bsr_strided_dense_rowspace_kernelr    s   \ MMq)	1-1-A.A.')||=,n(
$}
 ')34!M12 	 
 WW45
''"9<O"OP "J.a<99Q699Q6 !I-.*,- &(8D(AAB &(8q(AA	B 	  9,-$}45 %'74'@@A %'7a'@@	A 	 !I-.%56 &56 &(8D(AA	B
 &(8q(AAB 	 &23 :-. 	 88]M$B)TwA77#45L GG$56M'' #9M#IIK
 kjI! 
 !22!33#  ( 	.11*2B2B2M2MNOr   c           
        ^ ^^^^^^^ UR                  S5      nUR                  S5      S-
  nX4nUb*  [        US S S S S2   5      SS[        US S 5      -
  -  -   nOS nUSUSUSU	SU
S0nUR                  [        R
                  [        R                  4;   a  [        R                  mS	mO[        R                  mS
mUUU UUUUU4S jn[        UUX5        g )Nr   rv   r   r	   r   )r   N)r   rv   )r   r  TFc                 N   > [         U    " TTT/TQTPT	P[        U6 Q7TTSSS.6  g )Nr   r   )r  rb  r   r  )r  rd   )
r{   r   r  rb  r  r  r=   is_beta_zeror  tile_ks
     r   r   )_run_sampled_addmm_kernel.<locals>.kernel  sY    !$' 	   &~6 $%!r   )r   r/   r@   r+   r   r,   r-   r  r  r  r   )r  r  r  r=   r  r  r   r   r   mat1mat2rw  r  r  rk   rl   rs   r   r  rb  s   ``````            @@r   _run_sampled_addmm_kernelr  j  s     KKN	#((,q0-	!TrT 23gSRTSTEVAV6WWKKI'')
 <<EJJ77

IJ

IJ	 	( 	foyFr   g      ?)r  r  rp  rv  rw  r  r  c                n   Sn[        X5        [        XX5      n	U(       Gd  [        XU R                  5        [        XU R                  5        US:w  a/  U R                  [
        R                  L a  [        SU SU S35        U R                  [
        R                  La-  [        XU R                  5        [        XU R                  5        O[        XUR                  5        [        XU5        Ub  [        X5        [        XUR                  5        [        XU R                  5        [        UR                  U	R                  :H  =(       a!    UR                  5       U R                  5       :H  U SU	R                   SU	R                  5        SUR                   S	UR                  5        3	5        Uc  U	R                  UR                  S
S9nOUR                  U	5        UR                  5       S:X  d  UR                  5       S:X  a  U$ UR                  5       R                  SS  n
UR!                  S5      nUS:X  d  US:X  a!  UR                  5       R#                  U5        U$ Un[%        XQU5      u  ppn['        XS   U45      n['        X+U
S   45      n[)        U
6 n[+        UUUS:H  U
UUUUUUUU5        UR                  5       R-                  5       SS  UR-                  5       SS  :w  aF  UR                  5       R                  UR/                  UR                  5       R                  5      5        U$ )Nsampled_addmmr  Fz(): having beta == z3 not equal to 0.0 with boolean mask is not allowed.z!(): Expects `out` to be of shape z and with nnz equal to z but got out.shape = z and out.nnz = T)copyr   r    rv   r   r   )r   r   r   r   r+   r   boolr   r1   r)   r"   rk  r  rm  rf  r   r   r  r   r   r   r  rE   ro  )rx  r  r  r  r  rp  rv  rw  r   input_broadcastedr=   r  r  r   r   r   r  s                    r   r  r    s    !'4VDOu||4u||4s{u{{ejj8h1$7jk {{%**,F%++6F%++6F$**5&vT: -V$++6F5II!2!8!88WSXXZ5::<=Wh?@Q@W@W?X Y-->-C-C-E,F G++.99+_SXXZLR ;#&&tzz&=CII'(99;!sxxzQJJJL&&rs+	IIbM C<16JJLd#J 
8FsRV8W56 !a'89 9Q<'89i!CK	
$ %%',0DD%%fnnZ5F5F5H5N5N&OPr   )rp  rv  rw  r   c          	      N   SnU R                   SS  u  pxU(       d  [        X`5        [        X`UR                  5        [	        X`UR
                  [        R                  45        [        X`U5        UR                  S5      n	U R                  5       R                   SS  u  p[        XjU45        [        U	S-  (       + U SU	 S35        OUR                   SS  u  p[        X`U5      nUbx  U(       dq  XU	4-   n[        UR                   U:H  SU SUR                    S	35        [        UR                  5       =(       d     UR                  SS5      R                  5       S
5        Uc  UR!                  XU	4-   5      nU R#                  5       S:X  a  UR%                  5       $ ['        X USSUS9$ )Nbsr_dense_mmr    rv   r:   z(): dense.size(-1) == z should be divisible by 16z9bsr_dense_mm(): `out` argument has wrong shape, expected z
, but got r   zbsr_dense_mm(): only row-major/col-major `out` arguments are supported, i.e. (out.is_contiguous() or out.transpose(-2, -1).is_contiguous()) should be True.r   r   )r  r  rp  )r"   r   r   r   r1   r+   r   r|  r)   r   r   rB   r   rT   is_contiguousr   r  rk  r   r  )r   ry  rp  rv  rw  r   r   r   _klr   	row_blockr  _krr~  expected_out_shapes                  r   r  r    s     23V)ell3U[[5::-@&vE:

2A#&::<#5#5bc#: IF	$:;F
(03MN
 [[%FC*>vE*R'?;!@q6!I		//./z#))AG
 !!#Ls}}R'<'J'J'L" ;//"AF"JKC 88:?99; saaSIIr   MAX_ROW_NNZTILEc                    [         R                  " SS9n[         R                  " SS9n[         R                  " SS9nU X-  -   X--  -   n[         R                  " U5      n[         R                  " X-   5      nUU-
  nUS:X  a  g [         R                  " SU
5      nUUU-  :  nUXK-  -   X\-  -   X-  -   n[         R                  " UU-   U[	        S5      * S9R                  [         R                  5      n[         R                  " USS9n[        XU
5       H  nUU
-  nUUU-  :  n[         R                  " UU-   U[	        S5      * S9R                  [         R                  5      n[         R                  " USS9n[         R                  " UU:  UU5      nM     [         R                  " UU-
  5      n[         R                  " USS9n[        XU
5       H  nUU
-  nUUU-  :  n[         R                  " UU-   U[	        S5      * S9R                  [         R                  5      n[         R                  " UU-
  5      nU[         R                  " USS9-  nM     [         R                  " UU-   UU-  R                  UR                  R                  5      US9  [        XU
5       H  nUU
-  nUUU-  :  n[         R                  " UU-   U[	        S5      * S9R                  [         R                  5      n[         R                  " UU-
  5      n[         R                  " UU-   UU-  R                  UR                  R                  5      US9  M     g )Nr	   r  r   r   infr  )r  )r  r  r  rL  r.   r  r  r   rh   whereexpsumr  r+   r  )r  r  r  r  r  r  values_nnz_col_block_strider  r  r  r	  r  row_block_offset_pidr  r  r  r  r  
row_aranger  curr_row_values_ptrsrow_tilemax_row_valuer  curr_max_row_valuenumdenoms                              r   _bsr_softmax_kernelr  2  s'    MMq)	!}}!41- '34!12 	 
 WW45
''"9"OP "J.a<YYq$'
Gi// !-.%<= $% 	 77 :-Du

"RZZ. 	 xa0t$/A$J) 33Dww$z1U5\Mbn  "$q!9HH 22MCUM 0 ffX-.s#t$/A$J) 33Dww$z1U5\Mbn  &&M12CRVVCa((E 0 	 :-5[Z--889	

 t$/A$J) 33Dww$z1U5\Mbn  &&M12CHH$z1u  !1!1!<!<= 0r   c                 2  ^^^ Sn[        X 5        [        X U R                  5        U R                  5       S:X  d  U R	                  5       S:X  a  U R                  5       $ U R                  SS  u  p4U R                  5       nU R                  5       R                  SS  u  mmTc  [        R                  " U5      mO[        R                  " T5      mU R                  5       R                  S5      R                  SS5      nU R                  5       R                  SS5      R                  5       (       a  U R                  5       R                  5       nOU R                  5       nUR                  SS5      R                  5       R                  S5      R                  SS5      R!                  STUT-  5      nUR                  S   TUT-  4nS n	USS S24   SUS	0n
UUU4S
 jn[#        XX5        UR!                  STUT5      R                  SS5      R                   " U R                  5       R                  6 n[$        R&                  " U R                  5       R                  5       U R)                  5       R                  5       UU R                  U R*                  S9$ )Nbsr_softmaxr   r    r   r  rv   .r  r  c                 T   > [         U    " / [        U6 QTPTPTP[        ST5      P76   g )Ni   )r  rd   rD   )r{   r   r  max_row_nnzr  s     r   r   bsr_softmax.<locals>.kernel  sL    % %~6  	
  E;'r   r   )r   r1   r+   rk  rf  cloner"   r   tritonnext_power_of_2r   r   r   r   r  rF   ro  r   r   r   r   r   )rx  r  r   r   r   nnzr   r   rk   rl   rs   r   r  r  s    `          @@r   r  r    s3   'F5;;/::<1 2;;= {{23jjl$||~33BC8	9 003K 00=K))+55a8@@BG <<>##B+99;;\\^))+F\\^FR$Z\Yq\WQ^WRC)O4 	 \\!_ii@	 crc"MO	
		 	fyF NN2y#y9Yr2Wlln**, 	 -- &&(%%'<<
 	
r   queryr  value	attn_mask	dropout_p	is_causalscalec           	         Sn[        U(       + U S35        [        US LU S35        Uc  [        S5      e[        UR                  [        R                  :H  U S[        R                   SUR                   S35        [        XqU R                  5        [        XrU R                  5        [        XsU R                  5        [        XqU R                  5        [        XrU R                  5        UR                  [        R                  La  [        XsU R                  5        [        X0UR                  SS	5      S
SS9nUc  U R                  S	5      S:X  d  US
:X  a  [        SU SU S35        Uc(  S[        R                  " U R                  S	5      5      -  OUn	UR                  5       R!                  U	5        [#        U5      n[        R$                  R&                  R)                  UR                  5       USS9  [+        X5      nU$ )N_scaled_dot_product_attentionz'(): is_causal == True is not supported.z'(): attn_mask == None is not supported.zattn_mask must not be Nonez(): attn_mask.layout must be z, but got attn_mask.layout == r   r    rv   r  F)r  rv  r   z(): current value of scale == z results in division by zero.r   T)r   inplace)r   rA   r   r   r   r   r   r1   r+   r  r  r   r   mathsqrtr   r  r  nn
functionaldropoutr  )
r"  r  r#  r$  r%  r&  r'  r   sdpascale_factors
             r   r)  r)    s    1)mx'NOPit#x/V%WX !=>> 0 00h ((-(8(8'9 :##,#3#3"4A7	
 	V%,,/VELL1V5F-F5;;/??%**,5;;7 cmmB3#5
 =UZZ^q0ESL(8 @/ /
 9>q499UZZ^445<(4 ##DKKMY#MD(r   r   r   r   r  r   r   c           	      .   U U-  n[         R                  " SS9n[         R                  " SS9nUU-  nUU-  nUU-  [         R                  " SU5      -   nUU-  [         R                  " SU5      -   n[         R                  " SU5      nUUS S 2S 4   U-  US S S 24   U-  -   -   n UUS S 2S 4   U	-  US S S 24   U
-  -   -   n![         R                  " UUU-  -   5      n"[         R                  " UUS-   U-  -   5      n#U"U#:X  a  g [         R                  " UU4US9n$[        U"U#5       H  n%[         R                  " UU%U-  -   5      n&[         R                  " UU%U-  -   U-   5      n'[         R                  " U U&U-  -   5      n([         R                  " U!U'U-  -   5      n)U$[         R                  " U(U)UUS9-  n$M     UUU-  -   US S 2S 4   U-  US S S 24   U-  -   -   n*[         R                  " U*U$R                  UR                  R                  5      5        g Nr   r  r   r  )r  rb  )r  r  rL  r  r   rh   r  r  r  r+   r  )+r   r   r   
blocks_ptrblocks_stride_Pblocks_stride_Mblocks_stride_K
others_ptrothers_stride_Qothers_stride_Kothers_stride_Naccumulators_ptraccumulators_stride_Raccumulators_stride_Maccumulators_stride_Npq_offsets_ptrpq_offsets_stridepq_ptrpq_stride_Tpq_stride_1r  r   r   rb  r   pid_tpidpid_mpid_nrmrnrkA_ptrB_ptrr   r   r  r   r   r   Ar   C_ptrs+                                              r   _scatter_mm2_kernelrP    s%   6 &[1%mm#r	bV^bii622V^bii622YYq!_q$wK/)BtQwK/,II
 q$wK/)BtQwK/,II
 WW^e.?&??@WW^uqy4E&EEF8HHff-]C	r2A[01A[0;>?AO 334AO 334A1*UUI  ++, 1d733T1W+ 556 	 		%5%;%;%F%FGHr   r   r   r_  
pq_indicesr   c                   ^^^ U R                   u  nmnUR                   u  pxm[        [        STS-  5      [        STS-  5      SSS9n	UUU4S jn
[        R                  [
        R                  [        R                  [
        R                  [        R                  [
        R                  [        R                  [
        R                  0UR                     nSU	;  a   U	R                  U[
        R                  :H  S9  [        U
   " TUTU U R                  S	5      U R                  S5      U R                  S5      UUR                  S	5      UR                  S5      UR                  S5      UUR                  S	5      UR                  S5      UR                  S5      TTR                  S	5      UUR                  S	5      UR                  S5      4S
U0U	D6  g )Nr:   r   r   r	   )r   r   r   r  c                    > TR                   S   S-
  [        R                  " TU S   5      [        R                  " TU S   5      -  S4$ )Nr   r   r   r   r"   r  cdiv)METAr   r   r_  s    r   r{   _scatter_mm2.<locals>.gridZ  sI      #a'AtH~.QX1OO r   rb  rc  r   r  )r"   r  r   r   r  r  r  r-   r  r+   r  rP  rE   )r   r   r_  rQ  r   r   r   r   r  r   r{   r  r   r   s     `         @@r   r   r   L  s    <<Aq<<qr16?3r16?qTU
	 MM2::NNBJJMM2::MM2::	

 

 t#KK=BJJ#>K?D!MM!MM!MM!MM!MM!MM!"""a a a /	
2 (3	
6 7	
r   r   rd  r   r   c           	         UU-  nUU-  nUU-  n[         R                  " SS9n[         R                  " SS9nUU -  n UU -  n!UU-  n"UU"-  n#U#U-  n$[        UU$-
  U5      n%U$UU%-  -   n&UU"-  U%-  n'U&U-  [         R                  " SU5      -   n(U'U-  [         R                  " SU5      -   n)[         R                  " SU5      n*UU(S S 2S 4   U-  U*S S S 24   U-  -   -   n+UU U	-  -   U*S S 2S 4   U
-  U)S S S 24   U-  -   -   n,[         R                  " UU!-   5      n-U(       aY  U-U-  U-  n.U-U-  U-  n/[         R                  " UU.-   5      n0[         R                  " UU.-   S-   5      n1U/U1-  UU/-
  U0-  -   n2U1U0-
  n3O:[         R                  " UU!-   5      n2[         R                  " UU!-   S-   5      n4U4U2-
  n3UU2-   n5[         R
                  " UU4US9n6U(       a  U+W0U-  -  n+[        U35       Hl  n7[         R                  " U55      n8[         R                  " U,U8-   5      n9[         R                  " U+5      n:U6[         R                  " U:U9UUS9-  n6U+U-  n+U5S-  n5Mn     OUU2-   n;[        U35       H  n7[         R                  " U55      n8[         R                  " U,U8-   5      n9[         R                  " U;5      n<[         R                  " U+U<U-  -   5      n:U;S-  n;U5S-  n5U6[         R                  " U:U9UUS9-  n6M     UU--   U U-  -   U(S S 2S 4   U-  U)S S S 24   U-  -   -   n=[         R                  " U=U6R                  UR                  R                  5      5        g r3  )r  r  rD   rL  r  r   rh   r  r  r  r+   r  )>rU  r   r   r   r4  r5  r6  r7  r8  others_stride_Br:  r;  r<  accumulators_stride_Br>  r?  c_indices_ptrr_offsets_ptrp_offsets_ptrq_offsets_ptrrd  r  r   r   r   r   rb  r   BLOCKS_MBLOCKS_Npid_t_rF  pid_brE  num_pid_in_groupgroup_idfirst_pid_mgroup_size_mrG  rH  rI  rJ  rK  rL  rM  r   r   r   r   r   r   r!  r   q_ptrr  r  r   r   rN  p_ptrr   rO  s>                                                                 r   _scatter_mm6_kernelri    s   < '\<<A&mm#!("%0**+8k1:>s\12''L8V^bii622V^bii622YYq"q$wK/)BtQwK/,II
 o%&!T'{_,r$'{_/LLN 	 GGME)*aBAQ2A*+B*Q./BR7Q;",,Br'C./B.23Br'C"HHff-]C	R/))E3ZGGENGGEAI&GGENRVVqMj 	 (
   "B&E3ZGGENGGEAI&GGENGGEA$778

RVVqMj 	   ++, 1d733T1W+ 556	 	 		%5%;%;%F%FGHr   r   r   r   r   force_contiguousc	                 0  ^^^^ US   n	U R                   u  n
mnUR                   u  mpUR                   u  pnUU:w  a  [        SU SU S35      eX-  mUT:w  a  [        SU ST S35      eUUUU4S jn[        R                  [        R
                  [        R                  [        R
                  [        R
                  [        R                  [        R                  [        R                  0UR                     nSU;  a   UR                  U[        R
                  :H  S	9  UR                  S
5      S:w  a  [        SUR                  S
5       35      eTR                  S
5      S:w  a  [        STR                  S
5       35      eUR                  S
5      S:w  a  [        SUR                  S
5       35      eUR                  S
5      S:w  a  [        SUR                  S
5       35      eU(       aI  U R                  5       n UR                  5       nUR                  5       (       d  UR                  5       nOUnOUn[        U   " TTUUU U R                  S
5      U R                  S5      U R                  S5      UUR                  S
5      UR                  S5      UR                  S5      UUR                  S
5      UR                  S5      UR                  S5      UTUU4SU0UD6  U(       a(  UR                  5       (       d  UR                  U5        g g g )Nr   r   r   r   zaccumulators B (z) != others B (c                    > TR                   S   T-  [        R                  " TU S   5      [        R                  " TU S   5      -  4$ )Nr   r   r   rT  )rV  r   r   r   r   s    r   r{   _scatter_mm6.<locals>.grid	  sD    "Q&BX/&++b$x.2QQ r   rb  rc  r   r   z#c_indices.stride(0) must be 1, got z#r_offsets.stride(0) must be 1, got z#p_offsets.stride(0) must be 1, got z#q_offsets.stride(0) must be 1, got r	   r  )r"   rA   r   r  r  r  r-   r  r+   r  rE   rF   r  ri  rm  )r   r   r   r   r   r   r   r   rj  r   r   r   _Kr   B__Mr   r{   r  accumulators_r   r   r   s      `                @@@r   r   r     s     y/\\
B<<2!''
7 #3B4qc!KLL\7 #3B4qc!KLL	 	 MM2::NNBJJMM2::MM2::	

 

 t#KK=BJJ#>K?A!# 5i6F6Fq6I5JK  A!# 5i6F6Fq6I5JK  A!# 5i6F6Fq6I5JK  A!# 5i6F6Fq6I5JK " &&(F&&(F--// , 7 7 9 ,(MD!MM!MM!MM!MM!MM!MM!  #  #  #+	
. (/	
0 1	
6 L$>$>$@$@}- %Ar   left_alpha_tiled_col_strideleft_alpha_col_block_strideright_alpha_tiled_row_strideright_alpha_row_block_strider  r  r  r  r  r  c7           	         US:w  a  [        SU 35      eUS:w  a  [        SU 35      eUS:w  a  [        SU 35      eU!S:w  a  [        SU! 35      e[        R                  " SS9n7[        R                  " SS9n8[        R                  " SS9n9[        R                  " SS9n:[        R                  " SS9n;[        R                  " U8U9U:U;U55      u  n8n9UUU7-  -   UU8-  -   n<[        R
                  " U<5      n=[        R
                  " U<U-   5      n>U>U=-
  n?[        R                  " SU05      n@[        R                  " SU25      nA[        R                  " SU15      nBU UU7-  -   UU=-  -   UU@S S 2S 4   -  -   UUAS S S 24   -  -   nCUUU7-  -   UU9-  -   UUAS S 2S 4   -  -   UUBS S S 24   -  -   nDU#U$U7-  -   U%U8-  -   U&U9-  -   U'U@S S 2S 4   -  -   U(UBS S S 24   -  -   nEUU	U7-  -   U
U=-  -   nF[        R                  " U0U14U3S	9nG[        U?5       Ho  nH[        R
                  " WC5      nI[        R
                  " WF5      nJ[        R
                  " WDUUJ-  -   5      nKWG[        R                  " UIUKU4U3S
9-  nGUCU-  nCUFU
-  nFMq     U-(       d  WGU*-  nGU.(       dG  UUU7-  -   UU8-  -   UU9-  -   UW@S S 2S 4   -  -   UWBS S S 24   -  -   nLWG[        R
                  " UL5      -  nGU/(       dG  UUU7-  -   UU8-  -   U U9-  -   U!W@S S 2S 4   -  -   U"WBS S S 24   -  -   nMWG[        R
                  " UM5      -  nGU,(       ak  UUU7-  -   UU8-  -   UU9-  -   UW@S S 2S 4   -  -   UWBS S S 24   -  -   nNU+(       a  WG[        R
                  " WN5      -  nGOWGU)[        R
                  " WN5      -  -  nG[        R                  " WEWGR                  U#R                  R                  5      5        g )Nr   z+left_alpha_tiled_col_stride must be 0, got z+left_alpha_col_block_stride must be 0, got z,right_alpha_tiled_row_stride must be 0, got z,right_alpha_row_block_stride must be 0, got r	   r  r   r  r  )rA   r  r  r  r  r  rL  r   rh   r  r  r  r+   r  )Or  r  r  r  r  r  r  r  r  r  r  	input_ptrinput_batch_strideinput_tiled_row_strideinput_tiled_col_strideinput_row_block_strideinput_col_block_strider  r  r  r  r  r  left_alpha_ptrleft_alpha_batch_strideleft_alpha_tiled_row_striderr  left_alpha_row_block_striders  right_alpha_ptrright_alpha_batch_stridert  right_alpha_tiled_col_strideru  right_alpha_col_block_strider  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rb  r  r   r  r  r  r  r  r  r  r  r  r  inner_block_aranger  r  r  r  r  r  r  r  r  r  left_alpha_ptrsright_alpha_ptrs
input_ptrssO                                                                                  r   r  r  c	  s%   V '!+ =>Y=Z[  '!+ =>Y=Z[  (1, >?[>\]  (1, >?[>\]  MMq)	1-1-A.A.')||=,n(
$}
 ')34!M12 	 
 WW45
''"9<O"OP "J.99Q6YYq/:99Q6 !I-.*,- &(8D(AAB &(:47(CC	D 	  9,-$}45 %'9!T''BBC %'7a'@@	A 	 !I-.%56 &56 &(8D(AA	B
 &(8q(AAB 	 &23 :-. 	 88]M$B)TwA77#45L GG$56M'' #9M#IIK
 kjI! 
 !22!33#  & % )I56-=> .=> .0@D0II	J
 .0@q0IIJ   88!*Y67.>? />? /1A!T'1JJ	K
 /1A$'1JJK  (8 99$y01(=89 )=89 )+;AtG+DD	E
 )+;D!G+DDE   BGGJ$77  D277:+>$>>  	.11*2B2B2M2MNOr   r   )NNNNNN)NNNNNNNr   )r   )NN)r  FN)T)Ar+  osr'  	functoolsr   r   torch._dynamo.utilsr   torch.utils._tritonr   _triton_ops_metar   intgetenvr   r   r   r   r)   r1   rB   rG   rT   rY   rb   rd   r}   r   r   r   r   r   r   r	  r  r  r`  ri  rr  Tensorr  r/   r  r{  r  r  triton.languagelanguager  jit	constexprr  r  r  r  r  r  r  r.   r)  rP  r   ri  r   r  r5   r   r   <module>r     s    	    ) * & .1II:A>. *

"*&U;8&2 7F	
/" >B w2@ pv 
`FN Nb =>^
 ?^
D  ;!HH` 

&*'+#AE&<<&	& <<& t#& $& 
	& & C$Jd
C$J67$>& +&\ 

&*'+#AEy<<y	y <<y t#y $y 
	y y C$Jd
C$J67$>y +yx << ZZ}4 ll}4 ||	}4
 ||}4 }4> <<?}4@ LLA}4 }4~ ZZAPN ||OAPP ||QAPR <<SAPT LLUAPV WAP APF8G~ #'!EIU||UllU llU \\D U U d
C$Jd
:;dBUv $(!EI 5J\\5J||5J \\D 	5J
 5J d
C$Jd
:;dB5J Tk5Jn ZZV \\V llV VpF
Z "0||0\\0 ||0 <<$&	0
 0 0 t|0d ZZDI<<DI<<DI <<DI* ||+DI, -DI. /DI0 LL1DI DIL9
9
9
 LL9
 LL	9

 ll9
v ZZoI LLoI* ||+oI, ||-oI. /oI0 1oI2 3oI4 LL5oI6 LL7oI oIt "&h.h.h. <<h. <<	h.
 <<h. <<h. h. llh. h.T ZZUPL &(\\MUPP &(\\QUPZ ')ll[UP^ ')ll_UPx \\yUPz {UP| ll}UP~ <<UP@ LLAUPB ||CUPD ||EUPF GUPH <<IUPJ LLKUPL MUPN OUP UPp KLM$(!LL $r   