
    hWE                        S SK JrJr  S SKJrJr  S SKr\" SS9 " S S5      5       r\" 5       rS\	S	\	4S
 jr
\R                  4S\R                  S\	S\	S\R                  S\	S	\R                  4S jjr\R                  4S\R                  S\	S\	S\R                  S\	S	\R                  4S jjrS\	S	\\R                  \\	   \\	   4   4S jrS\	S	\\R                  \R                  \R                  4   4S jrS rS rS rg)    )	dataclassfield)ListTupleNT)frozenc                   |    \ rS rSr% Sr\\S'   Sr\\S'   Sr\\S'   \	" S S	9r
\\   \S
'   \	" S S	9r\\   \S'   Srg)Marlin24Constants      TILE   MIN_THREAD_N@   MAX_PARALLELc                  
    SS/$ )N       r       W/home/james-whalen/.local/lib/python3.13/site-packages/torchao/sparsity/marlin/utils.py<lambda>Marlin24Constants.<lambda>   s    1a&r   )default_factorySUPPORTED_NUM_BITSc                  
    / SQ$ )N)    r   r   r   r   r   r   r   r      s    EVr   SUPPORTED_GROUP_SIZESr   N)__name__
__module____qualname____firstlineno__r   int__annotations__r   r   r   r   r   r   __static_attributes__r   r   r   r	   r	      sJ    D#NL#L# %*.$IS	I',=V'W49Wr   r	   num_bitsreturnc                 H    U [         R                  ;   d
   SU  35       eSU -  $ )zCompute the packing factor for a given number of bits.

Args:
    num_bits (int): Number of bits to pack.
Returns:
    int: The packing factor.
zUnsupported num_bits = r   )constr   )r&   s    r   get_pack_factorr*      s/     u///U3J8*1UU/>r   q_wsize_ksize_npermtilec                    U R                   X4:X  d   eX-  S:X  d   SU SU 35       eX$-  S:X  d   SU SU 35       eU R                  X-  XBU-  U45      n U R                  S5      n U R                  X-  X$-  45      n U R                  SUR                  5       45      SS2U4   R                  U R                   5      n U $ )av  Permute weights to 16x64 Marlin tiles.

Args:
    q_w (torch.Tensor): Quantized weights.
    size_k (int): Number of input features.
    size_n (int): Number of output features.
    perm (torch.Tensor): The computed permutation tensor to be applied.
    tile (int, optional): Tile size. Defaults to `TILE`.
Returns:
    torch.Tensor: Weight tensor permuted to Marlin tiles.
r   	size_k = 	, tile = r            r   N)shapereshapepermutenumel)r+   r,   r-   r.   r/   s        r   marlin_permute_weightsr;   '   s    & 99((((=AB6()D6BB=AB6()D6BB ++v~tt^TB
CC
++l
#C
++v~v}5
6C
++r4::<(
)!T'
2
:
:399
ECJr   q_w_unpackedreverse_permc                    U R                   S   U4X-  U R                   S   U-  4:X  d   eX-  S:X  d   SU SU 35       eX$-  S:X  d   SU SU 35       eU R                  SUR                  5       45      SS2U4   R                  U R                   5      nUR                  X-  X$-  XD45      nUR                  S5      nUR                  X45      nU$ )a  Reverse permute weights from 16x64 Marlin tiles.
Args:
    q_w_unpacked (torch.Tensor): Unpacked quantized weights.
    size_k (int): Number of input features.
    size_n (int): Number of output features.
    reverse_perm (torch.Tensor): The computed reverse permutation tensor to be applied.
    tile (int, optional): Tile size. Defaults to `TILE`.
Returns:
    torch.Tensor: Weight tensor reverse permuted from Marlin tiles.
r   r5   r1   r2   r   Nr3   )r7   r8   r:   r9   )r<   r,   r-   r=   r/   q_w_comps         r   reverse_marlin_permute_weightsr@   H   s   $ q!6*1%/    =AB6()D6BB=AB6()D6BB ##R););)=$>?	<gl  !   LMH-H 01HOr   c           
         / n[        S5       H  n/ nUS-  nUS-  nS H]  nSUS-  -  SUS-  -  S-   SUS-  S-   -  SUS-  S-   -  S-   4 H,  nUR                  SU-  US-  -   SUS-  -  -   SU-  -   5        M.     M_     [        S5       H*  nUR                  U V	s/ s H
  oSU-  -   PM     sn	5        M,     M     [        R                  " U[        R
                  S	9n
U S:X  a&  [        R                  " / S
Q[        R
                  S	9nOFU S:X  a&  [        R                  " / SQ[        R
                  S	9nO[        SR                  U 5      5      eU
R                  S[        U5      5      SS2U4   R                  S5      n
/ n[        S5       H*  nUR                  S Vs/ s H
  oS-  U-   PM     sn5        M,     / n[        S5       H+  nUR                  S Vs/ s H  nSU-  U-   PM     sn5        M-     XU4$ s  sn	f s  snf s  snf )a  Precompute permutations for Marlin24 weight and scale shuffling

Marlin works on [16*2,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible
with the tensor-core format that is described here:
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type

As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core
(without the need to use ldmatrix instructions)

Args:
    num_bits (int): Number of bits to pack.
Returns:
    Tuple[torch.Tensor, List[int], List[int]]: The weight permutation tensor, scale permutation list, and
    scale permutation list for a single group.
r   r   r4   )r   r5   r5   r      r   )dtype)r   r4   r      r5   r6         r3   znum_bits must be 4 or 8, got {}r   N)r   r   r5   rE   r4   rD   r6   rF   )r   r5   r4   r6   r   rE   rD   rF   )rangeappendextendtorchtensorint32
ValueErrorformatviewlenr8   )r&   	perm_listiperm1colcol_oblockrowjpr.   
interleave
scale_permscale_perm_singles                 r   get_perms_24r]   l   s	     I2Y1fqEQUQUaQUQYQUQY!#	 R#X3a37mCa%iOP  qA7A!a%i78    <<	5D1}\\":%++N
	Q\\,ekkB
:AA(KLL 99RZ)!Z-8@@DDJ1X.FG.Fq519.FGH  $&1X  5M!N5M!a%!)5M!NO  .../ 8" H "Os   G>"H
H
c                     [        U 5      u  pnUR                  5       n[        R                  " U5      R                  5       n[        R                  " U5      R                  5       nXEU4$ )a2  Reverse permutation for Marlin24 weight and scale shuffling from `get_perms_24`.

Args:
    num_bits (int): Number of bits to pack.
Returns:
    Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: The reversed weight permutation tensor, scale permutation list and
    scale permutation list for single group.
)r]   argsortrJ   rK   )r&   perm_24scale_perm_24scale_perm_single_24r.   r[   r\   s          r   get_reverse_perms_24rc      s]     4@3I0G0??Dm,446J%9:BBD...r   c                 X   [         R                  " SXS9S S 2S 4   R                  SU5      n[         R                  " SXS9R                  U S5      nSnUR                  S:X  a  SOSnXF-  U-  US-  S-  -   US-  S	-  -   XG-  S	-  S-  S-  -   XF-  S-  S	-  -   nUS-  S:H  US-  S:H  -  R	                  [         R
                  5      nUS-  S:H  US-  S:H  -  R	                  [         R
                  5      n	XHU	-
  -  nXXU	-
  -  nSn
XZ-  nXZ-  nX-  U
-  XJ-  -   U-   R                  S
5      $ )Nr   devicer5   r   r4   r   r   r   r   r   )rJ   arangerepeatitemsizetoint8rO   )m
meta_ncols
meta_dtyperf   dst_rowsdst_colsgroup_xgroup_ytopright
bottomleftrZ   cols_majcols_mins                r   *_calculate_meta_reordering_scatter_offsetsrw      su   ||Aq0D9@@JOH||Az9@@AFH G''1,b"G 	g%a<1
	a<A
	 !#
)B
.	/ 1$
)		*  A"x!|q'89==ejjIHa<1$A):;??

KJ:%%H:%%H
 J%H$HL:%(==HNNrRRr   c                    U R                  5       S:w  a  [        SU R                  5        S35      eU R                  u  pU R                  n[        R
                  nU R                  [        R
                  :X  a  [        R                  nOvU R                  [        R                  [        R                  [        R                  [        R                  4;   a  [        R                  nO[        SU R                   S35      eUR                  S-  S-  nUS;  a  [        S	5      eU[        R                  :X  a  US
-  S:w  a  [        SU S35      eOUS-  S:w  a  [        SU S35      eUSU-  -  S:w  a  [        SU SSU-   35      eU R                  [        R                  :w  a/  SnU R                  SX&-  U5      nUS:g  R                  S5      u  ppO1SnU R                  SX&-  U5      nUS:g  R                  S5      =u  pu  pX&U-  -  nX-  nU) U	-  nU) U	) -  nUnUnUU-  U-  nX) -  nUUR                  [        R                   5      S-  -  nUUR                  [        R                   5      S-  -  nU R                  [        R                  :w  al  WR#                  SUR%                  S5      5      nUR#                  SUR%                  S5      5      n[        R&                  " UU4SS9R                  XS-  5      nO6WR#                  SUR%                  S5      S-  5      R                  XS-  5      nUUS-  -  nUR                  SX45      R                  U5      nUS:X  a=  US S 2S S 2S4   US S 2S S 2S4   S-  -  US S 2S S 2S4   S-  -  US S 2S S 2S4   S-  -  nOUS:X  a|  US S 2S S 2S4   US S 2S S 2S4   S-  -  US S 2S S 2S4   S-  -  US S 2S S 2S4   S-  -  US S 2S S 2S4   S
-  -  US S 2S S 2S4   S-  -  US S 2S S 2S4   S-  -  US S 2S S 2S4   S-  -  nWR)                  X-  45      n[+        XXC5      nUR-                  SUUR                  S5      5        UUR                  X5      4$ )Nr4   z)Expected 2-dimensional dense tensor, got -dimensional tensorInvalid datatype z of dense matrixr   r   )r   r   z6Invalid number of elements per meta element calculatedr   r   zNumber of rows of dense matrix z must be divisible by 16r   z must be divisible by 32z"Number of columns of dense matrix z must be divisible by r   r5   )dimr6   r
   rE      rD      rF      )r{   RuntimeErrorr7   rf   rJ   rk   rC   rL   halfbfloat16floatint16ri   rO   unbindrj   int64gather	unsqueezestack	new_emptyrw   scatter_)denserl   krf   rn   quadbits_per_meta_elemksparsedense_4m0m1m2m3dense_2rm   expr0expr1expr2bit0bit1bit2bit3idxs0idxs1sparse0sparse1sparsemeta_4meta_nmetameta_reorderedmeta_offsetss                                  r   )sparse_semi_structured_from_dense_cutlassr      s   yy{a7		}DWX
 	
 ;;DA\\FJ{{ejj [[
	U^^U[[%++N	N[[
.u{{m;KLMM'00149V+STTU[[ r6Q;1!4LM  
 r6Q;1!4LM  	A&&'1,03I!NdJdIef
 	
 {{ekk!**Rw7!Q,..r2B**Rw7"a<//33!778JH GEC"HEC2#IEDD5=2D3;DDGGEKK(A-.EDGGEKK(A-.E{{ekk!..U__R%89..U__R%89gw/R8==aaHEOOB$71$<=BB11fMeqj!F[["jABEEjQF"1a7OaAg!#%aAg!#% aAg"$& 	 
 1	$1a7OaAg!#%aAg!#% aAg"$& aAg"$	&
 aAg"$& aAg"$& aAg"$& 	 ^^Q^$56N=	zL A|TYYr];N''677r   c                 p	   U R                  5       S:w  a  [        SU R                  5        S35      eU R                  u  p#U R                  nUR                  5       S:w  a  [        SUR                  5        S35      eUR                  U:w  a  [        SU SUR                   S35      eUR                  nU[
        R                  [
        R                  4;  a  [        SU S	35      eUR                  S
-  S-  nU R                  [
        R                  :w  a  SOSnUR                  u  pX:w  a  [        SU SU 35      eX-  U-  SU-  :w  a  [        SU SX-  U-  S-   S35      e[        X)XT5      n
[
        R                  " UR                  S5      SU
5      R                  X)5      n[
        R                  " X)SU-  4UUS9nUS:X  a  US-  US S 2S S 2S4'   US-	  S-  US S 2S S 2S4'   US-	  S-  US S 2S S 2S4'   US-	  S-  US S 2S S 2S4'   US
-	  S-  US S 2S S 2S4'   US-	  S-  US S 2S S 2S4'   US-	  S-  US S 2S S 2S4'   US-	  S-  US S 2S S 2S4'   GO$US
:X  Ga  US-  US S 2S S 2S4'   US-	  S-  US S 2S S 2S4'   US-	  S-  US S 2S S 2S4'   US-	  S-  US S 2S S 2S4'   US
-	  S-  US S 2S S 2S4'   US-	  S-  US S 2S S 2S4'   US-	  S-  US S 2S S 2S4'   US-	  S-  US S 2S S 2S4'   US-	  S-  US S 2S S 2S
4'   US-	  S-  US S 2S S 2S4'   US-	  S-  US S 2S S 2S4'   US -	  S-  US S 2S S 2S!4'   US"-	  S-  US S 2S S 2S4'   US#-	  S-  US S 2S S 2S$4'   US%-	  S-  US S 2S S 2S4'   US&-	  S-  US S 2S S 2S'4'   UR                  S5      [
        R                  " SSU-  U-  U-  US(9S-  R                  SS5      R                  SS5      R                  S5      -   n[
        R                   " US-  U-  4U R                  US9nU R                  [
        R                  :w  a"  UR#                  SXR%                  S5      5        O[UR                  [
        R&                  5      R#                  SXR                  [
        R&                  5      R                  S5      5        UR                  USU-  5      $ ))Nr4   z*Expected 2-dimensional sparse tensor, got ry   z(Expected 2-dimensional meta tensor, got zExpected meta matrix to be on z device, got matrix on z devicerz   z of meta matrixr   r   zNumber of rows of meta matrix z5 must be equal to number of columns of sparse matrix z#Number of columns of sparse matrix z different from the z<, expected according to the number of columns of meta matrixr   r   )rC   rf   r6   r5   rD   
   rE   r
      rF   r      	   r|         r}         r~         re   )r{   r   r7   rf   rC   rJ   r   rL   ri   r   rw   r   rO   emptyrg   rh   zerosr   r8   r   )r   r   rl   r   rf   rn   r   r   
meta_nrowsrm   r   r   meta_2dense_offsetsr   s                  r   'sparse_semi_structured_to_dense_cutlassr   j  s.   zz|q8FYZ
 	
 <<DA]]Fq 6~7I7I7K6LL_`
 	
 &,VH4KNLaLaKbbij
 	
  %%J%++u{{33.zl/JKK'00149<<5;;.aAG+11J,ZL8mnompq
 	
 44A=1!4HI]`vIvz{I{H| }I I
 	
 >	zL <<++B/LAFFqUD [[	
223F
 "+q!Qw19,q!Qw19,q!Qw19,q!Qw19,q!Qw2:-q!Qw2:-q!Qw2:-q!Qw	1	$+q!Qw19,q!Qw19,q!Qw19,q!Qw19,q!Qw2:-q!Qw2:-q!Qw2:-q!Qw2:-q!Qw2:-q!Qw BJ$.q!Rx BJ$.q!Rx BJ$.q!Rx BJ$.q!Rx BJ$.q!Rx BJ$.q!RxKKOQA	W,V<q@
d2qk&&A,ttBx(M KKQFLLHE||u{{"q-);<

5::''}kk%**5::2>	
 ::aQr   )dataclassesr   r   typingr   r   rJ   r	   r)   r#   r*   r   Tensorr;   r@   r]   rc   rw   r   r   r   r   r   <module>r      sb   )   $X X X 	
c 
c 
$ 

	  ,,	
  \\L 

!,,!! ! ,,	!
 ! \\!H5/3 5/5tCy$s))K#L 5/p//
5<<u||34/BSB}8H^ r   