
    h              &          S SK r S SKJr  S SKrS SKJr  \R
                  R                  SS5      r\R                  S5        \R                  S5        \R                  S5        \R                  S	5        \R                  S
5        \R                  S5        \R                  S5        \R                  S5        \R                  S5        \R                  S5        \R                  S5        \R                  S5        \R                  S5        \R                  S\R                  R                  R                  /S9  \R                  S\R                  R                  R                  /S9  \R                  S5        \R                  S5        \R                  S5        \R                  S5        S rS r\ R                  S 5       r SS\S\S\S \S!\S"\S#\4S$ jjr\" S%5       SS\S\S\S \S!\S"\S#\4S& jj5       r              SS'\S(\S)\S*\\   S+\S,\S-\\   S.\S/\S0\S1\S2\S3\S4\S5\S6\S7\S#\4$S8 jjr\" S95                    SS'\S(\S)\S*\\   S+\S,\S-\\   S.\S/\S0\S1\S2\S3\S4\S5\S6\S7\S#\4$S: jj5       rS;\S<\S#\4S= jr\" S>5      S;\S<\S#\4S? j5       rS;\S@\SA\S<\S#\4
SB jr\" SC5      S;\S@\SA\S<\S#\4
SD j5       rSE\SF\SG\SH\SI\SJ\SK\SL\SM\S#\4SN jr\" SO5      SE\SF\SG\SH\SI\SJ\SK\SL\SM\S#\4SP j5       rSE\SF\SQ\SR\SS\SI\SK\SL\SM\S#\4ST jr\" SU5      SE\SF\SQ\SR\SS\SI\SK\SL\SM\S#\4SV j5       r  SSW\SX\SY\SZ\S[\\   S\\\R4                     S#\4S] jjr\" S^5        SSW\SX\SY\SZ\S[\\   S\\\R4                     S#\4S_ jj5       r  SSW\SX\SY\SZ\S[\\   S\\\R4                     S#\4S` jjr\" Sa5        SSW\SX\SY\SZ\S[\\   S\\\R4                     S#\4Sb jj5       r  SSW\SX\SY\Sc\SZ\S[\\   S\\\R4                     S#\4Sd jjr\" Se5        SSW\SX\SY\Sc\SZ\S[\\   S\\\R4                     S#\4Sf jj5       rSY\S#\\44Sg jr\" Sh5      SY\S#\\44Si j5       r  SSj\Sk\Sl\Sm\S#\\44
Sn jjr \" So5        SSj\Sk\Sl\Sm\4Sp jj5       r     SSq\SG\Sr\S4\\   Ss\\   St\Su\Sv\S#\4Sw jjr!\" Sx5           SSq\SG\Sr\S4\\   Ss\\   St\Su\Sv\4Sy jj5       rSz\S{\S|\S}\S#\4
S~ jr"\" S5      Sz\S{\S|\S}\S#\4
S j5       rSz\S{\S|\S}\S\S\S[\\   S\\   S\\\R4                     S#\4S jr#\" S5      Sz\S{\S|\S}\S\S\S[\\   S\\   S\\\R4                     S#\4S j5       r\ R                  " 5       S 5       r$S r%\" S5      S\S\S\S\4S j5       r&S\S\S\S\4S jr'\" S5      S\S\S\S\4S j5       r(SY\S\S\S#\4S jr)\" S5      SY\S\S\S#\4S j5       rSW\S\S\SY\S\S\S\S[\\   S\\R4                  4S jr*\" S5      SW\S\S\SY\S\S\S\S[\\   S\\R4                  S#\4S j5       r\" S5      S\S\S\S\S6\S\S\S#\4S j5       rg)    N)Optional)TensortorchaoFRAGMENTzuquant_llm_linear(int EXPONENT, int MANTISSA, Tensor _in_feats, Tensor _weights, Tensor _scales, int splitK) -> TensorzMunpack_tensor_core_tiled_layout(Tensor packed_w, int inner_k_tiles) -> Tensorzzdequantize_tensor_core_tiled_layout(Tensor packed_w, Tensor scales_and_zeros, int group_size, int inner_k_tiles) -> Tensorzmarlin_24_gemm(Tensor x, Tensor weight_marlin, Tensor meta, Tensor s, Tensor workspace, int bits, int size_m, int size_n, int size_k) -> Tensorzmarlin_qqq_gemm(Tensor x, Tensor weight_marlin, Tensor s_tok, Tensor s_ch, Tensor s_group, Tensor workspace, int size_m, int size_n, int size_k) -> Tensorzrowwise_scaled_linear_cutlass_s8s4(Tensor input, Tensor input_scale, Tensor weight, Tensor weight_scale, Tensor? bias=None, ScalarType? out_dtype=None) -> Tensorzrowwise_scaled_linear_cutlass_s4s4(Tensor input, Tensor input_scale, Tensor weight, Tensor weight_scale, Tensor? bias=None, ScalarType? out_dtype=None) -> Tensorzrowwise_scaled_linear_sparse_cutlass_f8f8(Tensor input, Tensor input_scale, Tensor weight, Tensor weight_meta, Tensor weight_scale, Tensor? bias=None, ScalarType? out_dtype=None) -> TensorzLto_sparse_semi_structured_cutlass_sm9x_f8(Tensor weight) -> (Tensor, Tensor)zsparse24_sm90_sparsify(Tensor input, str metadata_fmt, str activation, str sp_selection_algo, *, ScalarType? dtype = None, Tensor? scale=None) -> (Tensor, Tensor)zsparse24_fp8_sm90_cutlass_gemm(Tensor a, Tensor a_mdata, Tensor b, *, Tensor? a_scale = None, Tensor? b_scale = None, int swizzle_size=8, str swizzle_axis='n', int sm_count=128) -> Tensorz\swizzle_mm(Tensor mat1, Tensor mat2, bool mat1_is_swizzled, bool mat2_is_swizzled) -> Tensorzswizzle_scaled_mm(Tensor mat1, Tensor mat2, bool mat1_is_swizzled, bool mat2_is_swizzled, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None) -> TensorzImx_fp8_bf16(Tensor a, Tensor b, Tensor a_scale, Tensor b_scale) -> Tensor)tagszImx_fp4_bf16(Tensor a, Tensor b, Tensor a_scale, Tensor b_scale) -> Tensora6  qscaled_dot_product(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, float? scale=None, float q_scale=1.0, int q_zp=0, float k_scale=1.0, int k_zp=0, float v_scale=1.0, int v_zp=0, float a_scale=1.0, int a_zp=0, float o_scale=1.0, int o_zp=0) -> Tensorzida8w4_linear_prepack_cpu(Tensor weight, Tensor scales, Tensor qzeros) -> (Tensor, Tensor, Tensor, Tensor)zda8w4_linear_cpu(Tensor input, Tensor input_scales, Tensor input_qzeros, Tensor weight, Tensor weight_scales, Tensor weight_qzeros, Tensor compensation, Tensor? bias, ScalarType output_dtype) -> Tensorz_scaled_embedding_bag(Tensor qweight, Tensor indices, Tensor offsets, Tensor weight_scale, float o_scale, int mode, bool include_last_offset) -> Tensorc                    ^  U 4S jnU$ )Nc                 P   > [         R                  R                  T 5      " U 5      $ N)torchlibraryregister_fakefuncnames    E/home/james-whalen/.local/lib/python3.13/site-packages/torchao/ops.py	decorator%register_custom_op.<locals>.decoratorM   s    }}**dV5d;;     r   r   s   ` r   register_custom_opr   L   s    < r   c                    ^  U 4S jnU$ )Nc                 N   > [         R                  R                  T SS9" U 5      $ )Nr   )mutates_args)r   r   	custom_opr   s    r   r   *register_custom_op_impl.<locals>.decoratorT   s$    }}&&$r&B4HHr   r   r   s   ` r   register_custom_op_implr   S   s    I r   c                      [         R                  R                  [         R                  R                  5       5      n U R                  S-  U R
                  -   nU$ )N
   )r   cudaget_device_propertiescurrent_devicemajorminor)device_propscompute_capabilitys     r   cached_compute_capabilityr'   Z   sE    ::33EJJ4M4M4OPL%++b0<3E3EEr   EXPONENTMANTISSA	_in_feats_weights_scalessplitKreturnc                    ^ [        5       m[        R                  " TS:  U4S j5        [        R                  R                  R
                  R                  XX#XE5      $ )aG  
Quant-LLM linear layer A @ W.T. See https://arxiv.org/abs/2401.14112 for more details.

Arguments
    EXPONENT: number of exponent bits
    MANTISSA: number of mantissa bits
    _in_feats: input activations in FP16
    _weights: packed Floatx weights
    _scales: scale
    splitK: split K

Returns
    output of linear layer
K   c                     > ST  3$ )NzLquant_llm_linear requires sm7.5+ GPU architecture, but current device has smr   )r&   s   r   <lambda>"quant_llm_linear.<locals>.<lambda>{   s    ^_q^rsr   )r'   r   _checkopsr   quant_llm_lineardefault)r(   r)   r*   r+   r,   r-   r&   s         @r   r6   r6   a   sQ    . 34	LLb s 99--55I r   ztorchao::quant_llm_linearc                   ^^^ [         R                  " TR                  5       S:H  U4S j5        [         R                  " TR                  [         R                  [         R
                  4;   U4S j5        [         R                  " TR                  5       S:H  U4S j5        [         R                  " TR                  [         R                  L U4S j5        [         R                  " TR                  5       S:H  U4S j5        [         R                  " TR                  [         R                  [         R
                  4;   U4S j5        TR                  u  pgTR                  u  pSU -   U-   n
[         R                  " US	-  U
-  TR                  S   :H  S
 5        [         R                  " UTR                  S   :H  S 5        TR                  Xh45      $ )N   c                  ,   > ST R                  5        S3$ )Nz!input should be a 2d tensor, got Ddimr*   s   r   r2   _.<locals>.<lambda>   s    3IMMO3DAFr   c                  "   > ST R                    3$ )Nz!weight must be FP16 or BF16, got dtyper>   s   r   r2   r?      s    3IOO3DEr   c                  ,   > ST R                  5        S3$ )Nz"weight should be a 2d tensor, got r;   r<   r+   s   r   r2   r?      s    4X\\^4DAFr   c                  "   > ST R                    3$ )Nzweight must be UINT8, got rA   rD   s   r   r2   r?          ,X^^,<=r      c                  ,   > ST R                  5        S3$ )Nz!scale should be a 2d tensor, got r;   r<   r,   s   r   r2   r?      s    &GVW$Xr   c                  "   > ST R                    3$ )Nz scale must be FP16 or BF16, got rA   rI   s   r   r2   r?      s    27==/Br      c                      gNzDimensions mismatchedr   r   r   r   r2   r?      s    @Wr   r   c                      grM   r   r   r   r   r2   r?      s    1Hr   )	r   r4   r=   rB   float16bfloat16uint8shape	new_empty)r(   r)   r*   r+   r,   r-   BSICOC_N_BITSs     ```      r   rW   rW      sQ    
LL1F 
LLEMM5>>::E 
LL!F 
LL%++%= 
LLX 
LL%--88B
 __FBNNEB\H$F	LLq6!X^^A%668WX	LLw}}Q'')HIx((r   querykeyvalue	attn_mask	dropout_p	is_causalscaleq_scaleq_zpk_scalek_zpv_scalev_zpa_scalea_zpo_scaleo_zpc                     [         R                  R                  R                  R	                  U UUUUUUUUU	U
UUUUUU5      $ )a  
Quantized SDPA with quantized inputs and outputs.
Arguments
    query: input query tensor,
    key: input key tensor,
    value: input value tensor,
    attn_mask: attention mask tensor,
    dropout_p: dropout probability,
    is_causal: causal flag,
    scale: scaling factor applied prior to softmax,
    q_scale: scale for query from linear quantization,
    q_zp: zero point for query from linear quantization,
    k_scale: scale for key from linear quantization,
    k_zp: zero point of key from linear quantization,
    v_scale: zero point for value from linear quantization,
    v_zp: zero point of value from linear quantization,
    a_scale: scale for attention from softmax quantization,
    a_zp: zero point for attention from softmax quantization,
    o_scale: scale for output from linear quantization,
    o_zp: zero point for output from linear quantization,
Returns
    output of quantized SDPA
)r   r5   r   qscaled_dot_productr7   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   s                    r   rk   rk      s[    T 990088# r   ztorchao::qscaled_dot_productc                     U $ r
   r   rl   s                    r   rW   rW      s	    ( Lr   packed_winner_k_tilesc                 d    [         R                  R                  R                  R	                  XS9$ )a  
Unpacks weights that were packed with `torch.ops.aten._convert_weight_to_int4pack` to original tensor of shape `N x K`.

Assumes that the packed weights were generated with `torch.ops.aten._convert_weight_to_int4pack` with `inner_k_tiles = 2 | 4 | 8`"

Args:
    packed_w: torch.tensor: 4D tensor with shape (N / 8) x (K / (inner_k_tiles * 16)) x 32 x inner_k_tiles, dtype is torch.int32
    inner_k_tiles: int

Returns:
    torch.tensor of shape is N x K, dtype is torch.int32

rn   ro   )r   r5   r   unpack_tensor_core_tiled_layoutr7   rq   s     r   rr   rr     s0     99<<DD E  r   z(torchao::unpack_tensor_core_tiled_layoutc                   ^  [         R                  " T R                  5       S:H  U 4S j5        [         R                  " T R                  [         R                  L U 4S j5        [         R                  " US:H  =(       d    US:H  =(       d    US:H  S 5        [         R                  " T R                  S5      S:H  S 5        [         R                  " T R                  S	5      US-  :H  S
 5        T R                  S5      S-  nT R                  S5      U-  S-  n[         R                  " X#4[         R                  T R                  S9$ )N   c                  ,   > ST R                  5        S3$ )Nz*packed weight should be a 42d tensor, got r;   r<   rn   s   r   r2   r?     s    <X\\^<LANr   c                  "   > ST R                    3$ Nzweight must be INT32, got rA   rv   s   r   r2   r?     rF   r   r9   rK   c                      gNz inner_k_tiles must be 2, 4, or 8r   r   r   r   r2   r?   !      2r       c                      gNz#packed weight must have 32 at dim 2r   r   r   r   r2   r?   #      1Vr      c                      gNz0packed weight must have inner_k_tiles/2 at dim 3r   r   r   r   r2   r?   &      Br   r   rG      rB   device)r   r4   r=   rB   int32sizeemptyr   )rn   ro   NKs   `   r   rW   rW     s    	LL!N 
LL%++%= 
LLFmq0FMQ4F2 
LLq!R')VW	LLaMA--B 	a1Aa=(2-A;;vU[[IIr   scales_and_zeros
group_sizec                 j    [         R                  R                  R                  R	                  XX#5      $ )a  
Dequantizes by:
- Unpacking weights that were packed with `torch.ops.aten._convert_weight_to_int4pack` to original tensor of shape `N x K`
- Upcasting to bfloat16
- Dequantizing with the scales_and_zeros that were packed with `torchao.quantization.utils.pack_tinygemm_scales_and_zeros`

Assumes:
- packed weights were generated with `torch.ops.aten._convert_weight_to_int4pack` with `inner_k_tiles = 2 | 4 | 8`"
- packed scales_and_zeros were generated with `torchao.quantization.utils.pack_tinygemm_scales_and_zeros`
- qGroupSize is 32 | 64 | 128 | 256

Args:
    packed_w: torch.tensor: 4D tensor with shape `(N / 8) x (K / (inner_k_tiles * 16)) x 32 x inner_k_tiles / 2`, dtype is torch.int32
    scales_and_zeros: torch.tensor: 3D tensor with shape `numQGroups x N x 2`, dtype is torch.bfloat16 where numQGroups is K / qGroupSize
    group_size: int
    inner_k_tiles: int

Returns:
    torch.tensor of shape is N x K, dtype is torch.bfloat16

)r   r5   r   #dequantize_tensor_core_tiled_layoutr7   )rn   r   r   ro   s       r   r   r   .  s,    0 99@@HHJ r   z,torchao::dequantize_tensor_core_tiled_layoutc                   ^  [         R                  " T R                  5       S:H  U 4S j5        [         R                  " T R                  [         R                  L U 4S j5        [         R                  " US:H  =(       d    US:H  =(       d    US:H  S 5        [         R                  " T R                  S5      S:H  S 5        [         R                  " T R                  S	5      US-  :H  S
 5        T R                  S5      S-  nT R                  S5      U-  S-  n[         R                  " UR                  [         R                  L S 5        [         R                  " UR                  5       S	:H  S 5        [         R                  " US:H  =(       d    US:H  =(       d    US:H  =(       d    US:H  S 5        [         R                  " UR                  S5      XR-  :H  S 5        [         R                  " UR                  S5      U:H  S 5        [         R                  " UR                  S5      S:H  S 5        [         R                  " XE4[         R                  T R                  S9$ )Nrt   c                  ,   > ST R                  5        S3$ )Nz)packed weight should be a 4d tensor, got r;   r<   rv   s   r   r2   r?   R  s    ;HLLN;K1Mr   c                  "   > ST R                    3$ rx   rA   rv   s   r   r2   r?   V  rF   r   r9   rK   c                      grz   r   r   r   r   r2   r?   Z  r{   r   r|   c                      gr~   r   r   r   r   r2   r?   \  r   r   r   c                      gr   r   r   r   r   r2   r?   _  r   r   r   rG   r   c                      g)Nz!scales_and_zeros must be bfloat16r   r   r   r   r2   r?   g  s    3r   c                      g)Nz9scales_and_zeros must be 3D, got {scales_and_zeros.dim()}r   r   r   r   r2   r?   k  s    Kr   @         c                      g)Nz&qGroupSize must be 32, 64, 128, or 256r   r   r   r   r2   r?   o  s    8r   c                      g)Nz3scales_and_zeros must have K // qGroupSize at dim 0r   r   r   r   r2   r?   s  s    Er   c                      g)Nz%scales_and_zeros must have N at dim 1r   r   r   r   r2   r?   v      /Vr   c                      g)Nz%scales_and_zeros must have 2 at dim 2r   r   r   r   r2   r?   y  r   r   r   )	r   r4   r=   rB   r   r   rP   r   r   )rn   r   r   ro   r   r   s   `     r   rW   rW   K  s   
 
LL!M 
LL%++%= 
LLFmq0FMQ4F2 
LLq!R')VW	LLaMA--B 	a1Aa=(2-A 
LL%..03 
LL!#K 
LLbVJ",V
c0AVZSVEV8 
LLa AO3E 
LLa A%'V 
LLa A%'V ;;vU^^HOOLLr   xweight_marlinmetas	workspacebitssize_msize_nsize_kc	                 p    [         R                  R                  R                  R	                  XX#XEXgU5	      $ )a  
Sparse Marlin 2:4 matrix multiplication. Reference: https://github.com/IST-DASLab/Sparse-Marlin/tree/main
Args:
    x: input matrix of shape `(n, k/2)` in column-major layout.
    weight_marlin: weight matrix of original shape `(m, k)` in Marlin format; see `Layer.pack()`.
    meta: metadata information for 2:4 sparsity.
    s: scales of shape `(n / groupsize / 2, m)`.
    workspace: tensor with at least `m / 128 * max_par` entries that are all zero.
    bits: number of bits for quantization.
    size_m: number of rows in input matrix.
    size_n: number of columns in weight matrix.
    size_k: number of columns in input matrix.
Returns:
    output matrix of shape `(n, m)` in column-major layout.
)r   r5   r   marlin_24_gemmr7   )	r   r   r   r   r   r   r   r   r   s	            r   r   r     s2    4 99++33	$9FF r   ztorchao::marlin_24_gemmc	                   ^ ^^^^^^^^^^^^^ SmSmSn	[         R                  " TS:H  =(       d    TS:H  U4S j5        ST-  n
[         R                  " TT R                  S5      :H  UU 4S	 j5        [         R                  " TT R                  S
5      :H  UU 4S j5        [         R                  " TT-  S:H  UU4S j5        [         R                  " TT-  S-  TR                  S5      :H  UUU4S j5        [         R                  " TR                  S
5      T:H  UU4S j5        [         R                  " TR                  S
5      T-  S:H  UU4S j5        TR                  S
5      T-  U
-  m[         R                  " TT:H  UU4S j5        [         R                  " TR                  S5      TS-  S-  S-  :H  UU4S j5        [         R                  " TR                  S
5      TS-  :H  UU4S j5        [         R                  " T R                  S 5        [         R                  " T R	                  5       S 5        [         R                  " TR                  S 5        [         R                  " TR	                  5       S 5        [         R                  " TR                  S 5        [         R                  " TR	                  5       S 5        [         R                  " TR                  S 5        [         R                  " TR	                  5       S 5        SmTR                  S5      S
:  aJ  [         R                  " TTR                  S5      -  S:H  UU4S j5        TTR                  S5      -  mTS-  m[         R                  " TS:H  =(       d    TS:H  U4S j5        [         R                  " TT-  S:H  UU4S j5        TT-  U	-  m[         R                  " TR                  5       T:  UU4S  j5        [         R                  " T R                  S5      TR                  S
5      4T R                  T R                  S!9$ )"Nr   r   r   rt   rK   c                     > ST  3$ )Nznum_bits must be 4 or 8. Got = r   )r   s   r   r2   r?     s    *I$(Pr   r|   r   c                  2   > STR                  S5       ST  3$ NzShape mismatch: x.size(0) = r   , size_m = r   r   r   s   r   r2   r?         .qvvaykVHMr   rG   c                  2   > STR                  S5       ST  3$ NzShape mismatch: x.size(1) = rG   , size_k = r   r   r   s   r   r2   r?     r   r   c                     > ST ST  3$ N	size_k = ! is not divisible by tile_size = r   	TILE_SIZEr   s   r   r2   r?         )F8#DYKPr   r9   c                  8   > STR                  S5       ST ST  3$ Nz(Shape mismatch: weight_marlin.size(0) = r   r   z, tile_size = r   r   r   r   s   r   r2   r?     +    :=;M;Ma;P:QQ\]c\ddrs|r}~r   c                  2   > ST R                  S5       ST 3$ )Nzs.size(1) = rG   , size_n = r   )r   r   s   r   r2   r?     s    |AFF1I;k&%Rr   c                  2   > STR                  S5       ST  3$ Nzweight_marlin.size(1) = rG   r   r   r   r   s   r   r2   r?     "    *=+=+=a+@*AAbclbmnr   c                     > ST ST  3$ )N	size_n = , actual_size_n = r   actual_size_nr   s   r   r2   r?     s    )F8#5m_Er   c                  D   > ST R                  S5       STS-  S-  S-   3$ )Nzmeta.size(0) = r   z is not size_k / 8 / 2 / 2 = rK   r9   r   )r   r   s   r   r2   r?     s-    /$))A,/LVWX[\]M]abMbLcdr   c                  8   > ST R                  S5       STS-   3$ )Nzmeta.size(1) = rG   z is not size_n * 2 = r9   r   )r   r   s   r   r2   r?     s    /$))A,/DVaZLQr   c                      gNzx is not on GPUr   r   r   r   r2   r?         $5r   c                      gNzx is not contiguousr   r   r   r   r2   r?         ,Ar   c                      gNzweight_marlin is not on GPUr   r   r   r   r2   r?         0Mr   c                      gNzweight_marlin is not contiguousr   r   r   r   r2   r?         /Pr   c                      g)Nzmeta is not on GPUr   r   r   r   r2   r?         ';r   c                      g)Nzmeta is not contiguousr   r   r   r   r2   r?         /Gr   c                      g)Nzs is not on GPUr   r   r   r   r2   r?     r   r   c                      g)Nzs is not contiguousr   r   r   r   r2   r?     r   r   c                  2   > ST ST R                  S5       3$ )Nr   z! is not divisible by s.size(0) = r   r   )r   r   s   r   r2   r?     s    ix'HPQTr   c                     > ST  3$ NzUnexpected groupsize = r   	groupsizes   r   r2   r?     s    ))5r   c                     > ST ST  3$ Nr   z$ is not divisible by min_thread_n = r   MIN_THREAD_Nr   s   r   r2   r?         )F8#G~Vr   c                  0   > STR                  5        ST  3$ )Nzworkspace.numel =  is below min_workspace_size = numelmin_workspace_sizer   s   r   r2   r?   	  s    $Y__%6$77VWiVjkr   r   )	r   r4   r   is_cudais_contiguousr   r   rB   r   )r   r   r   r   r   r   r   r   r   MAX_PARALLELISMpack_factorr   r   r   r   r   s   `````````  @@@@@r   rW   rW     sL    ILO 
LL	TQY P *K 
LL!&&)M 
LL!&&)M 
LLaP 
LL	9		!m&8&8&;;~ 
LL	q	VR 
LL1	)Q.n
 #''*i7;FM	LL-E 
LL		!!q(A--d 
LL		!
"Q 
LL56	LL"$AB 
LL&&(MN	LL##%'P
 
LL;<	LL##%'GH 
LL56	LL"$AB Ivvay1}QVVAY!#T	
 affQi'	a		LLR*9?5 
LL"V !L0OC	LL//k
 ;;q	166!9-QWWQXXNNr   s_toks_chs_groupc	                 p    [         R                  R                  R                  R	                  XX#XEXgU5	      $ )a  
Marlin for W4A8 mixed precision matrix multiplication.
See https://arxiv.org/pdf/2406.09904 for more details.
Reference: https://github.com/HandH1998/QQQ/tree/main
Args:
    x: `torch.int8` input matrix of shape `(m, k)` in standard row-major layout.
    weight_marlin: `torch.int32` weight matrix of original shape `(k, n)` in the specified format.
    s_tok: `torch.float32` activation per-token quantization scales of shape `(m, 1)`.
    s_ch: `torch.float32` weight per-channel quantization scales of shape `(1, n)`.
    s_group: `torch.half` weight per-group quantization scales of shape `(m / groupsize, n)`, it should be empty when group_size != -1.
    workspace: `torch.int32` tensor with at least `n / 128 * max_par` entries that are all zero.
    size_m: number of rows in input matrix.
    size_n: number of columns in weight matrix.
    size_k: number of columns in input matrix.
Returns:
    `torch.half` out matrix of shape `(m, n)` in standard row-major layout.
)r   r5   r   marlin_qqq_gemmr7   )	r   r   r   r   r   r   r   r   r   s	            r   r   r     s2    8 99,,44	%w66 r   ztorchao::marlin_qqq_gemmc	                 x	  ^ ^^^^^^^^^^^^^ SmSmSn	Sn
[         R                  " TT R                  S5      :H  UU 4S j5        [         R                  " TTR                  5       :H  UU4S j5        [         R                  " TT R                  S5      :H  UU 4S j5        [         R                  " TT-  S:H  UU4S	 j5        [         R                  " TT-  TR                  S5      :H  UUU4S
 j5        TR                  5       S:X  a  SOTTR                  S5      -  m[         R                  " TS;   U4S j5        [         R                  " TR                  5       T:H  UU4S j5        [         R                  " TR                  S5      T-  S:H  UU4S j5        TS:w  a_  [         R                  " TR                  S5      T:H  UU4S j5        [         R                  " TTR                  S5      -  S:H  UU4S j5        TR                  S5      T-  U
-  m[         R                  " TT:H  UU4S j5        [         R                  " T R                  S 5        [         R                  " T R                  5       S 5        [         R                  " TR                  S 5        [         R                  " TR                  5       S 5        [         R                  " TR                  S 5        [         R                  " TR                  5       S 5        [         R                  " TR                  [         R                  :H  S 5        [         R                  " TR                  S 5        [         R                  " TR                  5       S 5        [         R                  " TR                  [         R                  :H  S 5        [         R                  " TR                  S 5        [         R                  " TR                  5       S 5        [         R                  " TR                  [         R                  :H  S5        [         R                  " TT-  S:H  UU4S  j5        TT-  U	-  m[         R                  " TR                  5       T:  UU4S! j5        [         R                  " TT4[         R                  T R                  S"9$ )#Nr   r   rK   r   c                  2   > STR                  S5       ST  3$ r   r   r   s   r   r2   r?   D  r   r   c                  0   > ST R                  5        ST 3$ )Nz Shape mismatch: s_tok.numel() = r   r   )r   r   s   r   r2   r?   H  s    25;;=/VHUr   rG   c                  2   > STR                  S5       ST  3$ r   r   r   s   r   r2   r?   N  r   r   c                     > ST ST  3$ r   r   r   s   r   r2   r?   R  r   r   c                  8   > STR                  S5       ST ST  3$ r   r   r   s   r   r2   r?   V  r   r   r   )r   r   c                     > ST  3$ r   r   r   s   r   r2   r?   [  s    3J9+1Vr   c                  0   > ST R                  5        ST 3$ )NzShape mismatch: s_ch.numel() = r   r   )r   r   s   r   r2   r?   `  s    1$**,{6(Sr   c                  2   > STR                  S5       ST  3$ r   r   r   s   r   r2   r?   d  r   r   c                  2   > ST R                  S5       ST 3$ )Nz"Shape mismatch: s_group.size(1) = rG   r   r   )r   r   s   r   r2   r?   i  s    8a8IU[T\]r   c                  2   > ST ST R                  S5       3$ )Nr   z' is not divisible by s_group.size(0) = r   r   )r   r   s   r   r2   r?   m  s    ix'Nw||\]N_`r   c                     > ST ST  3$ )NzShape mismatch: size_n = r   r   r   s   r   r2   r?   s  s    +F83Em_Ur   c                      gr   r   r   r   r   r2   r?   w  r   r   c                      gr   r   r   r   r   r2   r?   x  r   r   c                      gr   r   r   r   r   r2   r?   {  r   r   c                      gr   r   r   r   r   r2   r?   }  r   r   c                      g)Nzs_tok is not on GPUr   r   r   r   r2   r?     s    (=r   c                      g)Nzs_tok is not contiguousr   r   r   r   r2   r?     s    0Ir   c                      g)Nzs_tok's dtype is not float32r   r   r   r   r2   r?     s    7Ur   c                      g)Nzs_ch is not on GPUr   r   r   r   r2   r?     r   r   c                      g)Nzs_ch is not contiguousr   r   r   r   r2   r?     r   r   c                      g)Nzs_ch's dtype is not float32r   r   r   r   r2   r?     s    6Sr   c                      g)Nzs_group is not on GPUr   r   r   r   r2   r?     s    *Ar   c                      g)Nzs_group is not contiguousr   r   r   r   r2   r?     s    2Mr   zs_group's dtype is not float16c                     > ST ST  3$ r   r   r   s   r   r2   r?     r   r   c                  0   > STR                  5        ST  3$ )Nzworkspace.numel() = r   r   r   s   r   r2   r?     s    &y'8&99XYkXlmr   r   )r   r4   r   r   r   r   rB   float32rO   r   r   )r   r   r   r   r   r   r   r   r   r   PACK_FACTORr   r   r   r   r   s   `````````  @@@@@r   rW   rW   0  sk    ILOK 
LL!&&)M 
LL%++-U 
LL!&&)M 
LLaP 
LL	9	!3!3A!66~ mmo*',,q/0II	LLi')VW 
LL

S 
LL1	)Q.n BLLOv%]	
 	W\\!_$)`	

 #''*i7;FM	LL-U 
LL56	LL"$AB 
LL&&(MN	LL##%'P
 
LL =>	LL$$&(IJ	LL-/UV 
LL;<	LL##%'GH	LLu}},.ST 
LL"AB	LL&&(*MN	LL%--/1QR 
LL"V !L0OC	LL//m
 ;;'u}}QXXNNr   inputinput_scaleweightweight_scalebias	out_dtypec                 r    [         R                  R                  R                  R	                  U UUUUU5      $ )a  
CUTLASS-based row-wise scaled W4A8 linear operator.
Args:
    input: quantized input tensor, in row-major layout.
    input_scale: scale factors for input tensor, has to be tensor of the same shape as the input tensor, minus the last dimension.
    weight: quantized weight matrix, in row-major layout.
    weight_scale: scale factors for weight tensor, one value per row of weight matrix (thus also tensor of the same shape as the weight tensor, minus the last dimension).
    bias: an optional vector of size equal to number of rows of weight tensor, or None.
    out_dtype: optional data type for output tensor.
Returns:
    output: result tensor, in row-major layout.
)r   r5   r   "rowwise_scaled_linear_cutlass_s8s4r7   r  r  r  r  r  r  s         r   r  r    s9    * 99??GG r   z+torchao::rowwise_scaled_linear_cutlass_s8s4c                     Ub  UOUR                   nU R                  n[        R                  " / U R                  S S QUR                  S   P7XgS9$ Nr   r   r   rB   r   r   r   rR   r  r  r  r  r  r  rB   r   s           r   rW   rW     N     #.IK4E4EE\\F;;;Sb);6<<?;5XXr   c                 l    [         R                  R                  R                  R	                  XX#XE5      $ )a  
CUTLASS-based row-wise scaled W4A4 linear operator.
Args:
    input: quantized input tensor, in row-major layout.
    input_scale: scale factors for input tensor, has to be tensor of the same shape as the input tensor, minus the last dimension.
    weight: quantized weight matrix, in row-major layout.
    weight_scale: scale factors for weight tensor, one value per row of weight matrix (thus also tensor of the same shape as the weight tensor, minus the last dimension).
    bias: an optional vector of size equal to number of rows of weight tensor, or None.
    out_dtype: optional data type for output tensor.
Returns:
    output: result tensor, in row-major layout.
)r   r5   r   "rowwise_scaled_linear_cutlass_s4s4r7   r  s         r   r&  r&    s.    * 99??GGF$ r   z+torchao::rowwise_scaled_linear_cutlass_s4s4c                     Ub  UOUR                   nU R                  n[        R                  " / U R                  S S QUR                  S   P7XgS9$ r!  r"  r#  s           r   rW   rW     r$  r   weight_metac           	      n    [         R                  R                  R                  R	                  XX#XEU5      $ )a  
CUTLASS-based row-wise scaled F8F8 linear operator, for sparsified weight case.
Args:
    input: quantized input tensor, in row-major layout.
    input_scale: scale factors for input tensor, has to be tensor of the same shape as the input tensor, minus the last dimension.
    weight: sparsified quantized weight matrix, in row-major layout.
    weight_meta: sparsify metadata for weight tensor.
    weight_scale: scale factors for weight tensor, one value per row of weight matrix (thus also tensor of the same shape as the weight tensor, minus the last dimension).
    bias: an optional vector of size equal to number of rows of weight tensor, or None.
    out_dtype: optional data type for output tensor.
Returns:
    output: result tensor, in row-major layout.
)r   r5   r   )rowwise_scaled_linear_sparse_cutlass_f8f8r7   )r  r  r  r(  r  r  r  s          r   r*  r*    s0    . 99FFNNFY r   z2torchao::rowwise_scaled_linear_sparse_cutlass_f8f8c                     Ub  UOUR                   nU R                  n[        R                  " / U R                  S S QUR                  S   P7XxS9$ r!  r"  )	r  r  r  r(  r  r  r  rB   r   s	            r   rW   rW     sN     #.IK4E4EE\\F;;;Sb);6<<?;5XXr   c                 h    [         R                  R                  R                  R	                  U 5      $ )a  
CUTLASS-based conversion from sparsified input tensor to corresponding compressed tensor, along with corresponding metadata tensor.
Args:
    weight: input tensor, in row-major layout.
Returns:
    weight_compressed: compressed weight tensor, with sparsity eliminated, in row-major layout.
    weight_meta: metadata tensor, describing the sparsity structure of the input tensor, also in row-major layout.
)r   r5   r   )to_sparse_semi_structured_cutlass_sm9x_f8r7   r  s    r   r-  r-  &  s%     99FFNNvVVr   z2torchao::to_sparse_semi_structured_cutlass_sm9x_f8c                     U R                  U S   U S   S-  5      U R                  U S   [        U S   S-  S5      [        R                  S94$ )Nr   rG   r9   rK   r   rA   )rS   maxr   charr.  s    r   rW   rW   5  sV     	F1IN3Cq	Q$;5::N r   input_tensormetadata_format
activation	algorithmc           	      T    [         R                  R                  R                  XX#XES9$ )N)rB   r_   )r   r5   r   sparse24_sm90_sparsify)r2  r3  r4  r5  rB   r_   s         r   r7  r7  B  s-     9933zE 4  r   ztorchao::sparse24_sm90_sparsifyc                 <   Ub  UOU R                   n[        R                  " U R                  S   U R                  S   S-  4UU R                  S9[        R                  " U R                  S   U R                  S   S-  4[        R
                  U R                  S94$ )Nr   rG   r9   r   rK   )rB   r   r   rR   r   rQ   )r2  r3  r4  r5  rB   r_   r  s          r   rW   rW   O  s     *0B0BI"L$6$6q$9Q$>?&&	

 	"L$6$6q$9Q$>?++&&	
 r   abb_scaleswizzle_sizeswizzle_axissm_countc                 ^    [         R                  R                  R                  U UUUUUUUS9$ )N)rf   r;  r<  r=  r>  )r   r5   r   sparse24_fp8_sm90_cutlass_gemmr9  r   r:  rf   r;  r<  r=  r>  s           r   r@  r@  g  s>     99;;		!! < 	 	r   z'torchao::sparse24_fp8_sm90_cutlass_gemmc                     [         R                  " U R                  S   UR                  S   4[         R                  U R                  S9$ )Nr   rG   r   )r   r   rR   rP   r   rA  s           r   rW   rW   }  s4     ;;
AGGAJ/u~~ahhWWr   mat1mat2mat1_is_swizzledmat2_is_swizzledc                 j    [         R                  R                  R                  R	                  XX#5      $ zH
Similar to torch.mm but Tensor inputs can be SwizzleTensor instances.

)r   r5   r   
swizzle_mmr7   rC  rD  rE  rF  s       r   rI  rI    s-     99''//$ r   ztorchao::swizzle_mmc                 Z    U R                  U R                  S   UR                  S   5      $ Nr   rG   rS   rR   rJ  s       r   rW   rW     s%     >>$**Q-A77r   scale_ascale_bscale_resultc	                 x    [         R                  R                  R                  R	                  U UUUUUUUU5	      $ rH  )r   r5   r   swizzle_scaled_mmr7   	rC  rD  rE  rF  rN  rO  r  rP  r  s	            r   rR  rR    sB     99..66
 
r   ztorchao::swizzle_scaled_mmc	                 Z    U R                  U R                  S   UR                  S   5      $ rL  rM  rS  s	            r   rW   rW     s%     >>$**Q-A77r   c                      [        [        S5      (       a   [        R                  [        R                  4$ [        R                  4$ )zGTODO: when e8m0 is hardened and major release lets remove uint8 supportfloat8_e8m0fnu)hasattrr   rQ   rV  r   r   r   _get_dtypesrX    s3     u&''U1122KK>r   c                    ^ ^ [        5       n[        R                  " T R                  U;   U 4S j5        [        R                  " TR                  U;   U4S j5        g )Nc                  "   > ST R                    3$ )Nz4A_scale tensor must be uint8 or float8_e8m0fnu, got rA   )A_scales   r   r2   %_check_scale_dtypes.<locals>.<lambda>      Fw}}oVr   c                  "   > ST R                    3$ )Nz4B_scale tensor must be uint8 or float8_e8m0fnu, got rA   )B_scales   r   r2   r\    r]  r   )rX  r   r4   rB   )r[  r_  allowed_dtypess   `` r   _check_scale_dtypesra    sB     ]N	LL'V 
LL'Vr   ztorchao::mx_fp8_bf16ABr[  r_  c                     [         R                  " U R                  S5      UR                  S5      4[         R                  U R                  S9$ )zMeta impl for mx_fp8_bf16r   rG   r   r   r   r   rP   r   rb  rc  r[  r_  s       r   meta_mx_fp8_bf16rg    s4     ;;q	166!9-U^^AHHUUr   c                     [        X#5        [        R                  R                  R                  R                  XX#5      $ )a	  Defines a matmul between two fp4 tensors w/ MX scales in E8MO and returns a bf16 tensor.

The expected format is fp4_e2m1 specified:
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final.pdf (Section 5.3.3)

Note: The mx scales are E8MO tensors stored in uint8 tensors (for now).
    The layout of the scales is very particular, see:
    https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout


Args:
    A: fp4 tensor (2 fp4 elements are packed into 1 byte -> elem0|elem1)
    B: fp4 tensor (2 fp4 elements are packed into 1 byte -> elem0|elem1)
    A_scale: E8M0 scale tensor for A with groupsize=32 in swizzled layout
    B_scale: E8M0 scale tensor for B with groupsize=32 in swizzled layout

Returns:
    MXN bf16 Tensor

)ra  r   r5   r   mx_fp4_bf16r7   rf  s       r   ri  ri    s/    * )99((00wHHr   ztorchao::mx_fp4_bf16c                     [         R                  " U R                  S5      UR                  S5      4[         R                  U R                  S9$ )zMeta impl for mx_fp4_bf16r   rG   r   re  rf  s       r   meta_mx_fp4_bf16rk    s4     ;;q	166!9-U^^AHHUUr   scalesqzerosc                 j    [         R                  R                  R                  R	                  XU5      $ )z
Prepack weights for DA8W4 linear operator on CPU.
Args:
    weight: weight tensor.
    scales: scales for weight tensor.
    qzeros: zero points for weight tensor.
Returns:
    packed weight, scales, and zero points.
)r   r5   r   da8w4_linear_prepack_cpur7   r  rl  rm  s      r   ro  ro    s'     9955==ffUUr   z!torchao::da8w4_linear_prepack_cpuc                 2    XU[         R                  " 5       4$ r
   )r   r   rp  s      r   rW   rW     s    65<<>11r   input_scalesinput_qzerosweight_scalesweight_qzeroscompensationc	                 x    [         R                  R                  R                  R	                  U UUUUUUUU5	      $ )a  
DA8W4 linear operator on CPU.
Args:
    input: input tensor.
    input_scales: scales for input tensor.
    input_qzeros: zero points for input tensor.
    weight: weight tensor.
    weight_scales: scales for weight tensor.
    weight_qzeros: zero points for weight tensor.
    compensation: compensation tensor for weight.
    bias: optional bias tensor.
    out_dtype: output data type.
Returns:
    output tensor in out_dtype.
)r   r5   r   da8w4_linear_cpur7   )	r  rr  rs  r  rt  ru  rv  r  r  s	            r   rx  rx    sB    4 99--55
 
r   ztorchao::da8w4_linear_cpuc	                     UR                  5       S:X  d   eUR                  S5      UR                  S5      -  S-  n	U R                  " / U R                  S S QU	P7SU06$ )Nrt   r   r   r9   r   rB   )r=   r   rS   rR   )
r  rr  rs  r  rt  ru  rv  r  r  r   s
             r   rW   rW   A  s^     ::<1AQ'!+A??AEKK,AaAyAAr   ztorchao::_scaled_embedding_bagqweightindicesoffsetsw_scalesmodeinclude_last_offsetc                     US:X  d   eUR                   S   S-
  nU R                  XpR                   S   U R                  S9$ )NTr   rG   rA   )rR   rS   rB   )rz  r{  r|  r}  rh   r~  r  
batch_sizes           r   rW   rW   R  sH     $&&&q!A%JZq)9OOr   )rG   )Ng        FN      ?r   r  r   r  r   r  r   r  r   )NN)NNrK   nr   )+	functoolstypingr   r   r   r   Librarylibdefine_CTagneeds_fixed_stride_orderr   r   	lru_cacher'   intr6   rW   floatboolrk   rr   r   r   r   rB   r  r&  r*  r-  strr7  r@  rI  rR  rX  ra  rg  ri  rk  ro  rx  r   r   r   <module>r     s      mmIz2 

{ 

S 

 A 

 V 

 a 

 h 

 h 

 C 

R 

 i 

 B 

b 

 T
 

O
((,,
/
/	0   

O
((,,
/
/	0   

 } 

o 

 P 

 ^
     	
   B /0 &)&)&) &) 	&)
 &) &) &) 1&)Z #'!#<<	< < 	<
 < < E?< < < < < < < < <  !<" #<$ %<~ 23
 #'!#	  	
   E?          !" #$ % 4,f S V & >?J Js Jv J @J0(.<?PS: BC0M0M(.0M<?0MPS0M0M D0Mf  	
      > -.mOmOmO mO 	mO
 mO mO mO mO mO mO /mO`  	
      B ./iOiOiO iO 	iO
 iO iO iO iO iO iO 0iOb "'+  	
 6
 $ > AB "'+YYY Y 	Y
 6
Y $Y Y CY* "'+  	
 6
 $ 4 AB "'+YYY Y 	Y
 6
Y $Y Y CY, "'+  	
  6
 $ 8 HI "'+YYY Y 	Y
 Y 6
Y $Y Y JY"WWfW HI		f	 J	" 



 
 	
 f
 56 
  	 76 !% $
  f	
 f    , =>
 !% $
X
X

X 
X f	
X
 f
X 
X 
X 
X ?
X	
		26	JN		 )*8
88268JN88 +8

  	
   6
 6" $ 8 018
8
8 8 	8
 8 8 6
8 6"8 $8 8 28  
 *+V V6 VF VV V ,V
I6 If Iv I I2 *+V V6 VF VV V ,VVVV V 	V" 782f 2f 2f 2 2 92$$$ $ 	$
 $ $ $ 6
$ {{$N /0BBB B 	B
 B B B 6
B {{B B 1B  45PPP P 	P
 P P P P 6Pr   