
    hX                     H   S SK JrJr  S SKrS SKJs  Jr  S SKJ	r	  S SK
JrJr  S SKJrJrJrJrJrJr  S SKJrJr  S SKJr  S SKJr  S	S
KJrJrJr  S	SKJ r   S	SK!J"r"   " S S\R
                  RF                  5      r$ S)S\R
                  RJ                  S\&4S jjr'S\R
                  RJ                  4S jr( " S S\5      r) " S S\)5      r* " S S\$5      r+S\R
                  RJ                  4S jr,S\R
                  RJ                  4S jr-S\R\                  S\4S jr/S\0S\R\                  S\4S jr1 " S  S!\)5      r2 " S" S#\$5      r3S\R
                  RJ                  4S$ jr4S\R
                  RJ                  4S% jr5S\0S\R\                  S\4S& jr6 " S' S(\)5      r7g)*    )AnyOptionalN)	is_device)PerGroupPerRow)Int8DynActInt4WeightLinearWeightOnlyInt4Linear_check_linear_int4_k_replace_linear_8da4w_replace_linear_int4 groupwise_affine_quantize_tensor)TorchAODTypeZeroPointDomain)TwoStepQuantizer)get_group_qparams_symmetric   )FakeQuantizeConfigBaseFloat8FakeQuantizeConfigIntxFakeQuantizeConfig)FakeQuantizerBase)_get_qmin_qmaxc                   *  ^  \ rS rSrSr   SS\S\S\S\\   S\\   S	S4U 4S
 jjjr	S\
R                  S	\
R                  4S jrS	\
R                  R                  4S jr\  SS\
R                  R                  S\\   S\\   4S jj5       rSrU =r$ )FakeQuantizedLinear*   aY  
General linear layer with fake quantized weights and activations.

Specific target dtypes, granularity, schemes etc. are specified
through separate configs for weights and activations.

Example usage::

    activation_config = IntxFakeQuantizeConfig(
        dtype=torch.int8,
        granularity="per_token",
        is_symmetric=False,
    )
    weight_config = IntxFakeQuantizeConfig(
        dtype=torch.int4,
        group_size=8,
        is_symmetric=True,
    )
    fq_linear = FakeQuantizedLinear(
        16, 32, False, activation_config, weight_config,
    )
    fq_linear(torch.randn(16))
Nin_featuresout_featuresbiasactivation_configweight_configreturnc                   > [         T	U ]  " UUU/UQ70 UD6  [        R                  R	                  S5        Ub  [
        R                  " U5      U l        OS U l        Ub{  [        U[        5      (       aJ  [        UR                  [        5      (       a+  UR                  nUb  X-  S:w  a  [        SU< SU< S35      e[
        R                  " U5      U l        g S U l        g )Nz,torchao.quantization.qat.FakeQuantizedLinearr   zin_features (z) % group_size (z) must be == 0)super__init__torch_C_log_api_usage_oncer   from_configactivation_fake_quantizer
isinstancer   granularityr   
group_size
ValueErrorweight_fake_quantizer)
selfr   r   r   r   r   argskwargsr+   	__class__s
            Y/home/james-whalen/.local/lib/python3.13/site-packages/torchao/quantization/qat/linear.pyr#   FakeQuantizedLinear.__init__C   s     		
 		

 	
 	$$%ST(->-J-J!.D* .2D* $-)?@@Z))8F F +55
)k.F!.K$&
4  *;)F)F})UD&)-D&    xc                     U R                   b  U R                  U5      nU R                  b  U R                  U R                  5      nOU R                  n[        R                  " XU R
                  5      $ N)r(   r-   weightFlinearr   )r.   r5   ws      r2   forwardFakeQuantizedLinear.forwardl   s\    ))5..q1A%%1**4;;7AAxxdii((r4   c                 x   [         R                  R                  U R                  U R                  U R
                  S LU R                  R                  U R                  R                  S9nU R                  R                  [         R                  " S5      :w  a"  U R                  Ul        U R
                  Ul        U$ )Ndevicedtypemeta)	r$   nnLinearr   r   r   r8   r@   rA   )r.   
new_linears     r2   	to_linearFakeQuantizedLinear.to_linearu   s    XX__IIT!;;%%++## % 

 ;;f!55 $J"iiJOr4   modc           
      T   [        UR                  UR                  UR                  S LUUUR                  R
                  UR                  R                  S9nUR                  R
                  [        R
                  " S5      :w  a"  UR                  Ul        UR                  Ul        U$ )Nr   r   r@   rA   rB   )r   r   r   r   r8   r@   rA   r$   )clsrH   r   r   rE   s        r2   from_linearFakeQuantizedLinear.from_linear   s     )OOHHD /'::$$**""

 ::V 44 #

J!hhJOr4   )r(   r-   )FNN)NN)__name__
__module____qualname____firstlineno____doc__intboolr   r   r#   r$   Tensorr<   rC   rD   rF   classmethodrL   __static_attributes____classcell__r1   s   @r2   r   r   *   s    8 >B:>'.'. '. 	'.
 $$:;'.   67'. 
'. '.R) )%,, )588??    ?C:>	XX__ $$:;   67	 r4   r   rH   enabledc                     [        U [        5      (       a<  U R                  b  XR                  l        U R                  b  XR                  l        ggg)zG
Helper function to enable fake quantization in `FakeQuantizedLinear`.
N)r)   r   r(   rZ   r-   )rH   rZ   s     r2   enable_linear_fake_quantr\      sL     #*++((44;))1$$007%%- 1 ,r4   c                     [        U SS9  g)zH
Helper function to disable fake quantization in `FakeQuantizedLinear`.
F)rZ   N)r\   rH   s    r2   disable_linear_fake_quantr_      s     S%0r4   c                   @    \ rS rSrSrS\\   4S jrS\\   4S jrSr	g)_LegacyQATQuantizer   zE
Base class for sharing common methods across legacy QAT quantizers.
r    c                     g r7    r.   s    r2   #get_activation_fake_quantize_config7_LegacyQATQuantizer.get_activation_fake_quantize_config       r4   c                     g r7   rd   re   s    r2   get_weight_fake_quantize_config3_LegacyQATQuantizer.get_weight_fake_quantize_config   rh   r4   rd   N)
rN   rO   rP   rQ   rR   r   r   rf   rj   rW   rd   r4   r2   ra   ra      s+    X>T5U :P1Q r4   ra   c                     ^  \ rS rSrSrSS\R                  \R                  4S\S\S\R                  S\R                  S	S
4
U 4S jjjr
S\R                  R                  S\S\S	\R                  R                  4S jrS\R                  R                  S\S\S	\R                  R                  4S jrS\R                  R                  4S jrS	\\   4S jrS	\\   4S jrSrU =r$ ) Int8DynActInt4WeightQATQuantizer   z
Quantizer for performing QAT on a model, where linear layers have int8
dynamic per token fake quantized activations and int4 fake quantized
grouped per channel weights.
   F	groupsizepadding_allowed	precisionscales_precisionr    Nc                    > [         TU ]  5         [        R                  R	                  S5        Xl        X l        X0l        X@l        [        R                  U l
        g )Nz9torchao.quantization.qat.Int8DynActInt4WeightQATQuantizer)r"   r#   r$   r%   r&   rp   rq   rr   rs   float32activation_scales_precision)r.   rp   rq   rr   rs   r1   s        r2   r#   )Int8DynActInt4WeightQATQuantizer.__init__   sI     	$$G	
 (%4&/-=+0==(r4   modelr/   r0   c           
      |    [        UU R                  U R                  U R                  U R                  [
        SS9  U$ )NT)copy_weights)r   rp   rq   rr   rs   Int8DynActInt4WeightQATLinearr.   rx   r/   r0   s       r2   prepare(Int8DynActInt4WeightQATQuantizer.prepare   s<     	NN  NN!!)	
 r4   c                 (    U R                  U5        U$ r7   )_convert_qat_linear_8da4wr|   s       r2   convert(Int8DynActInt4WeightQATQuantizer.convert   s     	&&u-r4   modulec           
         UR                  5        GHg  u  p#[        U[        5      (       Ga:  UR                  R                  n[        UR                  UR                  UR                  SLUR                  UR                  R                  UR                  S9n[        XU5        Sn[        U5      u  px[        UR                  UUR                  UR                  S9u  pU
R!                  UR"                  5      n
SSKJn  U" UR                  U	U
UU[(        R*                  UR                  5      nXl
        Xl        Xl        UR                  b  UR                  Ul        GMS  GMV  U R1                  U5        GMj     g)zP
Replace all `Int8DynActInt4WeightQATLinear` with `Int8DynActInt4WeightLinear`.
N)rp   rr   rs      )rr   r   )8_quantized_decomposed_quantize_per_channel_group_wrapper)named_childrenr)   r{   r-   configr   r   r   r   r+   r8   rA   scale_precisionsetattrr   r   tozero_point_precisiontorchao._executorch_opsr   r$   int8scaleszerosr   )r.   r   namechildr   quantized_linearn_bitqminqmaxszpr   q_weights                r2   r   :Int8DynActInt4WeightQATQuantizer._convert_qat_linear_8da4w   sT    "002KD%!>??44;;#=%%&&JJd*$//#ll00%+%;%;$  &67 -e45LL%%$44	 UU6667 TLLJJ%% +3'*+')+&::),1JJ$) * ..u5U 3r4   c                 ,    [        U R                  5      $ r7   )_get_8da4w_activation_configrv   re   s    r2   rf   DInt8DynActInt4WeightQATQuantizer.get_activation_fake_quantize_config&  s    +D,L,LMMr4   c                 B    [        U R                  U R                  5      $ r7   )_get_8da4w_weight_configrp   rs   re   s    r2   rj   @Int8DynActInt4WeightQATQuantizer.get_weight_fake_quantize_config)  s    '8M8MNNr4   )rv   rp   rq   rr   rs   )rN   rO   rP   rQ   rR   r$   ru   rS   rT   rA   r#   rC   Moduler   r}   r   r   r   r   rf   rj   rW   rX   rY   s   @r2   rm   rm      s     %!&(-99 9 ;;	9
  ++9 
9 9$XX__-0<?	XX__-0<?	.6 .6`NX>T5U NO:P1Q O Or4   rm   c                      ^  \ rS rSrSrSSS\R                  \R                  4S\S\S\S	\R                  S
\S\R                  S\R                  SS4U 4S jjjrSS\4S jjrS rSrU =r$ )r{   i-  a  
This module implements a linear layer with int8 dynamic per token fake
quantized activations with int4 fake quantized grouped per channel weights.

args:
    groupsize: the number of elements in each quantized group for weights
    precision: precision of weights
    scales_precision: precision of per group scales and zero points

Note: we hardcode activation scales to use torch.fp32, but allow users to specify the weight scales (defaults to torch.fp32).
To get an exact numerical match with Int8DynamicActivationInt4WeightConfig, users must use the same dtype for both the weights
and the scales. Here scales_precision refers specifically to the weight scales only, not the activation scales.
FNro   r   r   r   r@   rp   rr   rs   r    c           
      t   > [        [        R                  5      n[        XW5      n	[        T
U ]  UUUUU	UUS9  g )Nr?   )r   r$   ru   r   r"   r#   )r.   r   r   r   r@   rp   rr   rs   r   r   r1   s             r2   r#   &Int8DynActInt4WeightQATLinear.__init__<  sE     9G0M 	 	
r4   rZ   c                 D    XR                   l        XR                  l        g r7   r(   rZ   r-   r.   rZ   s     r2   enable_fake_quant/Int8DynActInt4WeightQATLinear.enable_fake_quantU      18&&.-4""*r4   c                 &    U R                  S5        g NFr   re   s    r2   disable_fake_quant0Int8DynActInt4WeightQATLinear.disable_fake_quantY      u%r4   rd   T)rN   rO   rP   rQ   rR   r$   ru   rS   rT   r@   rA   r#   r   r   rW   rX   rY   s   @r2   r{   r{   -  s    $ #!&(-

 
 	

 
 
 ;;
  ++
 

 
25 5& &r4   r{   c                 P    [        U [        5      (       a  U R                  5         gg)zL
(deprecated) Enable fake quantization for `Int8DynActInt4WeightQATLinear`.
N)r)   r{   r   r^   s    r2   enable_8da4w_fake_quantr   ^  s#     #455 6r4   c                 P    [        U [        5      (       a  U R                  5         gg)zM
(deprecated) Disable fake quantization for `Int8DynActInt4WeightQATLinear`.
N)r)   r{   r   r^   s    r2   disable_8da4w_fake_quantr   g  s#     #455  6r4   qparams_precisionr    c                     U [         R                  :X  d   e[        [         R                  SSSU U [         R                  " U 5      R
                  S9$ )zX
Return the activation `IntxFakeQuantizeConfig` for `Int8DynActInt4WeightQATQuantizer`.
	per_tokenFT)rA   r*   is_symmetric
is_dynamicr   r   eps)r$   ru   r   r   finfor   )r   s    r2   r   r   o  sL     ---!jj).KK)*.. r4   r+   c           	      :    [        [        R                  U SSUUS9$ )zT
Return the weight `IntxFakeQuantizeConfig` for `Int8DynActInt4WeightQATQuantizer`.
T)rA   r+   r   r   r   r   )r   r   INT4r+   r   s     r2   r   r     s)     "). r4   c                     ^  \ rS rSrSrSS\R                  \R                  4S\S\\   S\R                  S\R                  S	S
4
U 4S jjjr
S\R                  R                  S\S\S	\R                  R                  4S jrS\R                  R                  S\S\S	\R                  R                  4S jrS\R                  R                  4S jrS	\\   4S jrSrU =r$ )Int4WeightOnlyQATQuantizeri  zt
Quantizer for performing QAT on a model, where linear layers have
int4 fake quantized grouped per channel weights.
ro      rp   inner_k_tilesrr   rs   r    Nc                    > [         TU ]  5         [        R                  R	                  S5        US;   d   eUS;   d   eX l        Xl        X0l        X@l        g )Nz3torchao.quantization.qat.Int4WeightOnlyQATQuantizer)   r   r   )    @      ro   )	r"   r#   r$   r%   r&   r   rp   rr   rs   )r.   rp   r   rr   rs   r1   s        r2   r#   #Int4WeightOnlyQATQuantizer.__init__  sZ     	$$A	
 	)))....*"" 0r4   rx   r/   r0   c                 ~    [        UU R                  U R                  SU R                  U R                  [
        SS9  U$ )NT)rq   rr   rs   linear_classrz   )r   rp   r   rr   rs   Int4WeightOnlyQATLinearr|   s       r2   r}   "Int4WeightOnlyQATQuantizer.prepare  s?     	NN nn!220		
 r4   c                 (    U R                  U5        U$ r7   )_convert_qat_linear_4wr|   s       r2   r   "Int4WeightOnlyQATQuantizer.convert  s     	##E*r4   r   c                    UR                  5        GH  u  p#[        U[        5      (       Ga  UR                  nUR                  nUR
                  nUR                  R                  n[        UUSUR                  UUR                  R                  UR                  [        UR                  5       5      R                  S9n[!        XU5        Sn	[#        UR                  U	UR                  5      u  p[%        U
R                  R&                  S5      (       aX  [(        R*                  R,                  R/                  U
R1                  UR                  R                  5      UR
                  5      n
OW[(        R*                  R,                  R3                  U
R1                  UR                  R                  5      UR
                  5      n
Xl
        Xl        GM  U R7                  U5        GM     g)zD
Replace all `Int4WeightOnlyQATLinear` with `WeightOnlyInt4Linear`.
F)r   rp   r   rr   rs   r@   r   cpuN)r   r)   r   r   r   r   r-   r   r	   r+   r8   rA   r   next
parametersr@   r   r   r   typer$   opsaten#_convert_weight_to_int4pack_for_cpur   _convert_weight_to_int4packscales_and_zerosr   )r.   r   r   r   r   r   r   r   r   r   r   r   s               r2   r   1Int4WeightOnlyQATQuantizer._convert_qat_linear_4w  s    "002KD%!899#//$11 % 3 344;;#7 $//"/#ll00%+%;%; 0 0 23::	$  &67 /OLL%%0,
 X__11599$yy~~QQ ELL$7$78++ H
  %yy~~II ELL$7$78++ H +3'4D1++E2M 3r4   c                 B    [        U R                  U R                  5      $ r7   )_get_4w_weight_configrp   rs   re   s    r2   rj   :Int4WeightOnlyQATQuantizer.get_weight_fake_quantize_config  s    $T^^T5J5JKKr4   )rp   r   rr   rs   )rN   rO   rP   rQ   rR   r$   bfloat16rS   r   rA   r#   rC   r   r   r}   r   r   r   rj   rW   rX   rY   s   @r2   r   r     s     '(!&(-11  }1 ;;	1
  ++1 
1 1$XX__-0<?	XX__-0<?	*3UXX__ *3XL:P1Q L Lr4   r   c                      ^  \ rS rSrSrSSSS\R                  \R                  4S\S\S	\S
\R                  S\S\S\R                  S\R                  SS4U 4S jjjrSS\4S jjrS rSrU =r$ )r   i  a|  
This module implements a linear layer with int4 fake quantized grouped
per channel weights, with forward numerics matching `WeightOnlyInt4Linear`,
which uses the efficient int4 tinygemm kernel.

args:
    groupsize: the number of elements in each quantized group for weights
    precision: precision of weights
    scales_precision: precision of per group scales and zero points
FNro   r   r   r   r   r@   rp   r   rr   rs   r    c	           
         > U[         R                  :X  d   S5       e[        XU5      (       d  [        S5      eX`l        [        XX5      n	[        T
U ]  UUUS U	UUS9  g )Nz!only bf16 is supported for scalesz'Padding for QAT 4w is not supported yetrJ   )r$   r   r
   r,   r   r   r"   r#   )r.   r   r   r   r@   rp   r   rr   rs   r   r1   s             r2   r#    Int4WeightOnlyQATLinear.__init__  sm      5>>1V3VV1#KMJJFGG*-iJ"' 	 	
r4   rZ   c                 D    XR                   l        XR                  l        g r7   r   r   s     r2   r   )Int4WeightOnlyQATLinear.enable_fake_quant  r   r4   c                 &    U R                  S5        g r   r   re   s    r2   r   *Int4WeightOnlyQATLinear.disable_fake_quant  r   r4   )r   r   )rN   rO   rP   rQ   rR   r$   r   rS   rT   r@   rA   r#   r   r   rW   rX   rY   s   @r2   r   r     s    	 #!&(-

 
 	

 
 
 
 ;;
  ++
 

 
45 5& &r4   r   c                 P    [        U [        5      (       a  U R                  5         gg)zF
(deprecated) Enable fake quantization for `Int4WeightOnlyQATLinear`.
N)r)   r   r   r^   s    r2   enable_4w_fake_quantr   $  s#     #.// 0r4   c                 P    [        U [        5      (       a  U R                  5         gg)zG
(deprecated) Disable fake quantization for `Int4WeightOnlyQATLinear`.
N)r)   r   r   r^   s    r2   disable_4w_fake_quantr   -  s#     #.//  0r4   c           
      X    [        [        R                  U SSUU[        R                  S9$ )zN
Return the weight `IntxFakeQuantizeConfig` for `Int4WeightOnlyQATQuantizer`.
FT)rA   r+   r   r   r   r   zero_point_domain)r   r$   uint4r   FLOATr   s     r2   r   r   5  s0     "kk).)// r4   c                   \   \ rS rSrSrS\R                  4S\\   S\R                  4S jjr
S\R                  R                  S\S	\S
\R                  R                  4S jrS\R                  R                  S\S	\S
\R                  R                  4S jrS
\\   4S jrS
\\   4S jrSrg)Float8ActInt4WeightQATQuantizeriL  a  
QAT quantizer for applying dynamic rowwise float8 activation + int4
per group/channel symmetric weight fake quantization to linear layers
in the model. Currently only supports rowwise granularity for float8
activations.

args:
    group_size (Optional[int]): the number of elements in each quantized
        group for weights, defaults to 64. Use None for per channel.
    scale_precision: precision of weight scales, defaults to torch.bfloat16.
r   r+   r   c           	          [         R                  R                  S5        Ub  SnOSn[        [         R                  [        5       S9U l        [        [         R                  UUSSUS9U l	        g )Nz8torchao.quantization.qat.Float8ActInt4WeightQATQuantizer	per_groupper_channel)rA   r*   T)rA   r*   r+   r   r   r   )
r$   r%   r&   r   float8_e4m3fnr   _activation_configr   int4_weight_config)r.   r+   r   weight_granularitys       r2   r#   (Float8ActInt4WeightQATQuantizer.__init__Y  sm    
 	$$F	
 !!,!.":%%#
 5***!+
r4   rx   r/   r0   r    c                    UR                  5        Hv  u  pE[        U[        R                  R                  5      (       a7  [
        R                  UU R                  U R                  S9n[        XU5        Me  U R                  U5        Mx     U$ )z
Swap all `nn.Linear` with `FakeQuantizedLinear` with float8
fake quantizer for activations and int4 fake quantizer for weights.
)r   r   )r   r)   r$   rC   rD   r   rL   r   r   r   r}   )r.   rx   r/   r0   r   r   rE   s          r2   r}   'Float8ActInt4WeightQATQuantizer.preparer  sw     !//1KD%110<<&*&=&="&"5"5 = 

 Z0U# 2 r4   c                     [         er7   NotImplementedErrorr|   s       r2   r   'Float8ActInt4WeightQATQuantizer.convert  s
     "!r4   c                     [        S5      e)Nz,Float8 FakeQuantizeConfig does not exist yetr   re   s    r2   rf   CFloat8ActInt4WeightQATQuantizer.get_activation_fake_quantize_config  s    !"PQQr4   c                     U R                   $ r7   )r   re   s    r2   rj   ?Float8ActInt4WeightQATQuantizer.get_weight_fake_quantize_config  s    !!!r4   )r   r   N)rN   rO   rP   rQ   rR   r$   r   r   rS   rA   r#   rC   r   r   r}   r   r   rf   rj   rW   rd   r4   r2   r   r   L  s    
 %'',~~
SM
 
2XX__-0<?	("XX__"-0"<?"	"
RX>T5U R":P1Q "r4   r   r   )8typingr   r   r$   torch.nn.functionalrC   
functionalr9   torchao.dtypes.utilsr    torchao.quantization.granularityr   r   )torchao.quantization.linear_quant_modulesr   r	   r
   r   r   r   %torchao.quantization.quant_primitivesr   r   torchao.quantization.unifiedr   torchao.quantization.utilsr   fake_quantize_configr   r   r   fake_quantizerr   utilsr   rD   r   r   rT   r\   r_   ra   rm   r{   r   r   rA   r   rS   r   r   r   r   r   r   r   rd   r4   r2   <module>r     s   !    * =  : B 

q%((// ql 8	881588?? 1	* 	"aO': aOH-&$7 -&b   !%((// !{{&{{ ,ZL!4 ZLz+&1 +&^ ehhoo  !uxx !{{ .C"&9 C"r4   