
    ho                     \    S SK r SSKJr  S/r " S S\ R                  R
                  5      rg)    N   ) dynamically_quantize_per_channelWeightOnlyInt8QuantLinearc                      ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jr\	S\R                  R                  4S j5       rS	rU =r$ )
r      a  
This class is a replacement for `torch.nn.Linear`. It implements a
mixed dtype matrix multiplication using int8 symmetric per-channel weight quantization.

The primary goal of this class is to leverage int8 quantization for weights to reduce the
memory footprint and computational requirements while performing linear transformations.
This can be particularly beneficial for deploying models in low latency environments

Attributes:
    w_int8 (torch.Tensor): The quantized weights in int8 format.
    scales (torch.Tensor): The scaling factors for each channel to convert the quantized
                           weights back to floating point format during the forward pass.
c                    > UR                  S5      nUR                  S5      n[        TU ]  " U0 UD6  U R                  SU5        U R                  SU5        g)a  
Initializes the WeightOnlyInt8QuantLinear module.

Args:
    *args: Variable length argument list for `torch.nn.Linear`.
    **kwargs: Arbitrary keyword arguments.
              Must include 'w_int8' (int8 quantized weights) and 'scales' (scaling factors).
w_int8scalesN)popsuper__init__register_buffer)selfargskwargsr	   r
   	__class__s        Z/home/james-whalen/.local/lib/python3.13/site-packages/torchao/quantization/weight_only.pyr   "WeightOnlyInt8QuantLinear.__init__   sS     H%H%$)&)Xv.Xv.    xreturnc                 H   UR                  SUR                  S   5      n[        R                  " X@R                  R                  UR                  5      5      U R                  -  nUR                  " / UR                  SS QSP76 nU R                  b  XPR                  -  nU$ )a  
Performs the forward pass of the quantized linear layer, which consists of
mixed dtype matrix multiplication using int8 symmetric per-channel weight quantization.

Args:
    x (torch.Tensor): The input floating point tensor to the quantized linear layer.
    *args: Additional positional arguments.
    **kwargs: Additional keyword arguments.

Returns:
    torch.Tensor: The output floating point tensor after the quantized matrix multiplication
                  and rescale.
N)
viewshapetorchmmr	   todtyper
   reshapebias)r   r   r   r   x_viewys         r   forward!WeightOnlyInt8QuantLinear.forward-   s~     AGGBK(HHV[[^^AGG45CII(qwws|(R(99 NAr   modc                    UR                   n[        USS[        R                  5      u  p4nSu  pgU " UUUR                  SLUR                  5       R                  5       US9nUR                  Ul        UR                  Ul        U? UR                  Ul        [        UR                  5       5      R                  n	UR                  U	5        U$ )a  
Converts a `torch.nn.Linear` module to a `WeightOnlyInt8QuantLinear` module.

This method performs the conversion by dynamically quantizing the weights of the original
floating point linear layer to int8 format and creating a new `WeightOnlyInt8QuantLinear`
instance with these quantized weights and the corresponding scaling factors.

Args:
    mod (torch.nn.Linear): The original `torch.nn.Linear` module to convert.

Returns:
    WeightOnlyInt8QuantLinear: The converted quantized linear module with int8 weights.
i   )   r)   N)r!   r	   r
   )weightr   r   int8r!   t
contiguousin_featuresout_featuresnext
parametersdevicer   )
clsr&   w_fp32r	   r
   _zpfake_in_featuresfake_out_featuresnew_moddevice_to_uses
             r   
from_float$WeightOnlyInt8QuantLinear.from_floatB   s     >D#uzz
 /3+%88:((*
 "oo"//NxxS^^-.55

=!r    )__name__
__module____qualname____firstlineno____doc__r   r   Tensorr$   classmethodnnLinearr:   __static_attributes____classcell__)r   s   @r   r   r      sK    /  5<< * !UXX__ ! !r   )r   utilsr   __all__rD   rE   r   r<   r   r   <module>rJ      s+     3&
'V Vr   