
    h'                     l   S r SSKrSSKJs  Jr  SSKJrJr  / SQr	SS jr
 " S S\R                  R                  5      r " S	 S
\\R                  R                  5      r\R                  R                  \\R                  R                  R                   R"                  \0r S SS jjrSSS jjrS rg)a(  
Testing out accuracy-only implementation of SmoothQuant
(https://arxiv.org/pdf/2211.10438.pdf)
Note: this is an application of input-weight equalization, with the addition that the
multiplication by scale is fused into the preceding layer, specifically for relevant
parts of transformer blocks.
    N   )$_quant_int8_dynamic_per_token_linear dynamically_quantize_per_channel)	get_scaleSmoothFakeDynQuantMixin$SmoothFakeDynamicallyQuantizedLinear!swap_linear_with_smooth_fq_linearsmooth_fq_linear_to_inferenceset_smooth_fq_attributec                     [         R                  " X5      n[         R                  " USU-
  5      nX4-  nUR                  S5      $ )a  
Calculate the scale based on abs(max(X)), abs(max(W)), and alpha.

Args:
    X_absmax (torch.Tensor): Absolute maximum values of the input tensor X.
    W_absmax (torch.Tensor): Absolute maximum values of the weight tensor W.
    alpha (float, optional): Scaling factor. Defaults to 0.5.

Returns:
    torch.Tensor: The calculated scale of dimension `k` if X is of dimension `b*n*k` and W is of dimension `k*m`.
g      ?)torchpowreshape)X_absmaxW_absmaxalphaX_powW_powdivs         Z/home/james-whalen/.local/lib/python3.13/site-packages/torchao/quantization/smoothquant.pyr   r   !   s;     IIh&EIIhe,E
-C;;r?    c                   8    \ rS rSrS rS rS rS rS rS r	Sr
g	)
r   3   c                 l    SU l         S U l        U R                  SS 5        Xl        SU l        SU l        g )NTsmooth_scaleF)calibratingx_running_abs_maxregister_bufferr   debug_skip_scalingstore_w_int_repr_t)selfr   s     r   init_smoothquant_variables2SmoothFakeDynQuantMixin.init_smoothquant_variables4   s9    !%^T2
"' #(r   c                    [        [        [        UR                  5      S-
  5      5      n[        R
                  " [        R                  " U5      US9nU R                  c  X0l        g [        R                  " X0R                  5      U l        g )Nr   dim)	tuplerangelenshaper   amaxabsr   max)r"   Xall_dims_except_lastcur_abs_maxs       r   update_x_running_abs_max0SmoothFakeDynQuantMixin.update_x_running_abs_maxH   sa    $U3qww<!+;%<=jj13GH!!)%0"%*YY{<R<R%SD"r   c                 t   U R                   c   S5       eU R                  nU R                  (       dU  [        R                  " [        R
                  " U R                   5      UR                  SS5      5      R                  SS5      n[        USS[        R                  5      u  p#nUR                  5       nX#U4$ )Nz5self.smooth_scale is None, did you turn on inference?r   r   i   )
r   weightr    r   matmuldiag	transposer   int8
contiguous)r"   W
W_int_reprW_scalesW_zpss        r   get_scaled_quantized_w.SmoothFakeDynQuantMixin.get_scaled_quantized_wQ   s      , 	
C	
, KK
 &&

4,,-q{{1a/@i1o  'GtS%**'
#
e  **,
U**r   c                     [        5       eNNotImplementedErrorr"   s    r   to_inference$SmoothFakeDynQuantMixin.to_inferencej   s    !##r   c                     U R                  5       u  ol        nU R                  (       a3  U R                  SUR	                  SS5      R                  5       5        U ?g U R                  SUR                  5       5        U ?g )Nr=   r   r   )r@   r>   r!   r   r9   r;   r6   )r"   r=   _W_zpss      r   fold_weight#SmoothFakeDynQuantMixin.fold_weightm   sq     -1,G,G,I)
M6 ""  z/C/CAq/I/T/T/VW K   z/D/D/FGKr   c                     [        5       e)z
Sets `self.x_running_abs_max` to a value which will lead to smooth scale
of all ones if `alpha=0.5`, to enable performance benchmarking without
calibration.
rD   rF   s    r   set_debug_x_absmax*SmoothFakeDynQuantMixin.set_debug_x_absmaxy   s     "##r   )r>   r   r   r    r!   r   N)__name__
__module____qualname____firstlineno__r#   r2   r@   rG   rK   rN   __static_attributes__ r   r   r   r   3   s!    ((T+2$
$r   r   c                   R   ^  \ rS rSrSrU 4S jrS r\S	S j5       rS r	S r
SrU =r$ )
r      z
This is a replacement for `torch.nn.Linear` which implements dynamic per-token
activation quantization and dynamic per-channel weight quantization based on
Smoothquant scaling.
c                 j   > UR                  S5      n[        TU ]  " U0 UD6  U R                  U5        g )Nr   )popsuper__init__r#   )r"   argskwargsr   	__class__s       r   r[   -SmoothFakeDynamicallyQuantizedLinear.__init__   s1    

7#$)&)''.r   c                    U R                   (       a>  U R                  U5        [        R                  " XR                  U R
                  5      nU$ U R                  (       d  XR                  -  nU R                  (       a  U R                  OU R                  R                  5       n[        XU R                  U R
                  UR                  5      nU$ rC   )r   r2   Flinearr6   biasr    r   r!   r=   tr   r>   dtype)r"   r/   r\   r]   YW_int_repr_ts         r   forward,SmoothFakeDynamicallyQuantizedLinear.forward   s    ))!,KK3A  ** )))#'#:#:@Q@Q@S  5		177A r   c                 &   Su  p4U " X4UR                   SLUS9nUR                  Ul        UR                  Ul        UR                  Ul        UR                   Ul         [	        UR                  5       5      R                  nUR                  U5        U$ )zv
Converts a `mod` of class `torch.nn.Linear` to the smooth fake quantized
version of it.  Note: requires calibration.
)   rk   N)rc   r   )rc   in_featuresout_featuresr6   next
parametersdeviceto)clsmodr   fake_in_featuresfake_out_featuresnew_moddevice_to_uses          r   
from_float/SmoothFakeDynamicallyQuantizedLinear.from_float   s     /3+chhd6JRW
 "oo"//xxS^^-.55

=!r   c                 8   U R                   c   S5       eSU l        [        U R                   [        R                  " [        R
                  " U R                  R                  SS5      5      SS9R                  U R                  S9U l
        U R                  5         g)zT
Calculates the smoothquant scale based on calibration
in preparation for inference
Nzno calibration data foundFr   r   r&   r   )r   r   r   r   r.   r-   r6   r9   valuesr   r   rK   rF   s    r   rG   1SmoothFakeDynamicallyQuantizedLinear.to_inference   s}    
 %%1N3NN1 %""IIeii 5 5a ;<!DKK**

 	r   c                     [         R                  " [         R                  " U R                  R	                  SS5      5      SS9R
                  nXl        g )Nr   r   r&   )r   r.   r-   r6   r9   r|   r   )r"   w_absmaxs     r   rN   7SmoothFakeDynamicallyQuantizedLinear.set_debug_x_absmax   s8    99UYYt{{'<'<Q'BCKRR!)r   )r   r   r         ?)rP   rQ   rR   rS   __doc__r[   rh   classmethodrx   rG   rN   rT   __classcell__)r^   s   @r   r   r      s5    /
$  &* *r   r   c                 P   [        U R                  5       5      nUR                  5        Hy  u  pVUS:X  a  UnOU SU 3nUb  Xq;  aP  [        U5      [        R                  5       ;   a/  [        [        U5         nUR                  XcS9n	[        XU	5        Mm  [        XaXs5        M{     g)a  
Replaces linear layers in the model with their SmoothFakeDynamicallyQuantizedLinear equivalents.

Args:
    model (torch.nn.Module): The model containing linear layers to be replaced.
    skip_fqn_list (list of str, optional): List of fully qualified names to skip during replacement. Defaults to None.
    cur_fqn (str, optional): The current fully qualified name of the module being processed. Defaults to "".
    alpha (float, optional): The scaling factor for SmoothQuant. Defaults to 0.5.

Returns:
    None
 .Nr{   )	dictnamed_childrenitemstypesource_cls_to_target_clskeysrx   setattrr	   )
modelskip_fqn_listcur_fqnr   name_to_childnamechildnew_fqn
target_cls	new_childs
             r   r	   r	      s      --/0M$**,b=G 	4&)G"(DK388::1$u+>J"--e-AIE+-eGS -r   c                     U R                  5        HZ  u  p#[        U[        [        R	                  5       5      5      (       d  M3  U(       a  UR                  5         UR                  5         M\     g)a  
Prepares the model for inference by calculating the smoothquant scale for each SmoothFakeDynamicallyQuantizedLinear layer.

Args:
    model (torch.nn.Module): The model containing SmoothFakeDynamicallyQuantizedLinear layers.
    debug_skip_calibration (bool, optional): If True, sets the running maximum of activations to a debug value for performance benchmarking.
                                             Defaults to False.

Returns:
    None
N)named_modules
isinstancer(   r   r|   rN   rG   )r   debug_skip_calibration_rs   s       r   r
   r
      sP     %%'c5!9!@!@!BCDD%&&(	 (r   c                     U R                  5        HQ  u  p4[        U[        [        R	                  5       5      5      (       d  M3  [        XA5      (       d  ME  [        XAU5        MS     g rC   )r   r   r(   r   r|   hasattrr   )r   attribute_namenew_attribute_valr   rs   s        r   r   r     sI    %%'c5!9!@!@!BCDDs++->? (r   r   )Nr   r   )returnN)F)r   r   torch.nn.functionalnn
functionalra   utilsr   r   __all__r   Moduler   Linearr   modulesrb   NonDynamicallyQuantizableLinearr   r	   r
   r   rU   r   r   <module>r      s      
$L$ehhoo L$^B*+BEHHOO B*T 
HHOO9	HH;;=a  25T	T@*@r   