
    ȅi                        S SK r S SKrS SKJr  S SKJrJr  S SKJrJ	r	  \	" SS5      r
\R                  \R                  \R                  \R                  \R                  /r\R"                  \R$                  /r\ V s0 s H9  o \R)                  U 5      R*                  \R)                  U 5      R,                  4_M;     sn r\R1                  \ V s0 s HE  o \" \R5                  U 5      R*                  5      \" \R5                  U 5      R,                  5      4_MG     sn 5        S r\
R9                  S5        \" \
S	S
5      S\R:                  S\S\S\S\S\R>                  S\R:                  4S j5       r \" \
S	S5      S\R:                  S\S\S\S\S\R>                  S\R:                  4S j5       r!\
R9                  S5        \" \
SS
5      S\R:                  S\R:                  S\R:                  S\S\S\R>                  S\R:                  4S j5       r"\" \
SS5      S\R:                  S\R:                  S\R:                  S\S\S\R>                  S\R:                  4S j5       r#\
R9                  S5        \" \
SS
5      S\R:                  S\R:                  S\R:                  S\R:                  S\R:                  S\R>                  S\R:                  4S j5       r$\" \
SS5      S\R:                  S\R:                  S\R:                  S\R:                  S\R:                  S\R>                  S\R:                  4S j5       r%\
R9                  S5        \" \
SS
5      SS.S\R:                  S\S\S\S\S\R>                  S \R>                  S-  S\R:                  4S! jj5       r&\" \
SS5      SS.S\R:                  S\R:                  S\R:                  S\S\S\R>                  S \R>                  S-  S\R:                  4S" jj5       r'\
R9                  S#5        \" \
S$S
5      SS.S\R:                  S\R:                  S\R:                  S\S\S\R>                  S \R>                  S-  S\R:                  4S% jj5       r(\" \
S$S5      SS.S\R:                  S\R:                  S\R:                  S\S\S\R>                  S \R>                  S-  S\R:                  4S& jj5       r)\
R9                  S'5        \" \
S(S
5      SS.S\R:                  S\R:                  S\R:                  S\R:                  S\R:                  S\R>                  S \R>                  S-  S\R:                  4S) jj5       r*\" \
S(S5      SS.S \R>                  S-  S\R:                  4S* jj5       r+\
R9                  S+5        \" \
S,S
5      S\R:                  S-\S.\S/\S\R>                  S\,\R:                  \R:                  4   4S0 j5       r-\
R9                  S15        \" \
S2S
5      S\R:                  S-\S.\S/\S\R>                  S\,\R:                  \R:                  4   4S3 j5       r.\" \
S,S5      S\R:                  S\S\S/\S\R>                  S\,\R:                  \R:                  4   4S4 j5       r/\" \
S2S5      S\R:                  S\S\S/\S\R>                  S\,\R:                  \R:                  4   4S5 j5       r0S6 r1\
R9                  S75        \" \
S8S
5      S\R:                  S9\R:                  S:\R:                  S;\S\S\S\R>                  S\R:                  4S< j5       r2\" \
S8S5      S\R:                  S9\R:                  S:\R:                  S;\S\S\S\R>                  S\R:                  4S= j5       r3\
R9                  S>5        \" \
S?S
5      SS.S\R:                  S9\R:                  S:\R:                  S-  S;\S\S\S\R>                  S \R>                  S-  S\R:                  4S@ jj5       r4\" \
S?S5      SS.S\R:                  S9\R:                  S:\R:                  S-  S;\S\S\S\R>                  S \R>                  S-  S\R:                  4SA jj5       r5\
R9                  SB5        \" \
SCS
5      S\R:                  S\R>                  S\,\R:                  \R:                  4   4SD j5       r6\" \
SCS5      S\R:                  S\R>                  S\,\R:                  \R:                  4   4SE j5       r7\
R9                  SF5        \" \
SGSH5      S\R:                  S\R>                  S\,\R:                  \R:                  4   4SI j5       r8\
R9                  SJ5        \" \
SKS
5      S\R:                  S\R>                  S\,\R:                  \R:                  4   4SL j5       r9\" \
SKS5      S\R:                  S\R>                  S\,\R:                  \R:                  4   4SM j5       r:SN r;\
R9                  SO5        \" \
SPS
5      S\R:                  S9\R:                  S:\R:                  S\S\S\R>                  4SQ j5       r<\" \
SPS5      S\R:                  S9\R:                  S:\R:                  S\S\S\R>                  4SR j5       r=\
R9                  SS5        \" \
STS
5      \R|                  4S\R:                  S9\R:                  S:\R:                  S\S\S\R>                  SU\R>                  4SV jj5       r?\" \
STS5      \R|                  4S\R:                  S9\R:                  S:\R:                  S\S\S\R>                  SU\R>                  4SW jj5       r@\
R9                  SX5        \" \
SYS
5       SmS\R:                  S9\R:                  S:\R:                  S\S\S\R>                  4S[ jj5       rA\" \
SYS5       SmS\R:                  S9\R:                  S:\R:                  S\S\S\R>                  4S\ jj5       rB\
R9                  S]5        \" \
S^S
5      SZ\R|                  4S_\R:                  S9\R:                  S:\R:                  S-  S\S\S\R>                  S`\SU\R>                  4Sa jj5       rC\
R9                  Sb5         " Sc Sd\R                  R                  5      rF\" \
SeSf5      S\R:                  S9\R:                  S:\R:                  S;\S\S\S\R:                  4Sg j5       rG\" \
SeS5      S\R:                  S9\R:                  S:\R:                  S;\S\S\S\R:                  4Sh j5       rH\
R9                  Si5        \" \
SjS
5      S\R:                  S\R>                  S\R:                  4Sk j5       rI\" \
SjS5      S\R:                  S\R>                  S\R:                  4Sl j5       rJgs  sn f s  sn f )n    N)_unsqueeze_multiple)determine_qparamsvalidate_qmin_qmax)implLibraryquantized_decomposedDEFc                     U[         ;  a  [        SU 35      e[         U   u  p4X:  a  [        SU SU  35      eX:  a  [        SU SU 35      eg )NzUnsupported dtype: z9quant_min out of bound for dtype, quant_min_lower_bound: z quant_min: z9quant_max out of bound for dtype, quant_max_upper_bound: z quant_max: )_DTYPE_TO_QVALUE_BOUNDS
ValueErrorAssertionError)	quant_min	quant_maxdtypequant_min_lower_boundquant_max_upper_bounds        ^/home/james-whalen/.local/lib/python3.13/site-packages/torch/ao/quantization/fx/_decomposed.py_quant_min_max_bounds_checkr      s    ++.ug6773J53Q0(&&;%<LU
 	

 (&&;%<LU
 	
 )    zxquantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensorquantize_per_tensorCompositeExplicitAutogradinputscale
zero_pointr   r   r   returnc                    U R                   [        R                  [        R                  4;   a  U R	                  [        R
                  5      n U R                   [        R
                  :w  a  [        SU R                    35      e[        X4U5        SU-  n[        R                  " [        R                  " X-  5      U-   X45      R	                  U5      $ )a  Affine quantization for the Tensor using the same quantization parameters to map
from floating point to quantized values

Args:
   input (torch.Tensor): original float32 or bfloat16 Tensor
   scale (float): quantization parameter for affine quantization
   zero_point (int): quantization parameter for affine quantization
   quant_min (int): minimum quantized value for output Tensor
   quant_max (int): maximum quantized value for output Tensor
   dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

Returns:
   Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
   are not stored in the Tensor, we are storing them in function arguments instead
<Expecting input to have dtype torch.float32, but got dtype:       ?)
r   torchfloat16bfloat16tofloat32r   r   clampround)r   r   r   r   r   r   	inv_scales          r   r   r   2   s    0 {{u}}enn55'{{emm#J5;;-X
 	
  	e<eI;;E%&3Ybir   Metac                 0   U R                   [        R                  [        R                  4;   a  U R	                  [        R
                  5      n U R                   [        R
                  :w  a  [        SU R                    35      e[        R                  " XS9$ )Nr   r   )r   r   r    r!   r"   r#   r   
empty_liker   r   r   r   r   r   s         r   quantize_per_tensor_metar,   X   sm     {{u}}enn55'{{emm#J5;;-X
 	
 E//r   zquantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensorzquantize_per_tensor.tensorc                    UR                  5       S:w  a  [        SUR                  5        35      eUR                  5       S:w  a  [        SUR                  5        35      e[        U UR                  5       UR                  5       UUU5      $ zAffine quantization for the Tensor using the same quantization parameters to map
from floating point to quantized values
Same as `quantize_per_tensor` but scale and zero_point are Scalar Tensor instead of
scalar values
   >Expecting zero_point tensor to be one element, but received : 9Expecting scale tensor to be one element, but received : numelr   r   itemr+   s         r   quantize_per_tensor_tensorr5   p   s      QLZM]M]M_L`a
 	
 {{}GW
 	
 

 r   c                    U R                   [        R                  [        R                  4;   a  U R	                  [        R
                  5      n UR                  5       S:w  a  [        SUR                  5        35      eUR                  5       S:w  a  [        SUR                  5        35      eU R                   [        R
                  :w  a  [        SU R                    35      e[        R                  " XS9$ )Nr/   r0   r1   r   r)   )	r   r   r    r!   r"   r#   r3   r   r*   r+   s         r   quantize_per_tensor_tensor_metar7      s     {{u}}enn55'QLZM]M]M_L`a
 	
 {{}GW
 	
 {{emm#J5;;-X
 	
 E//r   zquantize_per_tensor.tensor2(Tensor input, Tensor scale, Tensor zero_point, Tensor quant_min, Tensor quant_max, ScalarType dtype) -> Tensorzquantize_per_tensor.tensor2c                 R   UR                  5       S:w  a  [        SUR                  5        35      eUR                  5       S:w  a  [        SUR                  5        35      e[        U UR                  5       UR                  5       UR                  5       UR                  5       U5      $ r.   r2   r+   s         r   quantize_per_tensor_tensor2r9      s      QLZM]M]M_L`a
 	
 {{}GW
 	
 

 r   c                 "    [        U UUUUU5      $ N)r7   r+   s         r    quantize_per_tensor_tensor2_metar<      s#     + r   zdequantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensordequantize_per_tensor	out_dtyper?   c                    U R                   U:w  a  [        SU SU R                    35      eUc  [        R                  nU[        ;   a  U R                  U5      U-
  U-  $ [        SU 35      e)a  Affine dequantization for the Tensor using the same quantization parameters to map
from quantized values to floating point values

Args:
   input (torch.Tensor): Tensor with dtype matching `dtype` argument,
   e.g. (`torch.uint8`), it is a per tensor quantized Tensor if combined with
   quantization parameters in the argument of this function (scale/zero_point)

   scale (float): quantization parameter for affine quantization

   zero_point (int): quantization parameter for affine quantization

   quant_min (int): minimum quantized value for input Tensor (not used in computation,
   reserved for pattern matching)

   quant_max (int): maximum quantized value for input Tensor (not used in computation,
   reserved for pattern matching)

   dtype (torch.dtype): dtype for input Tensor (not used in computation,
   reserved for pattern matching)

   out_dtype (torch.dtype?): optional dtype for output Tensor

Returns:
   dequantized float32 Tensor
Expecting input to have dtype: 
, but got ,Unsupported dtype in dequantize_per_tensor: )r   r   r   r#   r   r"   r   r   r   r   r   r   r   r?   s          r   r=   r=      sz    J {{e-eWJu{{mL
 	
 MM	'' #j0E99GwOPPr   c                P    Uc  [         R                  n[         R                  " XS9$ Nr)   )r   r#   r*   rD   s          r   dequantize_per_tensor_metarG   &  s$     MM	E33r   zdequantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensorzdequantize_per_tensor.tensorc          
         UR                  5       S:w  a  [        SUR                  5        35      eUR                  5       S:w  a  [        SUR                  5        35      e[        U UR                  5       UR                  5       UUUUS9$ zAffine dequantization for the Tensor using the same quantization parameters to map
from quantized values to floating point values
Same as `dequantize_per_tensor` but scale and zero_point are Scalar Tensor instead of
scalar values
r/   r0   r1   r>   r3   r   r=   r4   rD   s          r   dequantize_per_tensor_tensorrK   <  s    ( QLZM]M]M_L`a
 	
 {{}GW
 	
 !

 r   c                   Uc  [         R                  nUR                  5       S:w  a  [        SUR                  5        35      eUR                  5       S:w  a  [        SUR                  5        35      eU R                  U:w  a  [        SU SU R                   35      eU[
        ;   a  [         R                  " XS9$ [        SU 35      e)Nr/   r0   r1   rA   rB   r)   rC   )r   r#   r3   r   r   r   r*   r   rD   s          r   !dequantize_per_tensor_tensor_metarM   c  s     MM	QLZM]M]M_L`a
 	
 {{}GW
 	
 {{e-eWJu{{mL
 	
 ''77GwOPPr   zdequantize_per_tensor.tensor2(Tensor input, Tensor scale, Tensor zero_point, Tensor quant_min, Tensor quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensorzdequantize_per_tensor.tensor2c          
      P   UR                  5       S:w  a  [        SUR                  5        35      eUR                  5       S:w  a  [        SUR                  5        35      e[        U UR                  5       UR                  5       UR                  5       UR                  5       UUS9$ rI   rJ   rD   s          r   dequantize_per_tensor_tensor2rO     s    ( QLZM]M]M_L`a
 	
 {{}GW
 	
 !

 r   c          
          [        XX#XEUS9$ )Nr>   )rM   rD   s          r   "dequantize_per_tensor_tensor2_metarQ     s     -jY r   zrchoose_qparams.tensor(Tensor input, int quant_min, int quant_max, float eps, ScalarType dtype) -> (Tensor, Tensor)zchoose_qparams.tensorqminqmaxepsc           
         U R                   [        R                  [        R                  [        R                  4;  a  [        SU R                    35      eU[        ;  a#  [        S[        R                  5        SU 35      e[        X5        [        R                  " U 5      u  pV[        UUUUU[        R                  " U/5      SS9$ )3  Given an input Tensor, derive the per tensor affine quantization parameter
(scale and zero_point) for target quantized Tensor from the Tensor

Args:
   input (torch.Tensor): floating point input Tensor
   quant_min (int): minimum quantized value for target quantized Tensor
   quant_max (int): maximum quantized value for target quantized Tensor
   dtype (torch.dtype): dtype for target quantized Tensor

Returns:
   scale (float): quantization parameter for the target quantized Tensor
   zero_point (int): quantization parameter for the target quantized Tensor
CExpecting input to have dtype torch.float32/16/b16, but got dtype: $Expecting target dtype to be one of , but got: F)has_customized_qrange)r   r   r#   r    r!   r   r   keysr   aminmaxr   Tensorr   rR   rS   rT   r   min_valmax_vals          r   choose_qparams_tensorra     s    " {{ 
 QRWR]R]Q^_
 	
 ++23J3O3O3Q2RR]^c]de
 	
 t"}}U+GcU# r   z|choose_qparams_symmetric.tensor(Tensor input, int quant_min, int quant_max, float eps, ScalarType dtype) -> (Tensor, Tensor)zchoose_qparams_symmetric.tensorc                    U R                   [        R                  [        R                  [        R                  4;  a  [        SU R                    35      eU[        ;  a#  [        S[        R                  5        SU 35      e[        X5        [        R                  " U 5      u  pV[        UUUUU[        R                  " U/5      S[        R                  S9$ )rV   rW   rX   rY   F)rZ   qscheme)r   r   r#   r    r!   r   r   r[   r   r\   r   r]   per_tensor_symmetricr^   s          r   choose_qparams_symmetric_tensorre     s    * {{ 
 QRWR]R]Q^_
 	
 ++23J3O3O3Q2RR]^c]de
 	
 t"}}U+GcU#**	 	r   c                    U R                   [        R                  [        R                  [        R                  4;  a  [        SU R                    35      eX:  a  [        SU SU 35      e[        R                  " S[        R                  U R                  S9[        R                  " S[        R                  U R                  S94$ )NrW   zCExpecting quant_min to be smaller than quant_max but received min: z max: r/   r   device)
r   r   r#   r    r!   r   emptydoublerh   int64r   r   r   rT   r   s        r   choose_qparams_tensor_metarm   (  s     {{ 
 QRWR]R]Q^_
 	
 QR[Q\\bclbmn
 	
 ;;qU\\BEKK	U\\E  r   c                     [         R                  " S[         R                  U R                  S9[         R                  " S[         R                  U R                  S94$ )Nr/   rg   )r   ri   rj   rh   rk   rl   s        r   $choose_qparams_symmetric_tensor_metaro   =  sA     ;;qU\\BEKK	U\\E  r   c                     [        [        U R                  5       5      5      nSX!'   XS'   U R                  [	        U5      5      nX24$ )Nr   )listrangedimpermutetuple)xaxisnew_axis_listys       r   _permute_to_axis_zerorz   G  sB    quuw(MM!			%&'Ar   zquantize_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, int quant_min, int quant_max, ScalarType dtype) -> Tensorquantize_per_channelscaleszero_pointsrw   c                    U R                   [        R                  [        R                  4;   a  U R	                  [        R
                  5      n U R                   [        R
                  :w  a  [        SU R                    35      eX0R                  5       :  a  [        SU R                  5        35      e[        XEU5        [        X5      u  pS/U R                  5       -  nUR                  S   US'   UR                  U5      nUR                  U5      n[        R                  " [        R                  " U SU-  -  5      U-   XE5      n	U	R                  [        U5      5      n
U
R	                  U5      $ )a<  Affine per channel quantization for the Tensor using the same quantization
parameters for each channel/axis to map from floating point to quantized values

Args:
   input (torch.Tensor): original float32 or bfloat16 Tensor
   scales (torch.Tensor): a list of scale quantization parameter for
   affine quantization, one per channel
   zero_point (torch.Tensor): a list of zero_point quantization parameter for
   affine quantization, one per channel
   quant_min (int): minimum quantized value for output Tensor
   quant_max (int): maximum quantized value for output Tensor
   dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

Returns:
   Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
   are not stored in the Tensor, we are storing them in function arguments instead
r   Expecting axis to be < r/   r   r   )r   r   r    r!   r"   r#   r   rs   r   rz   shapeviewr$   r%   rt   ru   )r   r|   r}   rw   r   r   r   permute_axis_list	new_shaperesouts              r   r{   r{   U  s'   6 {{u}}enn55'{{emm#J5;;-X
 	
 yy{6uyy{mDEE	e<4UAEeiik!I<<?IaL[[#F""9-K
++ES6\*+k99C ++e-.
/C66%=r   c                    U R                   [        R                  [        R                  4;   a  U R	                  [        R
                  5      n U R                   [        R
                  :w  a  [        SU R                    35      eX0R                  5       :  a  [        SU R                  5        35      e[        XEU5        [        R                  " XS9$ )Nr   r   r)   )
r   r   r    r!   r"   r#   r   rs   r   r*   )r   r|   r}   rw   r   r   r   s          r   quantize_per_channel_metar     s     {{u}}enn55'{{emm#J5;;-X
 	
 yy{6uyy{mDEE	e<E//r   zdequantize_per_channel(Tensor input, Tensor scales, Tensor? zero_points, int axis, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensordequantize_per_channelc                   U R                   U:w  a  [        SU SU R                    35      eUc  [        R                  nX0R	                  5       :  a  [        SU R	                  5        35      e[        XEU5        [        X5      u  pS/U R	                  5       -  n	UR                  S   U	S'   UR                  U	5      nUb  XR                  U	5      -
  U-  n
OX-  n
U
R                  U5      n
U
R                  [        U5      5      nU$ )aO  Affine per channel dequantization for the Tensor using the same quantization
parameters for each channel/axis to map from quantized values to floating point values

Args:
   input (torch.Tensor): Tensor with dtype matching `dtype` argument,
   e.g. (`torch.uint8`), it is a per channel quantized Tensor if combined with
   quantization parameter in the argument of this function (scales/zero_points/axis)

   scales (torch.Tensor): a list of scale quantization parameter for
   affine quantization, one per channel

   zero_points (torch.Tensor): a list of zero_point quantization parameter for
   affine quantization, one per channel

   quant_min (int): minimum quantized value for output Tensor (not used in computation,
   reserved for pattern matching)

   quant_max (int): maximum quantized value for output Tensor (not used in computation,
   reserved for pattern matching)

   dtype (torch.dtype): requested dtype for output Tensor (not used in computation,
   reserved for pattern matching)

   out_dtype (torch.dtype?): optional dtype for output Tensor

Returns:
   dequantized float32 Tensor
rA   , but got dtype: r   r/   r   )r   r   r   r#   rs   r   rz   r   r   r"   rt   ru   )r   r|   r}   rw   r   r   r   r?   r   r   r   r   s               r   r   r     s    P {{e-eW4Eekk]S
 	
 MM	yy{6uyy{mDEE	e<4UAEeiik!I<<?IaL[[#F''	22f<n
&&
C
++e-.
/CJr   c                   U R                   U:w  a  [        SU SU R                    35      eUc  [        R                  nX0R	                  5       :  a  [        SU R	                  5        35      e[        XEU5        [        R                  " XS9$ )NzExpecting input to have dtype r   r   r)   )r   r   r   r#   rs   r   r*   )r   r|   r}   rw   r   r   r   r?   s           r   dequantize_per_channel_metar     s     {{e,UG3DU[[MR
 	
 MM	yy{6uyy{mDEE	e<E33r   zLchoose_qparams_per_token(Tensor input, ScalarType dtype) -> (Tensor, Tensor)choose_qparams_per_tokenc                 h   U R                  5       R                  SSS9nUR                  [        R                  :X  a  UR                  5       nU[        R                  :X  a  SnSUS-
  -  S-
  nO[        SU 35      eUR                  SS	9R                  U5      n[        R                  " U5      nX%4$ )
  Choose quantization parameters for per token quantization. This means for a N dimension Tensor
(M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
every N elements with the same quantization parameter. The dimension for scales/zero_points
will be (M1 * M2 ... * Mn)

Args:
   input (torch.Tensor): original float32/float16 Tensor
   dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor

Returns:
    scales and zero_points, both float32 Tensors
Trs   keepdim      r/   z/unsupported dtype in choose_qparams_per_token: gh㈵>min)absamaxr   r   r    floatint8	Exceptionr$   div
zeros_like)r   r   r|   n_bitsr   r}   s         r   r   r     s    , YY["d3F||u}}$LLN 	 

&1*%)	=eWE
 	
 \\d\#''	2F""6*Kr   c                     [        U R                  S S 5      S/-   n[        R                  " U[        R                  U R
                  S9[        R                  " U[        R                  U R
                  S94$ Nr   r/   rg   rq   r   r   ri   rj   rh   rk   r   r   sizes      r   choose_qparams_per_token_metar   -  ]     CR !QC'D;;t5<<Eu{{EKKH  r   z]_choose_qparams_per_token_asymmetric_impl(Tensor input, ScalarType dtype) -> (Tensor, Tensor))_choose_qparams_per_token_asymmetric_implCompositeImplicitAutogradc                    Su  p#[         R                  " U SSS9n[         R                  " U SSS9n[         R                  " U[         R                  " U5      5      n[         R
                  " U[         R                  " U5      5      n[         R                  " [         R                  5      R                  nXv-
  [        X2-
  5      -  n	U	R                  US9n	Xi-  n
Xy-  nX*-   nX;-   n[         R                  " X-   S:  X*-
  X;-
  5      n[         R                  " XU5      R                  5       nU	R                  [         R                  5      UR                  [         R                  5      4$ )r   )i   r   Tr   r   r   )r   aminr   r   r   maxfinfor#   rT   r   r$   wherer%   r"   float64rk   )r   r   rR   rS   r_   r`   min_val_negmax_val_posrT   r   descaled_mindescaled_maxzero_point_from_min_errorzero_point_from_max_errorr   s                  r   r   r   A  s,   , JDjjB5GjjB5G))GU%5%5g%>?K))GU%5%5g%>?K
++emm
$
(
(C &%*<<EKKCK E &L&L $ 3 $ 3!=AJ
 Zt4::<J88EMM"JMM%++$>>>r   zWchoose_qparams_per_token_asymmetric(Tensor input, ScalarType dtype) -> (Tensor, Tensor)#choose_qparams_per_token_asymmetricc                     [        X5      $ r;   )r   r   r   s     r   r   r   v  s     5UBBr   c                     [        U R                  S S 5      S/-   n[        R                  " U[        R                  U R
                  S9[        R                  " U[        R                  U R
                  S94$ r   r   r   s      r   (choose_qparams_per_token_asymmetric_metar     r   r   c                 ,   [         R                  " [        U R                  5       5      S S 5      nX1R	                  5       :w  a  [        SU SUR                  5        35      eX2R	                  5       :w  a  [        SU SUR                  5        35      eg )Nr   znum_tokens: z	 scales: z zero_points: )mathprodrq   r   r3   r   )r   r|   r}   
num_tokenss       r   !_per_token_quant_qparam_dim_checkr     s    4

-cr23J\\^#|J<yPQQ&&((:,n[5E5E5G4HI
 	
 )r   z}quantize_per_token(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype) -> Tensorquantize_per_tokenc                     [        X4U5        [        XU5        U R                  SU-  5      R                  U5      R	                  5       R                  X45      R                  U5      n U $ )a  Per token quantization for the Tensor using the quantization parameters to map
from floating point to quantized values. This means for a N dimension Tensor
(M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
every N elements with the same quantization parameter. The dimension for scales/zero_points
will be (M1 * M2 ... * Mn)

Args:
   input (torch.Tensor): original float32 or bfloat16 Tensor
   scales (float32 torch.Tensor): quantization parameter for per token affine quantization
   zero_points (int32 torch.Tensor): quantization parameter for per token affine quantization
   quant_min (int): minimum quantized value for output Tensor
   quant_max (int): maximum quantized value for output Tensor
   dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

Returns:
   Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
   are not stored in the Tensor, we are storing them in function arguments instead
r   )r   r   muladdr%   r$   r"   r   r|   r}   r   r   r   s         r   r   r     sX    6  	e<%e[A		#,	[			y	$	E 
 Lr   c                 B    [        X4U5        [        R                  " XS9$ rF   r   r   r*   r   s         r   quantize_per_token_metar     s      	e<E//r   zdequantize_per_token(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, ScalarType output_dtype) -> Tensordequantize_per_tokenoutput_dtypec                 4    X-
  n X-  n U R                  U5      $ )a  Per token dequantization for the Tensor using the quantization parameters to map
from floating point to quantized values. This means for a N dimension Tensor
(M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
every N elements with the same quantization parameter. The dimension for scales/zero_points
will be (M1 * M2 ... * Mn)

Args:
   input (torch.Tensor): quantized Tensor (uint8, int8 etc.)
   scales (float64 torch.Tensor): quantization parameter for per token affine quantization
   zero_points (int64 torch.Tensor): quantization parameter for per token affine quantization
   quant_min (int): minimum quantized value for input Tensor
   quant_max (int): maximum quantized value for input Tensor
   dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor
   output_dtype (torch.dtype): dtype (e.g. torch.float32) for output Tensor

Returns:
   dequantized Tensor with dtype `output_dtype`
)r"   r   r|   r}   r   r   r   r   s          r   r   r     s"    8 ENE88L!!r   c                 B    [        X4U5        [        R                  " XS9$ rF   r   r   s          r   dequantize_per_token_metar     s      	e<E66r   zquantize_per_channel_group(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, int group_size) -> Tensorquantize_per_channel_group   c                    US::  a  [        S5      eX`R                  S   :  a"  UR                  S   S:X  a  U R                  S   nU R                  S   U-  S:w  a  [        S5      eU R                  5       S:w  a  [        S5      eU R                  SU5      n[        R
                  " U5      R                  5       S:w  a  [        S5      eUR                  SS5      nUR                  SS5      nUR                  S	U-  5      R                  U5      R                  5       R                  X45      R                  U5      R                  U 5      nU$ )
Nr/   group_size must be > 1r   r   /input.shape[-1] must be divisible by group_sizer   input must be 2-dimensionalzto_quant must not contain NaNsr   )r   r   rs   reshaper   isnansumr   r   r%   clamp_r"   
reshape_as)	r   r|   r}   r   r   r   
group_sizeto_quant
input_int8s	            r   r   r     s$    Q566KKO#R(8A(=[[_
{{2#q(NOOyy{a:;; }}R,H{{8  "a'=>>^^B"F%%b!,K 	S6\"	[					%	E	E	  r   c                 4   US::  a  [        S5      eX`R                  S   :  a"  UR                  S   S:X  a  U R                  S   nU R                  S   U-  S:w  a  [        S5      eU R                  5       S:w  a  [        S5      e[        R                  " XS9$ )	a  Groupwise quantization within each channel for an 2-d Tensor using the quantization parameters
to map from floating point to quantized values. This means for each row of a 2-d Tensor
(M, N), we calculate scales/zero_points for each `group_size` elements
and quantize every `group_size` elements with the same quantization parameter.
The dimension for scales/zero_points will be (M * ceil(N, group_size),)

Args:
   input (torch.Tensor): original float32 or bfloat16 Tensor
   scales (float32 torch.Tensor): quantization parameter for per channel group affine quantization
   zero_points (int32 torch.Tensor): quantization parameter for per channel group affine quantization
   quant_min (int): minimum quantized value for output Tensor
   quant_max (int): maximum quantized value for output Tensor
   dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

Returns:
   Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
   are not stored in the Tensor, we are storing them in function arguments instead
r/   r   r   r   r   r   r   r)   )r   r   rs   r   r*   )r   r|   r}   r   r   r   r   s          r   quantize_per_channel_group_metar   >  s    8 Q566KKO#R(8A(=[[_
{{2#q(NOOyy{a:;;E//r   zdequantize_per_channel_group(Tensor input, Tensor scales, Tensor? zero_points, int quant_min, int quant_max, ScalarType dtype, int group_size, ScalarType output_dtype) -> Tensordequantize_per_channel_groupw_int8r   c                 \   US::  a  [        S5      eX`R                  S   :  a"  UR                  S   S:X  a  U R                  S   nU R                  S   U-  S:w  a  [        S5      eU R                  5       S:w  a  [        S5      eU R                  SU5      nUR                  SS5      nUb  UR                  SS5      n	O.[        R
                  " / [        R                  UR                  S9n	UR                  U	5      R                  U5      R                  U 5      R                  U5      n
U
$ )	a  Groupwise dequantization within each channel for an 2-d Tensor using the quantization parameters
to map from floating point to quantized values. This means for each row of a 2-d Tensor
(M, N), we calculate scales/zero_points for each `group_size` elements
and quantize every `group_size` elements with the same quantization parameter.
The dimension for scales/zero_points will be (M * ceil(N, group_size),)

Args:
   input (torch.Tensor): quantized Tensor (uint8/int8 etc.)
   scales (float32 torch.Tensor): quantization parameter for per channel group affine quantization
   zero_points (int32 torch.Tensor): quantization parameter for per channel group affine quantization
   quant_min (int): minimum quantized value for input Tensor
   quant_max (int): maximum quantized value for input Tensor
   dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor
   output_dtype (torch.dtype): dtype (e.g. torch.float32) for output Tensor

Returns:
   dequantized Tensor with dtype `output_dtype`
r/   r   r   r   z0w_int8.shape[-1] must be divisible by group_sizer   zw_int8 must be 2-dimensionalrg   )r   r   rs   r   r   zerosint32rh   subr   r   r"   )r   r|   r}   r   r   r   r   r   w_int8_groupedzpw_dqs              r   r   r   m  s	   D Q566LL$$b)9Q)>\\"%
||B*$)OPPzz|q;<<^^B
3N^^B"F  Q'[[5;;v}}Eb!%%f-88@CCLQDKr   zyfake_quant_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, int quant_min, int quant_max) -> Tensorc                   4    \ rS rSr\S 5       r\S 5       rSrg)FakeQuantPerChanneli  c                 .   UR                   [        R                  :w  a  UR                  [        R                  5      nUR                   [        R                  :w  a  UR                  [        R                  5      nUR                   [        R                  :w  a  [        SUR                    35      eXAR                  5       :  a  [        SUR                  5        35      e[        [        U5      5      [        [        US-   UR                  5      5      -   n[        X'5      n[        X75      n	[        R                  " USU-  -  5      U	-   n
[        R                  " XU5      U	-
  U-  n[        R                  " X:  X:*  5      nU R                  U5        U$ )Nr   r   r/   r   )r   r   r#   r"   r   r   rs   rq   rr   ndimr   r%   r$   logical_andsave_for_backward)ctxr   r|   r}   rw   r   r   broadcast_dimsunsqueeze_scalesunsqueeze_zero_pointstempr   masks                r   forwardFakeQuantPerChannel.forward  sC    <<5==(YYu}}-F+%..5K;;%--' Nu{{m\  99; #:599;-!HIIeDk*T%q%**2M-NN.vF 3K P{{5C*:$:;<?TTKK36KK   $"3t7HJd#
r   c                 2    U R                   u  nX-  S S S S S 4$ r;   )saved_tensors)r   gyr   s      r   backwardFakeQuantPerChannel.backward  s&     ##y$dD$66r    N)__name__
__module____qualname____firstlineno__staticmethodr   r   __static_attributes__r   r   r   r   r     s(     . 7 7r   r   fake_quant_per_channelAutogradc                 0    [         R                  XX#XE5      $ r;   )r   applyr   r|   r}   rw   r   r   s         r   r   r     s     $${) r   c                 .    [         R                  " U 5      $ r;   r   r*   r  s         r   fake_quant_per_channel_metar    s     E""r   zFconvert_element_type.no_fuse(Tensor input, ScalarType dtype) -> Tensorzconvert_element_type.no_fusec                 h    [         R                  R                  R                  R	                  X5      $ r;   )r   opsprimsconvert_element_typedefaultr   s     r   r  r    s#     99??//77EEr   c                 *    [         R                  " XS9$ rF   r  r   s     r   convert_element_type_metar    s    E//r   )r   )Kr   r   torch._refsr   torch.ao.quantization.utilsr   r   torch.libraryr   r   quantized_decomposed_libuint8r   uint16int16r   _INTEGER_DTYPESfloat8_e5m2float8_e4m3fn_FLOAT_DTYPESiinfor   r   r   updateintr   r   definer]   r   r   r   r,   r5   r7   r9   r<   r=   rG   rK   rM   rO   rQ   ru   ra   re   rm   ro   rz   r{   r   r   r   r   r   r   r   r   r   r   r   r#   r   r   r   r   r   autogradFunctionr   r   r  r  r  )ks   0r   <module>r!     s     + M '
 ##95A ;;

ELL%++u{{S""E$7$78 :I9HAAEKKN..//    DQRMqU[[^	 #ekk!n&8&8"9::MR
$   @  57RS"<<"" " 	"
 " ;;" \\" T"J  5v>0<<00 0 	0
 0 ;;0 \\0 ?0"   @ :<W<<<<  	
  ;; \\>  <fE0<<0<<0 0 	0
 0 ;;0 \\0 F04   F ;=X<<<<  ||	
 || ;; \\>  =vF<<<<  ||	
 || ;; \\ G,   _  79TU %)0Q<<0Q0Q 0Q 	0Q
 0Q ;;0Q {{T!0Q \\0Q V0Qf  7@ %)4<<4<<4 4 	4
 4 ;;4 {{T!4 \\4 A4   _ " %)<<<<  	
  ;; {{T! \\
D  >G %)Q<<Q<<Q Q 	Q
 Q ;;Q {{T!Q \\Q HQ>   e # %)<<<<  ||	
 || ;; {{T! \\
D  ?H %) {{T! \\ I   7  79TU(<<("(*-(49(BG++(
5<<%&( V(V   7 %
(<<("(*-(49(BG++(
5<<%&(
(V  7@<<$'47>CLQKK
5<<%& A(  A6J<<$'47>CLQKK
5<<%& K   @  68ST.<<.LL. . 	.
 . . ;;. \\. U.b  6?0<<0LL0 0 	0
 0 0 ;;0 \\0 @02   _  8:UV %)=<<=LL= $= 	=
 = = ;;= {{T!= \\= W=@  8&A %)4<<4LL4 $4 	4
 4 4 ;;4 {{T!4 \\4 B4.   R
 
 << ;;  5<<%& 
 F 

<<;; 5<<%&
   c
 /
(?<<(?;;(? 5<<%&(?
(?V   ]
 )
C<<C;;C 5<<%&C
C )

<<;; 5<<%&

   @  46QR#<<#LL# # 	#
 # ;;# S#L  4f=	0<<	0LL	0 	0 		0
 	0 ;;	0 >	0   Y  68ST !&"<<"LL" " 	"
 " ;;" ++" U"B  6? !&7<<7LL7 7 	7
 7 ;;7 ++7 @7   A :<W %<<%LL% % 	%
 % ;;%%P  <fE %0<<%0LL%0 %0 	%0
 %0 ;;%0 F%0P   Z "  %.LL.LL. $. 	.
 . ;;. . ++.
.b   .7%..11 7B  8*E
<<
LL
 
 	

 
 
 \\
 F
  8&A#<<#LL# # 	#
 # # \\# B#   L
 "
F FU[[ FU\\ F
F  >G0U\\ 0%++ 0%,, 0 H0E' Ss   =A w"Aw'