
    hP!                     <   S r SSKJrJrJrJrJr  SSKrSSKJ	r	J
r
  SSKJr  SSKJrJr  SSKJrJr  \R&                  r " S S	\5      rS
\S\S\S\\\4   4S jrS\R&                  S\\S4   4S jr   S(S
\S\S\S\S\R0                  S\\   S\\   S\S\4S jjrS\R&                  S\\   S\S\S\S\S\R&                  4S  jrS!\R&                  S\4S" jrS!\R&                  S\4S# jrS$\\\\\\4   \\   4      S\\\4   4S% jrS&\\\4   SS4S' jr g))z;
Defines an nn module designed to be used during inference
    )List
NamedTupleOptionalTupleUnionN)is_row_majorpad_tensor_for_matmul)FP8Granularity)PerRow	PerTensor)is_MI300is_sm_at_least_89c                   D    \ rS rSr% SrSr\\S'   Sr\\S'   Sr	\\S'   Sr
g)	Float8MMConfig   a{  
Configuration for the scaled_mm in the forward and backward pass.

Attributes:
    emulate (bool): Whether to emulate the matmuls in fp32.
    use_fast_accum (bool): Whether to use the fast-accumulation option for scaled_mm.
    pad_inner_dim (bool): Whether to pad the inner dimension of a and b with 0s.
                          This is needed for matmuls not aligned to 16.
Femulateuse_fast_accumpad_inner_dim N)__name__
__module____qualname____firstlineno____doc__r   bool__annotations__r   r   __static_attributes__r       R/home/james-whalen/.local/lib/python3.13/site-packages/torchao/float8/inference.pyr   r      s'     GT ND M4r   r   a_datab_datascaled_mm_configreturnc                    UR                   (       ac  U R                  S5      UR                  S5      :X  d+   SU R                  S5       SUR                  S5       35       e[        U SS9n [        USS9n[        U R	                  5       5      (       d  U R                  5       n [        UR	                  5       5      (       a,  UR                  5       R                  5       R                  5       nX4$ )zPreprocess the inner fp8 data tensors for admmm
Args:
    a_data: Input tensor A.
    b_data: Input tensor B.
    scaled_mm_config: Configuration for _scaled_mm.
Returns:
    Preprocessed tensors A and B in the format for _scaled_mm.
   r   z"Inner dims must match for mm, got z and )dims)r   sizer	   r   stride
contiguoust)r    r!   r"   s      r   preprocess_datar+   ,   s     %%{{1~Q/ 	
0Q0@fkkRSnEUV	
/ 'vA6&vA6((""$FMMO$$&&(**,>r   input_scaleinput_shape.c                     U R                  5       S:X  a  U R                  SS5      $ U R                  S5      n U R                  5       S:  a  U R                  SU R                  S   5      n U $ )z:Ensures input tensor is correctly formatted for _scaled_mmr%      )numelreshape	unsqueezedimshape)r,   r-   s     r   preprocess_scaler6   F   sn     a""1a(( ''+K 1!))"k.?.?.CDr   a_scaleb_scaleoutput_dtypeoutput_scalebiasr   c                     U[         R                  :X  a!  Ub  [         R                  " U UUUUUUS9nX-   $ [         R                  " U UUUUUUUS9$ )z
This is the unwrapped version of addmm_float8, which does not take in Float8TrainingTensors
as inputs. This is used to standardize the logic between subclassed and non subclassed
versions of the linear module.
)scale_ascale_bscale_result	out_dtyper   )r=   r>   r;   r?   r@   r   )torchfloat32
_scaled_mm)	r    r7   r!   r8   r9   r:   r;   r   outputs	            r    addmm_float8_unwrapped_inferencerE   X   so      u}}$)9!!%")
 }!%	 	r   scale
data_shaper4   startendstepc                   ^ ^ [         R                  R                  nT R                  T:X  a  UR                  R                  T X#XE5      $ [        UU 4S j[        [        T5      5       5       5      nU[        U5      :  a  T $ Xr   nUS:X  a  UR                  R                  T X#XE5      $ Ub  X8-  OSn	Ub
  XH-   S-
  U-  OSn
US:  a  [        S5      eUR                  R                  T X)U
S5      $ )z
Slice the scale tensor appropriately based on the data tensor slicing.
This function calculates how the scale should be sliced when the data tensor
is sliced along a given dimension, taking into account the block structure.
c              3   N   >#    U  H  nTU   TR                   U   -  v   M     g 7f)N)r5   ).0irG   rF   s     r   	<genexpr>-_slice_scale_for_dimension.<locals>.<genexpr>   s$     XAWA
1Q7AWs   "%r%   Nz;Slicing with step > 1 is not implemented for scale tensors.)
rA   opsatenr5   sliceTensortuplerangelenNotImplementedError)rF   rG   r4   rH   rI   rJ   rR   block_sizesblock_size_for_dimscale_start	scale_ends   ``         r   _slice_scale_for_dimensionr]      s     99>>D {{j zz  C>> Xs:AWXXK
c+$)Q zz  C>> 6;5Fe1D  %).@@ 	 !8%M  zz  )QGGr   xc                     [        U S5      (       d   S5       e[        U R                  5      SU R                  5       S-
  -  U R                  S   4-   :H  $ )rChecks if a quantized tensor is rowwise scaled
Args:
    x: quantized tensor (should have `block_size` attribute)

block_size.Expecting input to have `block_size` attribute)r%   r%   r/   )hasattrrU   ra   r4   r5   r^   s    r   _is_rowwise_scaledre      sO    
 1l##U%UU#$!%%'A+"6!''"+"GGGr   c                    ^  [        T S5      (       d   S5       e[        U 4S j[        T R                  5       5       5      $ )r`   ra   rb   c              3      >#    U  H=  nTR                   U   S :H  =(       d    TR                   U   TR                  U   :H  v   M?     g7f)r/   N)ra   r5   )rM   rN   r^   s     r   rO   (_is_tensorwise_scaled.<locals>.<genexpr>   s>      HU1Q2>aAGGAJ!>>s   AA)rc   allrV   ndimrd   s   `r   _is_tensorwise_scaledrk      s@    
 1l##U%UU# HMaff  r   granularityc                    S nU c  [        5       [        5       4nU$ [        U [         [        45      (       a  X 4nU$ [        U [        [        45      (       a  [        U 5      S:X  a  [        U S   [         [        45      (       a  [        U S   [         [        45      (       d  [        SU  S35      e[        U S   [        U S   5      5      (       d  [        SU  S35      e[        U 5      nU$ [        SU  S35      e)Nr0   r   r%   zInvalid granularity types: ), only PerTensor or PerRow are supported.zEDifferent granularities for activation and weight are not supported: z#Invalid granularity specification: )r   
isinstancer   rU   listrW   
ValueErrortype)rl   processed_granularitys     r   _normalize_granularityrt      s!    !!*ik :( ! ' 
K)V!4	5	5!, :$ ! # 
K%	/	/C4D4I{1~	6':;;;q>Iv+>??-k]:cd  +a.${1~*>??WXcWd  eN  O  !&k 2
 !  1+>gh
 	
r   granularitiesc                     U  HT  n[        U[        [        45      (       d  [        SU S35      e[	        5       (       a  M>  [        5       (       a  MO   S5       e   g)a  
Validate that the hardware supports the requested granularities.

Args:
    granularities: Tuple of (activation_granularity, weight_granularity)

Raises:
    AssertionError: If hardware doesn't support the requested granularity
    ValueError: If invalid granularity type is provided
zInvalid granularity type: rn   uN   Float8 dynamic quantization requires CUDA compute capability ≥8.9 or MI300+.N)ro   r   r   rq   r   r   )ru   _granularitys     r   _check_hardware_supportrx      s[     &,F(;<<,\N:cd  !""hjj 	
\	
0 &r   )NNF)!r   typingr   r   r   r   r   rA   torchao.float8.float8_utilsr   r	   torchao.float8.typesr
    torchao.quantization.granularityr   r   torchao.utilsr   r   rT   r   r+   intr6   dtyper   rE   r]   re   rk   rp   rt   rx   r   r   r   <module>r      s   < ;  K /
 
 Z    % 66>	4%,, U38_ 0 &*! %%% % 	%
 ++% 6"% 6
% % %P0H<<0HS	0H 
0H 	0H
 
0H 0H \\0HfH%,, H4 HU\\ d !..01 "	
! >>)*!D
78
	
r   