
    hm                     0   S SK JrJrJrJr  S SKrS SKJr  S SK	J
r
Jr  S SKJr  Sr\R                  R!                  5       =(       a    \R"                  R$                  SLr\R(                  \R*                  \R,                  \R.                  1r\R2                  " 5        S!S\R4                  S\R6                  S	\4S
 jj5       r\R2                  " 5       SS\R<                  S4S\R4                  S\S\S\\   S\R4                  4
S jj5       r \R2                  " 5       SS\R<                  SS4S\R4                  S\R6                  S\S\S\\   S	\S\R4                  4S jj5       r!S\R4                  S\R6                  4S jr"S\R4                  S\R4                  S\R4                  4S jr#S\R4                  S\R6                  S\\S4   4S jr$S r%S\S\S\4S jr&S\R4                  S\\\\   4   S\R4                  4S jr'S\R4                  4S  jr(g)"    )IterableOptionalTupleUnionN)AsyncCollectiveTensor
all_reduce)ScalingGranularityg-q=Famaxfloat8_dtyperound_scales_to_power_of_2c                 L   U R                  [        R                  5      n U[        ;   aZ  [        R                  " U5      R
                  [        R                  " U [        S9-  nUR                  [        R                  5      nO[        SU 35      eU(       a  [        U5      nU$ )zConverts the amax value of a tensor to the fp8 scale.
Args:
    amax: The amax value of the tensor.
    float8_dtype: The float8 dtype.
    round_scales_to_power_of_2: if true, round scaling factor down to the nearest power of 2.
)minUnsupported float8_dtype: )totorchfloat64	FP8_TYPESfinfomaxclampEPSfloat32
ValueError_round_scale_down_to_power_of_2)r
   r   r   ress       U/home/james-whalen/.local/lib/python3.13/site-packages/torchao/float8/float8_utils.pyamax_to_scaler      sy     775==!Dy kk,'++ekk$C.HHffU]]#5l^DEE!-c2J    xreduce_amaxscaling_granularityaxiswise_dimreturnc                 F   U[         R                  L a+  [        R                  " [        R                  " U 5      5      nONU[         R
                  L d   S5       eUc   S5       e[        R                  " [        R                  " U 5      USS9nU(       a  [        R                  " 5       (       as  Ub  UR                  5       OS nUc'  [        [        [        R                  " 5       5      5      OUn[        USU5      n[        U[        5      (       a  UR!                  5       nU$ )NunsupportedT)dimkeepdimMAX)r	   
TENSORWISEr   r   absAXISWISEr
   distis_initialized	get_grouplistrangeget_world_sizer   
isinstancer   wait)r   r    device_meshr!   r"   r
   pggroups           r   tensor_to_amaxr7   8   s     0;;;yy1&"&8&A&AAP=PA'66'zz%))A,L$G
 t**,,(3(?[""$T68jU4..012b$u-d12299;DKr   	hp_tensorc                 4    [        U UUUU5      n[        XqUS9$ )a  
Compute scaling factor for the given high precision tensor.

Args:
    hp_tensor: high precision tensor
    float8_dtype: the float8 dtype to use
    reduce_amax: whether to reduce the max(abs(hp_tensor)) value across distributed ranks
    scaling_granularity: Defines the scaling granularity
    axiswise_dim: if axiswise granularity is used, defines the dim to scale across
    round_scales_to_power_of_2: if true, round scaling factor down to the nearest power of 2.
)r   )r7   r   )r8   r   r    r4   r!   r"   r   r
   s           r   tensor_to_scaler:   U   s2    * D 7Q r   c                     U[         ;   aB  [        R                  " U5      R                  nU R	                  U* US9n U R                  U5      $ [        SU 35      e)a  Converts a tensor to a saturated fp8 tensor.

Note:
    The default behavior in PyTorch for casting to `float8_e4m3fn`
    and `e5m2` is to not saturate. In this context, we should saturate.
    A common case where we want to saturate is when the history of a
    tensor has a maximum value of `amax1`, and the current amax value
    is `amax2`, where `amax1 < amax2`. This is common when using delayed
    scaling.
)r   r   r   )r   r   r   r   r   r   r   )r   r   	max_values      r   to_fp8_saturatedr=   v   sV     y KK-11	GG
	G2ttL!!5l^DEEr   yc                     [         R                  R                  U 5      n[         R                  R                  X-
  5      nS[         R                  " X#-  5      -  $ )zComputes the error between two tensors in dB.

For more details see:
    https://en.wikipedia.org/wiki/Signal-to-noise_ratio

Args:
    x: The original tensor.
    y: The tensor to compare to the original tensor.
   )r   linalgvector_normlog10)r   r>   PsPns       r   compute_errorrF      sD     
	!	!!	$B		!	!!%	(BBG$$$r   tensor.c                 l   U[         ;   a!  [        R                  " U5      R                  nO[	        SU 35      eU R
                  R                  U R                  S9n[        R                  " U5      U:H  R                  5       R                  5       nUS:H  R                  5       R                  5       nXT4$ )zCalculate FP8 tensor stats

Args:
    tensor: The tensor to calculate stats for.
    float8_dtype: The float8 dtype.

Returns:
    A tuple containing the number of zeros and the number of max values.
r   )dtyper   )r   r   r   r   r   _datar   _orig_dtyper*   sumitem)rG   r   FP8_MAXtensor_orig_typenum_maxnum_zeros         r   fp8_tensor_statisticsrR      s     y ++l+//5l^DEE||V-?-?@yy)*g5::<AACG A%**,113Hr   c                 b    [        U 5      S:X  d   S5       eU S   U S   :  =(       a    U S   S:H  $ )N   z%is_row_major only supports 2D tensorsr      )len)strides    r   is_row_majorrX      s:    v;!DDD!9vay 3VAY!^3r   sizealignment_valuec                     SU S-
  U-  -   U-  $ )aY  
Returns the minimum alignment value that is greater than or equal to the given size.

Args:
    size: The size of the data to be aligned.
    alignment_value: The alignment value to be used.

Returns:
    int: The minimum alignment value that is greater than or equal to the given size.

Usage:
```
    >>> _get_min_alignment(10, 8)
    16
```
rU    )rY   rZ   s     r   _get_min_alignmentr]      s    " $(./?BBr   dimsc                 6   U R                  5       S:X  d   eU R                  u  p#[        U[        5      (       a  U4nSU;   a  [	        US5      OUnSU;   a  [	        US5      OUnXB-
  nXS-
  n[
        R                  R                  R                  U SUSU45      $ )a  
Pads a 2D tensor with zeros to ensure that its dimensions are multiples of 16, which is required `torch._scaled_mm`

Args:
    tensor: The tensor to pad.
    dims: Dimensions to pad.

Returns:
    torch.Tensor: The padded tensor.

Usage:
```
    >>> pad_tensor_for_matmul(torch.randn((10, 10)), dims=0).shape
    torch.Size([16, 10])
    >>> pad_tensor_for_matmul(torch.randn((10, 10)), dims=1).shape
    torch.Size([10, 16])
    >>> pad_tensor_for_matmul(torch.randn((10, 10)), dims=(0, 1)).shape
    torch.Size([16, 16])
```
rT   r      rU   )	r&   shaper2   intr]   r   nn
functionalpad)rG   r^   dim1dim2dim1_aligneddim2_alignedpad_dim1pad_dim2s           r   pad_tensor_for_matmulrl      s    . ::<1JD$w 459%dB/$L349%dB/$L "H"H88""6AxH+EFFr   scalec                     U R                   [        R                  :X  d   S5       e[        R                  " [        R                  " [        R
                  " U 5      5      5      $ )Nzscale must be float32 tensor)rI   r   r   exp2floorlog2)rm   s    r   r   r      s?    ;;%--'G)GG'::ekk%**U"3455r   )F))typingr   r   r   r   r   torch.distributeddistributedr,   )torch.distributed._functional_collectivesr   r   torchao.float8.configr	   r   cudais_availableversionhipIS_ROCMfloat8_e4m3fnfloat8_e5m2float8_e4m3fnuzfloat8_e5m2fnuzr   no_gradTensorrI   boolr   r)   rb   r7   r:   r=   rF   rR   rX   r]   rl   r   r\   r   r   <module>r      sz   4 3    W 4 
**
!
!
#
E(9(9(E						  (-
,,++ !% 0  .@.K.K"&|| ,	
 3- \\ 8  .@.K.K"&',||++ 
 , 3- !% \\ @F FEKK F&%U\\ %ell %u|| %LL(-
38_,4
CS C3 C3 C(%GLL%G %c8C=&8 9%G
\\%GP65<< 6r   