
    h                     x   S r SSKJr  SSKrSSKJr  SSKJr  SSKJ	r	J
r
JrJr  SSKJr  S\
R                  S\R                   SS4S	\R"                  S
\R$                  S\S\S\
S\S\\   S\S\	4S jjrS\S\S\\   4S jr\R.                  R0                   " S S\R2                  R4                  5      5       rg)z9
Utilities for scaling high precision tensors to float8.
    )OptionalN)ScalingGranularity)tensor_already_casted_to_fp8)Float8TrainingTensorGemmInputRoleLinearMMConfighp_tensor_and_scale_to_float8)tensor_to_scaleF	hp_tensorfloat8_dtypelinear_mm_configreduce_amaxgemm_input_rolescaling_granularityaxiswise_dimround_scales_to_power_of_2returnc	           	      D    [        U UUUUUU5      n	[        U U	UUUU5      $ )a  
Given a high precision tensor `hp_tensor`,
scales `hp_tensor` dynamically and returns a `Float8TrainingTensor` of the result.

Args:
    hp_tensor: the tensor to convert
    float8_dtype: the float8 dtype to use
    linear_mm_config: Defines the configuration for the scaled_mm for
      the 3 fwd/bwd gemms of linear
    reduce_amax: whether to reduce the max(abs(hp_tensor)) value across distributed ranks
    gemm_input_role: Defines the role of this tensor (input, weight or grad_output) in
      the 3 fwd/bwd gemms of linear
    scaling_granularity: Defines the scaling granularity
    axiswise_dim: if axiswise granularity is used, defines the dim to scale across
    round_scales_to_power_of_2: if true, round scaling factor down to the nearest power of 2.
)r
   r	   )
r   r   r   r   r   device_meshr   r   r   scales
             ]/home/james-whalen/.local/lib/python3.13/site-packages/torchao/float8/float8_scaling_utils.pyhp_tensor_to_float8_dynamicr      sD    6 "E )     c                 .    U[         R                  L a  U $ g)a  
Convenience function which takes in an axiswise dim which is only relevant
for axiswise scaing, and a scaling type.  The output is pass-through
if scaling type is axiswise, and None otherwise.  This is done to keep the
logic from choosing the axiswise dim out of the scaling function.
N)r   AXISWISE)r   r   s     r   get_maybe_axiswise_dimr   K   s     0999r   c                   X    \ rS rSrSr\S\S\R                  4S j5       r	\S 5       r
Srg)	NoopFwToFloat8BwDynamicZ   zF
Forward: no-op
Backward: convert to float8_e5m2 with dynamic scaling
r   target_dtypec                     X l         X0l        U$ N)r   r    )ctxtensorr   r    s       r   forwardNoopFwToFloat8BwDynamic.forwarda   s      0'r   c                     [        U5      (       a  US S 4$ [        XR                  5      n[        UUU R                  U R                  [
        R                  5      nUS S 4$ r"   )r   r
   r    r	   r   r   GRAD_OUTPUT)r#   gradYgradY_scale
fp8_tensors       r   backward NoopFwToFloat8BwDynamic.backwardl   sd    '..$$$%e-=-=>2  %%

 4%%r    N)__name__
__module____qualname____firstlineno____doc__staticmethodr   torchdtyper%   r,   __static_attributes__r.   r   r   r   r   Z   sG    
  ) kk	  & &r   r   )r3   typingr   r5   torchao.float8.configr    torchao.float8.distributed_utilsr   %torchao.float8.float8_training_tensorr   r   r   r	   torchao.float8.float8_utilsr
   INPUT
TENSORWISETensorr6   boolintr   r   _dynamoallow_in_graphautogradFunctionr   r.   r   r   <module>rF      s     4 I  %2%8%8.@.K.K"&',+||++++ %+ 	+
 #+ ,+ 3-+ !%+ +\+ c] &enn55 & &r   