
    h(                         S SK r S SKJr  S SKJr  S SKJr  S SKJrJ	r	J
r
  S SKJrJr  S SKJr  S SKJrJr  S SKJr  S	 r " S
 S\5      r " S S\
5      r " S S\	5      rg)    N)DTensor)
DeviceMesh)ColwiseParallelPrepareModuleInputRowwiseParallel)ScalingType
e4m3_dtype)tensor_already_casted_to_fp8)NoopFwToFloat8BwDynamichp_tensor_to_float8_dynamic)GemmInputRolec                     U R                   [        R                  :H  =(       a    U R                  [        R                  :H  $ )N)scaling_type_inputr   DYNAMICscaling_type_grad_output)ms    _/home/james-whalen/.local/lib/python3.13/site-packages/torchao/float8/float8_tensor_parallel.py(_float8_linear_supports_float8_allgatherr   "   s6     	
 3 33 	>&&+*=*==    c                      ^  \ rS rSrSr\S 5       r\S 5       rS\R                  S\
S\R                  4U 4S jjrS	rU =r$ )
Float8ColwiseParallel*   zc
Like `ColwiseParallel`, but with all-gather in float8. This
currently assumes tensorwise scaling.
c                 4   US   n[        U[        5      (       d  [        R                  " XTU SS9n[        U5      (       dB  [	        UUR
                  R                  R                  UR                  [        R                  S9nX:w  a  UR                  USS9nU$ Nr   F	run_checkgemm_input_roleT
placementsasync_op
isinstancer   
from_localr
   r   configcast_config_inputtarget_dtypelinear_mm_configr   INPUTredistributeinput_layoutsdesired_input_layoutsmodinputsdevice_meshinput_tensors         r   _prepare_input_fn'Float8ColwiseParallel._prepare_input_fn0   s    
 ay,00"--=EL ,L996

,,99$$ - 3 3	L 1'4404 5 L r   c                     UR                   U :w  a  UR                  U SS9n[        R                  " UUR                  UR
                  R                  R                  5      nU(       a  UR                  5       $ U$ NTr   	r    r*   r   applyr(   r%   cast_config_grad_outputr'   to_localoutput_layoutsuse_local_outputr.   outputsr0   s        r   _prepare_output_fn(Float8ColwiseParallel._prepare_output_fnJ   su     /**)D + G
 *//  JJ..;;
 &6w!B7Br   moduler0   returnc                    > SSK Jn  [        X5      (       d  [        S[	        U5       35      e[        X5      (       a  [        U5      (       d  [        S5      e[        TU ]!  X5      $ Nr   Float8Linearz.Expecting module to be Float8Linear but found unsupported	torchao.float8.float8_linearrE   r#   
ValueErrortyper   AssertionErrorsuper_applyselfr@   r0   rE   	__class__s       r   rM   Float8ColwiseParallel._apply\   c    =&//@fO  
 
:6BB //w~f22r    __name__
__module____qualname____firstlineno____doc__staticmethodr2   r>   nnModuler   rM   __static_attributes____classcell__rP   s   @r   r   r   *   sY    
  2 C C"3RYY 3Z 3BII 3 3r   r   c                      ^  \ rS rSrSr\S 5       r\S 5       rS\R                  S\
S\R                  4U 4S jjrS	rU =r$ )
Float8RowwiseParallelk   zc
Like `RowwiseParallel`, but with all-gather in float8. This
currently assumes tensorwise scaling.
c                 4   US   n[        U[        5      (       d  [        R                  " XTU SS9n[        U5      (       dB  [	        UUR
                  R                  R                  UR                  [        R                  S9nX:w  a  UR                  USS9nU$ r   r"   r+   s         r   r2   'Float8RowwiseParallel._prepare_input_fnq   s     ay,00"--=EL ,L996

,,99$$ - 3 3	L 1'4404 5 L r   c                     UR                   U :w  a  UR                  U SS9n[        R                  " UUR                  UR
                  R                  R                  5      nU(       a  UR                  5       $ U$ r5   r6   r:   s        r   r>   (Float8RowwiseParallel._prepare_output_fn   sp    
 /**nt*TG *//  JJ..;;
 &6w!B7Br   r@   r0   rA   c                    > SSK Jn  [        X5      (       d  [        S[	        U5       35      e[        X5      (       a  [        U5      (       d  [        S5      e[        TU ]!  X5      $ rC   rG   rN   s       r   rM   Float8RowwiseParallel._apply   rR   r   rS   rT   r_   s   @r   ra   ra   k   sY    
  . C C"3RYY 3Z 3BII 3 3r   ra   c                      ^  \ rS rSrSrSSSSS\R                  SS.U 4S jjrS rS\	R                  S	\S
\	R                  4U 4S jjrSrU =r$ )PrepareFloat8ModuleInput   a  
Like `PrepareModuleInput`, but with all-gather in float8. This
currently assumes tensorwise scaling.

The only difference from `PrepareModuleInput` is that
after we prepare the input DTensor, we cast the input to DTensor(Float8TrainingTensor)
This is to ensure the float8 cast happens before the all-gather (i.e. Shard -> Replicate)
so that if there are multiple float8 users of the input activation, we perform fp8 allgather
only once.
FP8 Args:
  float8_dtype (torch.dtype, optional): control what float8 dtype to cast to when prepare the module input,
      we currently only support torch.float8_e4m3fn. default: torch.float8_e4m3fn
  fwd_config_submodule_fqn (str, optional): the fqn of the submodule that contains the forward config used
      for the float8 cast. If not specified, we will search for the Float8Linear in the submodules
      and use the forward config from that module, in this case all module's forward config must be
      the same.
NF)r,   r-   input_kwarg_layoutsdesired_input_kwarg_layoutsr<   float8_dtypefwd_config_submodule_fqnc                   > [         TU ]  UUUUUS9  X`l        S U l        Xpl        U R                  [
        R                  :w  a  [        S5      eg )N)r,   r-   rl   rm   r<   zFPrepareFloat8ModuleInput only support casting to float8_e4m3fn for now)rL   __init__rn   r(   ro   torchfloat8_e4m3fnNotImplementedError)	rO   r,   r-   rl   rm   r<   rn   ro   rP   s	           r   rq   !PrepareFloat8ModuleInput.__init__   se     	'"7 3(C- 	 	
 ) $(@% 3 33%X  4r   c                 |   Ub  [        U[        5      (       a  UnO=[        U[        R                  5      (       d   S5       e[        R                  " XU4SS9n[        U[        U R                  [        R                  S9nUb  X4:w  a  UR                  U4S9nU R                  (       a  UR                  5       $ U$ U$ )Nz%expecting input to be a torch.Tensor!Fr   r   )r    )r#   r   rr   Tensorr$   r   r	   r(   r   r)   r*   r<   r9   )rO   inputmeshinput_layoutdesired_layoutdt_inps         r   _prepare_input_arg+PrepareFloat8ModuleInput._prepare_input_arg   s    #%)) !%66 ;6 !++,E 1%% - 3 3	F )l.L,,8I,J(,(=(=6??$I6ILr   r@   r0   rA   c                   > SSK Jn  U R                  b?  UR                  U R                  5      n[	        XC5      (       d   eUR
                  U l        OjUR                  5        HV  n[	        XS5      (       d  M  U R
                  c  UR
                  U l        M5  U R
                  UR
                  :X  a  MQ   S5       e   U R
                  c   e[        TU ]!  X5        U$ )Nr   rD   z?All the Float8Linear modules should have same linear_mm_config!)	rH   rE   ro   get_submoduler#   r(   modulesrL   rM   )rO   r@   r0   rE   
fwd_linearr.   rP   s         r   rM   PrepareFloat8ModuleInput._apply   s    =((4--d.K.KLJj7777$.$?$?D! ~~'c00,,4030D0D-#448L8LL ]L ( $$000v+r   )rn   ro   r(   )rU   rV   rW   rX   rY   rr   rs   rq   r}   r[   r\   r   rM   r]   r^   r_   s   @r   rj   rj      s[    * " $(((!% :6RYY Z BII  r   rj   )rr   torch.nnr[   torch.distributed._tensorr   torch.distributed.device_meshr   !torch.distributed.tensor.parallelr   r   r   torchao.float8.configr   r	    torchao.float8.distributed_utilsr
   #torchao.float8.float8_scaling_utilsr   r   %torchao.float8.float8_training_tensorr   r   r   ra   rj   rS   r   r   <module>r      s^      - 4  : I @>3O >3B<3O <3~_1 _r   