
    h              	       <   S SK r S SKJr  S SKrS SKJr  S SKJs  Jr  S SK	J
r
JrJr  S SKJr  S SKJrJrJrJr  S SKJr  S SKJr  S SKJrJrJr  S S	KJr  S S
KJr   " S S\R@                  5      r! " S S\R@                  5      r"   SS\S\\\4   S\#S\#4S jjr$g)    N)Union)	ReplicateSharddistribute_tensor)
DeviceMesh)ColwiseParallelPrepareModuleInputRowwiseParallelparallelize_module)Float8LinearConfig)convert_to_float8_training)Float8ColwiseParallelFloat8RowwiseParallelPrepareFloat8ModuleInput)MXLinearConfig)	quantize_c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )FeedForward    zMLP based modelc                    > [         [        U ]  5         [        R                  " XS-  SS9U l        [        R                  " XS-  SS9U l        [        R                  " US-  USS9U l        g )N   F)bias)superr   __init__nnLinearw1w2out_projselfsize	__class__s     `/home/james-whalen/.local/lib/python3.13/site-packages/torchao/testing/training/dtensor_utils.pyr   FeedForward.__init__#   sT    k4)+))D(7))D(7		$(Du=    c                     [         R                  " U R                  U5      5      U R                  U5      -  nU R	                  U5      nU$ N)Fsilur   r   r   r!   xs     r$   forwardFeedForward.forward)   s8    FF4771:+MM!r&   )r   r   r   )	__name__
__module____qualname____firstlineno____doc__r   r-   __static_attributes____classcell__r#   s   @r$   r   r       s    > r&   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )ToyModel/   c                 J   > [         [        U ]  5         [        U5      U l        g r(   )r   r8   r   r   ffnr    s     r$   r   ToyModel.__init__0   s    h&(t$r&   c                 $    U R                  U5      $ r(   r;   r+   s     r$   r-   ToyModel.forward4   s    xx{r&   r>   )r/   r0   r1   r2   r   r-   r4   r5   r6   s   @r$   r8   r8   /   s    % r&   r8   meshconfigcompileallgather_in_lowpc                 ^
   U R                   n[        n[        U[        5      (       a  [        n[        U5      R                  U5      n[        R                  " U5      nU" XS9  [        R                  " U5      n	U" XS9  [        R                  " U5      n
U" XS9  U(       d  [        n[        n[        nO[        n[        n[        n[        U	U U" 5       U" 5       U" 5       S.5      n	[        U
U U" [!        S5      [#        5       S9U" 5       U" 5       U" [!        S5      SS9S.5      n
[        R                  " U5      nU" XS9  U(       d  U" [!        S5      [#        5       S9nOU" [!        S5      [#        5       SS	9n[        UU UU" 5       U" 5       U" [!        S5      SS9S.5      nU(       aB  [$        R&                  " U	5      n	[$        R&                  " U
5      n
[$        R&                  " U5      n[$        R(                  " S
US
-  X%SS9n[$        R(                  " S
US
-  X%SS9nUR+                  5       nUR+                  5       n[-        UR+                  5       U [!        S5      /5      n[-        UR+                  5       U [!        S5      /5      nU	" U5      nUR/                  U5        U
" U5      nUR/                  U5        U" U5      nUR/                  U5        [$        R0                  R3                  UU5        [$        R0                  R3                  UR5                  5       U5        [$        R0                  R3                  U	R6                  R8                  R:                  R<                  U
R6                  R8                  R:                  R<                  5        [$        R0                  R3                  U	R6                  R>                  R:                  R<                  U
R6                  R>                  R:                  R<                  5        U" U5      nUR/                  U5        [$        R0                  R3                  UR5                  5       U5        [$        R0                  R3                  U	R6                  R8                  R:                  R<                  UR6                  R8                  R:                  R<                  5        [$        R0                  R3                  U	R6                  R>                  R:                  R<                  UR6                  R>                  R:                  R<                  5        g )N)rA   )ffn.w1ffn.w2ffn.out_proj   )input_layoutsdesired_input_layoutsF)output_layoutsuse_local_output)r;   rE   rF   rG   r   )rI   rJ   fwd_config_submodule_fqnr   )devicerequires_gradr   ) device_typer   
isinstancer   r   r8   tocopydeepcopyr   r
   r	   r   r   r   r   r   r   torchrB   randcloner   backwardtestingassert_closefull_tensorr;   r   weightgradr   )r@   rA   r"   rB   rC   rN   convert_model_func	toy_modeltoy_model_fp8tp_modelsp_modelcolwise_parallel_clsrowwise_parallel_clsprepare_input_cls	sp_model2prepare_inputx_fp32go_fp32x_fp32_tp_input
go_fp32_tpx_fp32_sp_input
go_fp32_sptp_outsp_out
global_outsp_out2s                             r$   &_test_lowp_mlp_tensor_parallelism_baserr   8   s    F 4&.))&!!&)IMM),M}4}}Y'Hx/}}Y'Hx/ ...444 "*,*,02	
H "$#Ahik +,*,0$Qx%		
H  i(Iy0)("++

 *("++%)
 # *,*,0$Qx%		
I ==*==*MM),	ZZ4!8TNFjjD1HdOGllnOJ'uQxjIO"7==?D58*EJo&F
OOJo&F
OOJv&J 	MMvz2	MMv113Z@	MMx||55::HLLOO<R<R<W<WX	MM$$))8<<+@+@+G+G+L+L (GZ 	MMw224jA	MM##Y]]%5%5%<%<%A%A 
MM$$))9==+A+A+H+H+M+Mr&   )r   FF)%rS   typingr   rU   torch.nnr   torch.nn.functional
functionalr)   torch.distributed._tensorr   r   r   torch.distributed.device_meshr   !torch.distributed.tensor.parallelr   r	   r
   r   torchao.float8r   "torchao.float8.float8_linear_utilsr   %torchao.float8.float8_tensor_parallelr   r   r   #torchao.prototype.mx_formats.configr   torchao.quantizationr   Moduler   r8   boolrr    r&   r$   <module>r      s          I I 4  . I 
 ? *")) ryy  
#~
~$n45~ 	~
 ~r&   