
    ΅i=              	       r   S SK r S SKrS SKrS SKrS SKrS SKrS SKrS SKJrJ	r	  S SK
Jr  S SK Jr  S SKJr  S SKJrJr  S SKJr  S SKJrJrJrJrJr  S S	KJr  S SKrS SKJr  S SKJ r   S SK!J s  J"r#  S S
K$J%r%  S SK&J'r'  S SK(J)r)J*r*J+r,  S SK-J.r.  S SK/J0r0J1r1  S SK2J3r3  S SK4J5r5J6r6J7r7  S SK8J9r9  S SK:J;r;J<r<J=r=  S SK>J?r?J@r@JArA  S SKBJCrCJDrDJErEJFrF  S SKJGrGJHrH  S SKIJJrK  S SKLJMrMJNrNJOrOJPrP  S SKQJRrRJSrSJTrTJUrUJVrVJWrW  S SKXJYrY  SrZ\U(       a  Sr[Sr\\R                  R                  5       rZO8\V(       a  Sr[Sr\O,\W(       a  Sr[S r\\R                  R                  5       rZOS!r[S"r\S#rZ " S$ S%\5      r` " S& S'\5      ra " S( S)\ R                  \5      rcS*\ R                  S+\R                  S,\4S- jreS. rf  SzS*\ R                  S/\g4S0 jjrhS{S1 jriS2 rjS3 rkS|S*\ R                  S4\g4S5 jjrlS*\ R                  S6\g4S7 jrmS*\ R                  S8\g4S9 jrn " S: S;5      ro " S< S=\c5      rp " S> S?\c5      rq " S@ SA\q5      rr " SB SC\q5      rs " SD SE\c5      rt " SF SG\t5      ru " SH SI\ R                  5      rv " SJ SK\q5      rw " SL SM\ R                  5      rx " SN SO\ R                  5      rz " SP SQ\ R                  5      r{\ R                  SR\4SS j5       r}\ R                  ST\4SU j5       r~\ R                  SV\4SW j5       r\ R                  SX\4SY j5       r\ R                  SZ\4S[ j5       r\\ R                  S\\4S] j5       5       r\\ R                  S^\4S_ j5       5       r\\ R                  S`\4Sa j5       5       r\\ R                  Sb\4Sc j5       5       rSd\S,\Se\Sf\4Sg jr S}Sh\ R                  Si\ R                  Sj\\Sk4   4Sl jjr\GR                  " \WSm5       " Sn So\N5      5       r " Sp Sq\M5      rS~Sr\\   4Ss jjr " St Su\ R                  5      r " Sv Sw\ R                  5      r " Sx Sy\ R                  5      rg)    N)ABCabstractmethod)Callable)nullcontext)deepcopy)autoEnumwraps)Anycastno_type_checkOptionalUnion)mock)
checkpoint)
DeviceMesh)
CPUOffloadfully_shardFullyShardedDataParallel)TrainingState)FSDPParamGroupRegisterPostBackwardFunction)#NO_RESHARD_AFTER_FORWARD_STRATEGIES)BackwardPrefetchMixedPrecisionShardingStrategy)ShardedGradScaler)always_wrap_policyModuleWrapPolicywrap)distribute_tensorDTensorShard)ColwiseParallelparallelize_moduleRowwiseParallelSequenceParallel)TransformerDecoderLayerTransformerEncoderLayer)DistributedDataParallel)MultiProcessTestCaseMultiThreadedTestCaserun_subtests
TEST_SKIPS)FILE_SCHEMAget_cycles_per_msset_rng_seed	TEST_CUDATEST_HPUTEST_XPU)
has_triton   cudancclzhpu:0hcclxpuxcclcpugloo   c                   0    \ rS rSr\" 5       r\" 5       rSrg)FSDPInitModeY    N)__name__
__module____qualname____firstlineno__r   NO_FSDP	RECURSIVE__static_attributes__rC       ]/home/james-whalen/.local/lib/python3.13/site-packages/torch/testing/_internal/common_fsdp.pyrA   rA   Y   s    fGIrK   rA   c                   >    \ rS rSr\" 5       r\" 5       r\" 5       rSrg)DEVICEInitModeb   rC   N)	rD   rE   rF   rG   r   DEVICE_BEFOREDEVICE_AFTERDEVICE_NEVERrJ   rC   rK   rL   rN   rN   b   s    FM6L6LrK   rN   c                       \ rS rSrSr\S\\R                  S4   4S j5       r	\S\R                  4S j5       r
\SS j5       r\\S	\S
\S\R                  4S j5       5       rSrg)FSDPTestModelk   zVThis defines the interface expected from all models used commonly for
FSDP unit tests.return.c                     g)z+Returns an input for the model as as tuple.NrC   selfdevices     rL   	get_inputFSDPTestModel.get_inputo        	rK   c                     g)z,Returns the loss given the input and output.NrC   )rY   inputoutputs      rL   get_lossFSDPTestModel.get_losst   r]   rK   Nc                     g)z<Runs the backward pass (e.g. including ``loss.backward()``).NrC   rY   losss     rL   run_backwardFSDPTestModel.run_backwardy   r]   rK   argskwargsc                      g)z&Initializes an instance of this model.NrC   )rh   ri   s     rL   initFSDPTestModel.init~   s     	rK   rC   rV   N)rD   rE   rF   rG   __doc__r   tupletorchTensorr[   ra   rf   staticmethodr   nnModulerk   rJ   rC   rK   rL   rT   rT   k   s     5s):#;        C 3 299   rK   rT   modelprocess_group	assert_fnc                 4   U R                  5        VVs/ s H$  u  p4X4R                  5       R                  5       4PM&     nnnUU R                  5        VVs/ s H$  u  pgXgR                  5       R                  5       4PM&     snn-  n[        R
                  " U5      n[        U5       V	s/ s H  n	SPM     n
n	[        R                  " XUS9  U
S   nUc   eU
SS  H(  nUc   e[        XSS9 H  u  u  pu  pU" X5        M     M*     gs  snnf s  snnf s  sn	f )z
All-gathers module states across ranks and calls ``assert_fn`` on each pair
of corresponding states from rank 0 and a nonzero rank. For example, if
``assert_fn`` is ``self.assertEqual()``, then this checks that all module
states are equal across ranks.
Ngroupr   r?   Tstrict)	named_parametersdetachr=   named_buffersdistget_world_sizerangeall_gather_objectzip)ru   rv   rw   
param_nameparamnamed_module_statesbuffer_namebuffer
world_size_olistrank0_statesstatep1p2s                  rL   _assert_module_statesr      s(    "'!7!7!9!9J 
\\^'')*!9   #(#6#6#8#8K 
mmo))+,#8  $$]3J ,-,aT,E-5]K8L###qr    #L EGQWab !F 
 .s   +D	+D*Dc                  6    [         R                  " [        5      $ N)rp   rZ   DEVICE_TYPErC   rK   rL   get_devtyper      s    <<$$rK   zero_buffersc                    U(       a  [         R                  " U 5      O	[        5       nU   U R                  5        H1  n[        R
                  " 5          UR                  5         SSS5        M3     U(       aE  U R                  5        H1  n[        R
                  " 5          UR                  5         SSS5        M3     SSS5        g! , (       d  f       M  = f! , (       d  f       M_  = f! , (       d  f       g= f)zBZeros the parameters and optionally buffers of ``model`` in place.N)FSDPsummon_full_paramsr   
parametersrp   no_gradzero_buffers)ru   r   summon_fullctxr   r   s         rL   _zero_modelr      s     -8$
!
!%
([]C	%%'E ! ( --/]]_LLN %_ * 
  %_ 
s;   )C)C$;C)C0C)
CC)
C&!C))
C7c                     U(       d  U R                  [        5      n U(       a  U R                  5         U R                  5       $ r   )tor   half
state_dict)ru   cpu_offloadr   s      rL   _get_state_dictr      s.    %

rK   c           	      p    SR                  U Vs/ s H  o"b  U [        U5         OSPM     sn5      $ s  snf )Nr   none)joinstr)test_name_mappingrh   ss      rL   subtest_namer      s9    88IMNAm	3q6	"	?N Ns   3c                    UR                  5        H=  u  p#UR                  [        R                  " S5      :w  d  M+  UR                  5       X'   M?     U S:X  a  UOS /n[        R
                  " U5        [        [        [        [        R                  4   US   5      nU H  nX   R                  [        5      X'   M     U$ )Nr=   r   )itemsrZ   rp   r=   r   broadcast_object_listr   dictr   rq   r   r   )rankr   r   r   r   s        rL   _broadcast_state_dictr      s     (--/
<<5<<..%*YY[J" 0  19Z$/Eu%d3,-uQx8J 
!+!7!:!:;!G
 !rK   recursec                     [         R                  " XS9   [        [        U R	                  5       5      5      sSSS5        $ ! , (       d  f       g= f)a?  
Returns the full unsharded parameters of ``model``. Any FSDP-managed
parameters offloaded to CPU are moved to GPU in the returned list.

Args:
    recurse (bool): If ``False``, only unshards the parameters immediate to
        ``model``; if ``True``, recurses through the module hierarchy
        rooted at ``model``.
)r   N)r   r   r   listr   )ru   r   s     rL   get_full_paramsr      s4     
	 	 	8U--/01 
9	8	8s   "A
Amove_to_devicec                 >    U(       a  U R                  [        5      $ U $ r   )r   r   )ru   r   s     rL   _move_to_devicer      s    $2588K ==rK   	wrap_fsdpc                 2    U(       d  U $ [        U /UQ70 UD6$ r   r   )ru   r   rh   ri   s       rL   _maybe_wrap_fsdpr      s    !5CtE'CD'CF'CCrK   c                   H    \ rS rSrS\S\4S jrS\4S jrS\4S jrS rS	r	g
)DummyProcessGroup   r   sizec                     Xl         X l        g r   _rank_size)rY   r   r   s      rL   __init__DummyProcessGroup.__init__   s    

rK   rV   c                     U R                   $ r   )r   rY   s    rL   r   DummyProcessGroup.rank       zzrK   c                     U R                   $ r   )r   r   s    rL   r   DummyProcessGroup.size   r   rK   c                 B    [         R                  " 5       nS nXCl        U$ )Nc                  d    [         R                  R                  5       n U R                  S5        U $ )Nr?   )rp   futuresFuture
set_result)futures    rL   
get_future/DummyProcessGroup.allreduce.<locals>.get_future   s'    +0==+?+?+AFa MrK   )r   Mockr   )rY   rh   ri   	dist_waitr   s        rL   	allreduceDummyProcessGroup.allreduce   s     IIK		
  *rK   r   N)
rD   rE   rF   rG   intr   r   r   r   rJ   rC   rK   rL   r   r      s2    S  c c 	rK   r   c                      ^  \ rS rSrS\R
                  S\S\S\4U 4S jjrS r	S r
S	 rS
 r\   SS\R
                  S\S\S\\\\4      S\S\S\\R*                  \4   4S jj5       rS rSrU =r$ )TransformerWithSharedParamsi  rz   device_init_modeadd_bndeterministicc                   > [         TU ]  5         UR                  5       U l        UR                  5       U l        U(       a  [
        R                  " S5        SnSn[        R                  " XV5      U l	        [        R                  " USSSSS9U l        [        R                  " Xe5      U l        U R                  R                  U R                  l        U R                  SU R                  R                  R!                  U45      5        U R                  S	[
        R"                  " U R$                  [
        R&                  S
95        SU l        U(       a)  [
        R                  R+                  U R(                  5      O[
        R                  R-                  5       U l        U[0        R2                  :X  a  U R5                  [6        5      n U(       a  U R9                  5         g g )Nr               g?)d_modelnum_encoder_layersnum_decoder_layersdim_feedforwarddropout
vocab_biaslong_buffer)dtype)superr   r   r   r   rp   manual_seedrs   	Embeddingembed_tokensTransformertransformerLinearoutput_projweightregister_buffernew_ones
zeros_liker   longbsBatchNorm1dIdentitybnrN   rP   r   r   eval)rY   rz   r   r   r   d_vocabr   	__class__s          rL   r   $TransformerWithSharedParams.__init__  s]    	JJL	**,a LL:>>  
 99W6 #'"3"3":":$++22;;WJG	
 	T__EJJ?	

 39%((&&tww/uxx?P?P?R~;;;77;'DIIK rK   c                 $   [         R                  " SU R                  -   5        [         R                  " SUS9R	                  SU R
                  5      n[         R                  " U R
                  S-  US9R	                  SU R
                  5      nX#4$ )Nr?      rZ      r7   )rp   r   r   arangeviewr   )rY   rZ   srctgts       rL   r[   %TransformerWithSharedParams.get_input1  sj    !dii-(ll2f-221dgg>ll477Q;v6;;AtwwGzrK   c                     U R                  U5      nX0R                  -   U R                  R                  U5      -   nU R                  U5      nU R	                  U5      nU R                  X45      nU R                  U5      $ r   )r   r   r   type_asr   r   r   )rY   src_idstgt_idsr	  r
  xs         rL   forward#TransformerWithSharedParams.forward7  sr    (OO#d&6&6&>&>s&CC(ggclS&""rK   c                     Uu  p4[         R                  R                  UR                  SUR	                  S5      5      UR                  S5      SS9$ )Nsum)	reduction)rs   
functionalcross_entropyr  r   )rY   r_   r`   r   r
  s        rL   ra   $TransformerWithSharedParams.get_loss?  sG    }}**KKFKKO,chhrle + 
 	
rK   c                 $    UR                  5         g r   backwardrd   s     rL   rf   (TransformerWithSharedParams.run_backwardE      rK   fsdp_init_modefsdp_kwargsrV   c                 x   Uc  0 nU[         R                  :X  a)  [        U [        5      (       a  U S   nOU n[	        XbXT5      $ U[         R
                  :X  a  SU;  a  [        [        [        15      nOUR                  S5      nSU;   a?  US   [        R                  [        R                  1;   a  [        U [        5      (       d  SnOU n[        U [        5      (       a  U S   n	OU n	[	        XXT5      n
[        U
U4SU0UD6nU[        R                  :X  a  UR!                  ["        5      nU$ [%        SU 35      e)a  
Initializes a :class:`TransformerWithSharedParams` instance.

Args:
    fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
        any modules with FSDP. If ``RECURSIVE``, then wraps with
        top-level FSDP. By default, the top-level FSDP uses the
        ``ModuleWrapPolicy`` for encoder and decoder layers, but a
        different auto wrap policy may be specified via
        ``fsdp_kwargs``.
    device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
    fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
        forwarded to the FSDP constructor.
    deterministic (bool): Whether to make the model deterministic
        across constructions.
    add_bn (bool): Whether to include batch norm in the model.
Nr   auto_wrap_policysharding_strategyUnsupported FSDP init mode: )rA   rH   
isinstancero   r   rI   r    r*   r)   popr   HYBRID_SHARD_HYBRID_SHARD_ZERO2r   rN   rQ   r   r   
ValueError)rz   r  r   r   r   r   pgr"  fsdp_pg
tformer_pgm
fsdp_models               rL   rk    TransformerWithSharedParams.initH  sW   6 K\111%''1X.f  |555!4#3//$  $/??3E#F  ${2 34$113C3W3WXY"5%00%''"1X
"
+fA  "2 	J  >#>#>>']];7
77GHIIrK   c                     U R                   /$ r   )r   r   s    rL   get_ignored_modules/TransformerWithSharedParams.get_ignored_modules  s      !!rK   )r   r   r   r   r   r   r   )NFT)rD   rE   rF   rG   r   ProcessGrouprN   boolr   r[   r  ra   rf   rr   rA   r   r   r   r   r   rs   rt   r   rk   r1  rJ   __classcell__r  s   @rL   r   r     s    (  ( )( 	(
 (T#
 
 15#KJ  KJ$KJ )KJ d38n-	KJ
 KJ KJ 
ryy$	KJ KJZ" "rK   r   c                      ^  \ rS rSrS\R
                  S\S\S\4U 4S jjrS r	S r
S	 rS
 r\  SS\R
                  S\S\S\\\\4      S\S\R(                  4S jj5       rSrU =r$ )NestedWrappedModulei  rz   r   r   r   c                   >^^^ [         TU ]  5         TR                  5       U l        TR                  5       U l        U[
        R                  :H  nUUU4S jnU(       a  [        R                  " S5        [        R                  " [        [        R                  " SS5      U5      U" [        R                  " U" [        [        R                  " SS5      U5      5      [        [        R                  " SS5      U5      5      5      U" [        [        R                  " SS5      U5      5      [        [        R                  " SS5      U5      5      U l        g )Nc                 0   > T(       a  [        U T40 TD6$ U $ r   r   layerr   rz   r   s    rL   _maybe_wrap1NestedWrappedModule.__init__.<locals>._maybe_wrap      E58K88LrK   r   r   r7   r   )r   r   r   r   r   rN   rP   rp   r   rs   
Sequentialr   r   module	rY   rz   r   r   r   r   r   r=  r  s	    ``  `  rL   r   NestedWrappedModule.__init__  s     	JJL	**,)^-I-II	
 a mmBIIaO^<		!R0@. QR#BIIb"$5~F 		"a(8.IJBIIaO^<

rK   c                 v    [         R                  " SU R                  -   5        [         R                  " SSUS94$ )Nr?   r7   r   r  )rp   r   r   randrX   s     rL   r[   NestedWrappedModule.get_input  s.    !dii-(

1a/11rK   c                 $    U R                  U5      $ r   rA  rY   r  s     rL   r  NestedWrappedModule.forward      {{1~rK   c                 &    UR                  5       nU$ r   )r  rY   r_   r`   re   s       rL   ra   NestedWrappedModule.get_loss  s    zz|rK   c                 $    UR                  5         g r   r  rd   s     rL   rf    NestedWrappedModule.run_backward  r  rK   r  r   rV   c                    Uc  0 nU[         R                  :X  a  [        U SUUS9$ U[         R                  :X  a;  [        U 4SUUS.UD6nU[        R
                  :X  a  UR                  [        5      nU$ [        SU 35      e)a  
Initializes a :class:`NestedWrappedModule` instance.

Args:
    fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
        any modules with FSDP. If ``RECURSIVE``, then wraps some nested
        modules with FSDP but not the top-level module. The model may
        later be wrapped with a top-level FSDP external to this method
        if desired.
    device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
    fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
        forwarded to the FSDP constructor.
    deterministic (bool): Whether to make the model deterministic
        across constructions.
Fr   r   r   Tr$  )	rA   rH   r8  rI   rN   rQ   r   r   r)  )rz   r  r   r   r   r.  s         rL   rk   NestedWrappedModule.init  s    . K\111&!1+	  |555,!1+	
 J  >#>#>>']];7
77GHIIrK   rA  r   r   NF)rD   rE   rF   rG   r   r3  r4  rN   r   r[   r  ra   rf   rr   rA   r   r   r   r   rs   rt   rk   rJ   r5  r6  s   @rL   r8  r8    s    
  
 
 )	

 
@2 
 15#+J  +J$+J )+J d38n-	+J
 +J 
+J +JrK   r8  c                   v   ^  \ rS rSr\  S	S\R                  S\S\S\	\
\\4      S\4
U 4S jjj5       rSrU =r$ )
AlwaysWrapNestedWrappedModulei  rz   r  r   r   r   c                 :  > [         [        [        ]   U [        R                  UUUS9nU[        R                  :X  a  U$ U[        R
                  :X  aH  U=(       d    0 n[        U4S[        0UD6nU[        R                  :X  a  UR                  [        5      nU$ g)z
Initializes a :class:`NestedWrappedModule` instance, but unlike
:meth:`NestedWrappedModule.init`, for the ``RECURSIVE`` init mode, this
wraps with top-level FSDP and the ``always_wrap_policy()`` auto wrap
policy.
)rz   r  r   r   r   r"  N)r   rW  rk   rA   rH   rI   r   r   rN   rQ   r   r   )rz   r  r   r   r   ru   r.  r  s          rL   rk   "AlwaysWrapNestedWrappedModule.init  s     )+H
'//-#'  
 	 \111L|555%+KeX6HXKXJ>#>#>>']];7
 6rK   rC   rU  )rD   rE   rF   rG   rr   r   r3  rA   rN   r   r   r   r   r4  rk   rJ   r5  r6  s   @rL   rW  rW    s^    
 15#  $ ) d38n-	
  rK   rW  c                      ^  \ rS rSrS\R
                  S\S\S\4U 4S jjr\	SS j5       r
\	  SS\R
                  S\S\S	\\\\4      S\4
S
 jj5       rSrU =r$ )NonUniformReqGradNWMi  rz   r   r   r   c                   >^^^ [         [        U ]  5         TR                  5       U l        TR	                  5       U l        U[        R                  :H  nUUU4S jnU(       a  [        R                  " S5        [        R                  " [        [        R                  " SS5      U5      U" [        R                  " U" [        [        R                  " SS5      U5      5      [        [        R                  " SS5      U5      5      5      U" [        R                  " [        [        R                  " SS5      U5      [        [        R                  " SS5      U5      5      5      5      U l        g )Nc                 0   > T(       a  [        U T40 TD6$ U $ r   r   r;  s    rL   r=  2NonUniformReqGradNWM.__init__.<locals>._maybe_wrap.  r?  rK   r   r   r7   r   )r   r8  r   r   r   r   rN   rP   rp   r   rs   r@  r   r   rA  rB  s	    ``  `  rL   r   NonUniformReqGradNWM.__init__  s     	!413 JJL	**,)^-I-II	
 a mmBIIaO^<		!R0@. QR#BIIb"$5~F #BIIb!$4nE#BIIaO^D
rK   c                     U R                  5        H3  u  p#[        R                  " X5      (       a  M"  UR                  S5        M5     g rU  )r}   rematchrequires_grad_)ru   req_grad_masknps       rL   _set_nonuniform_req_grad-NonUniformReqGradNWM._set_nonuniform_req_gradE  s4    **,DA88M--  ' -rK   r  r   c                    [         R                  " S5      nU[        R                  :X  a#  [	        U SUUS9n[        R                  Xe5        U$ U[        R                  :X  aU  Uc  0 n[	        U 4SUUS.UD6nU[        R                  :X  a  UR                  [        5      n[        R                  Xu5        U$ [        SU 35      e)a  
Initializes a :class:`NestedWrappedModule` instance, but unlike
:meth:`NestedWrappedModule.init`, it wraps a second :class:`torch.nn.Sequential`
container to enable the desired non-uniform ``requires_grad``
``use_orig_params=True`` tests. For both ``RECURSIVE`` and ``NO_FSDP``
init modes, freezes all parameters except the last two to validate
``ShardedGradScaler`` support for ranks with no (non-zero sized) local shards in
FSDP ``use_orig_params=True`` mode.
zmodule\.2.*\.1.*FrR  Tr$  )ra  compilerA   rH   r[  rg  rI   rN   rQ   r   r   r)  )rz   r  r   r   r   req_grad_pattern	ddp_modelr.  s           rL   rk   NonUniformReqGradNWM.initK  s    ( ::&9:\111,!1+	I !99)V|555" -!1+	
 J  >#>#>>']];7
 99*W77GHIIrK   rT  rm   rU  )rD   rE   rF   rG   r   r3  r4  rN   r   rr   rg  rA   r   r   r   r   rk   rJ   r5  r6  s   @rL   r[  r[    s    (
  (
 (
 )	(

 (
T ( (
 
 15#+J  +J$+J )+J d38n-	+J
 +J +JrK   r[  c                      ^  \ rS rSrSrS\R                  S\S\4U 4S jjrS r	S r
S	 rS
 r\S\\   S\S\S\S\4
S j5       rSrU =r$ )ModuleWithDelayiz  zThis class wraps a :class:`FSDPTestModel` to optionally add a delay
after computing the loss and/or before the gradient reduction.rA  delay_after_loss_msdelay_before_reduction_msc                 F   > [         TU ]  5         X l        X0l        Xl        g r   )r   r   rp  rq  rA  )rY   rA  rp  rq  r  s       rL   r   ModuleWithDelay.__init__~  s!     	#6 )B&rK   c                 8    U R                   R                  U5      $ r   )rA  r[   rX   s     rL   r[   ModuleWithDelay.get_input  s    {{$$V,,rK   c                 $    U R                  U5      $ r   rH  rI  s     rL   r  ModuleWithDelay.forward  rK  rK   c                 b   U R                   R                  X5      nU R                  S:  a  [        (       d  [        (       a%  [
        R                  " U R                  S-  5        U$ [        (       a=  [        R                  R                  [        U R                  [        5       -  5      5        U$ Nr     )rA  ra   rp  r4   r5   timesleepr3   rp   r8   _sleepr   r1   rM  s       rL   ra   ModuleWithDelay.get_loss  s}    {{##E2##a'x88

433d:;  

!!#d&>&>ARAT&T"UVrK   c                    ^ ^ [         R                  R                  mUU 4S jn[        R                  " SU5         T R
                  R                  U5        S S S 5        g ! , (       d  f       g = f)Nc                  8  > TR                   S:  a  [        (       a>  [        R                  R	                  [        TR                   [        5       -  5      5        O9[        (       d  [        (       a#  [        R                  " TR                   S-  5        T" U 0 UD6$ ry  )rq  r3   rp   r8   r}  r   r1   r4   r5   r{  r|  )rh   ri   orig_reduce_scatterrY   s     rL   _delayed_reduce_scatter=ModuleWithDelay.run_backward.<locals>._delayed_reduce_scatter  sq    --19JJ%%D::=N=PPQ XJJt==DE&777rK   z'torch.distributed.reduce_scatter_tensor)rp   distributedreduce_scatter_tensorr   patchrA  rf   )rY   re   r  r  s   `  @rL   rf   ModuleWithDelay.run_backward  sR    #//EE	8 ZZ57N
 KK$$T*
 
 
s   A  
A.module_class
model_argsmodel_kwargsc                <    [        U R                  " U0 UD6UU5      $ )a  
Args:
    module_class (Type[FSDPTestModel]): Wrapped module class to which
        to add delays.
    model_args: Positional arguments forwarded to the ``module_class``
        ``init()``.
    delay_after_loss_ms (int): Delay after computing the loss/before
        the optimizer step (in ms).
    delay_before_reduction_ms (int): Delay before reduce-scattering
        gradients (in ms).
    model_kwargs: Keyword arguments forwarded to the ``module_class``
        ``init()``.
)ro  rk   )r  rp  rq  r  r  s        rL   rk   ModuleWithDelay.init  s*    * z:\:%
 	
rK   )rp  rq  rA  )rD   rE   rF   rG   rn   rs   rt   r   r   r[   r  ra   rf   rr   typerT   r   rk   rJ   r5  r6  s   @rL   ro  ro  z  s    F				 !	 $'		-+$ 
=)

 !
 $'	

 
 
rK   ro  c                       \ rS rSr\\R                  SSSS4S\R                  S\	S\S\
\\\4      S	\S
\S\4S jj5       rSrg)NestedWrappedModuleWithDelayi  NFr   rz   r  r   r   r   rp  rq  c                 >    [         R                  [        U UUUUUUS9$ )Nrz   r  r   r   r   rp  rq  )ro  rk   r8  r  s          rL   rk   !NestedWrappedModuleWithDelay.init  s4     ##)-#' 3&? $ 	
 		
rK   rC   )rD   rE   rF   rG   rr   rN   rQ   r   r3  rA   r   r   r   r   r4  r   rk   rJ   rC   rK   rL   r  r    s     ,:+F+F04##$)*
  
$
 )
 d38n-	

 
 !
 $'
 
rK   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )DummyDDPi  c                 .   > [         TU ]  5         Xl        g r   )r   r   rA  )rY   rA  r  s     rL   r   DummyDDP.__init__  s    rK   c                 &    U R                   " U0 UD6$ r   rH  rY   rh   ri   s      rL   r  DummyDDP.forward  s    {{D+F++rK   rH  rD   rE   rF   rG   r   r  rJ   r5  r6  s   @rL   r  r    s    , ,rK   r  c                      ^  \ rS rSrS\R
                  S\S\S\S\4
U 4S jjr	S r
S	 r\   SS\R
                  S
\S\S\\\\4      S\S\4S jj5       rSrU =r$ )MixtureOfExpertsi  rz   r   r   delay_before_free_msr   c                   > [         TU ]  UUUUS9  Xl        X@l        X l        U[
        R                  :H  U l        U(       a#  [        R                  " SU R                  -   5        SnSnSn	[        [        R                  " Xx5      U R                  5      n
[        S U
R                  5        5       5      U l        U
R                  5        H
  nSUl        M     U(       a  [        R                  " S5        [        [        R                  " X5      U R                  5      nU(       aF  [        R$                  R'                  UR                  5       /5      n[)        X40 UD6n
[)        X40 UD6n[        R*                  " [        [        R                  " X5      U R                  5      UU
[        [        R                  " X5      U R                  5      5      U l        g )	N)rz   r   r   r   *   r   r  r   c              3   @   #    U  H  oR                  5       v   M     g 7fr   )numel).0rf  s     rL   	<genexpr>,MixtureOfExperts.__init__.<locals>.<genexpr>  s     $L8K1WWYY8K   Tr   )r   r   rz   r  r   rN   rP   r   rp   r   r   r   rs   r   r  r   num_expert_paramsexpertr  	new_groupr   r@  rA  )rY   rz   r   r   r  r   r   d_expertd_sharedd_inputr  rf  sharedexpert_groupr  s                 rL   r   MixtureOfExperts.__init__  s{    	-'	 	 	
 
$8!"..2N2NNb499n- 8!>@S@ST!$$L8I8I8K$L!L""$AAH % a  8!>@S@ST ,,66L &>+>F&7;7FmmBIIg8$:M:MNBIIh8$:M:MN	
rK   c                   ^ ^ T R                   S:  a  T R                  S   n[        U[        5      (       ag  [        R
                  R                  R                  R                  mUU 4S jn[        R                  " SU5         T R                  U5      sS S S 5        $ T R                  U5      $ ! , (       d  f       N= f)Nr   r   c                    > [         (       a>  [        R                  R                  [	        TR
                  [        5       -  5      5        O9[        (       d  [        (       a#  [        R                  " TR
                  S-  5        T" U 0 UD6$ )Nrz  )r3   rp   r8   r}  r   r  r1   r4   r5   r{  r|  )rh   ri   orig_reshardrY   s     rL   _delayed_reshard2MixtureOfExperts.forward.<locals>._delayed_reshard"  sc     y

)) 9 9<M<O OP "XX

4#<#<t#CD'888rK   z.torch.distributed.fsdp._runtime_utils._reshard)r  rA  r%  r   rp   r  fsdp_runtime_utils_reshardr   r  )rY   r  r  r  r  s   `   @rL   r  MixtureOfExperts.forward  s    $$q([[^F&$''$0055DDMM9 ZZDFV  ;;q> 
 {{1~ s   B//
B=c                    UR                  5         U R                  (       d  [        R                  " 5          U R	                  5        H|  n[        US5      (       a  M  UR                  c  M%  UR                  R                  U R                  5        [        R                  R                  UR                  U R                  S9  M~     S S S 5        g g ! , (       d  f       g = f)Nr  ry   )r  r   rp   r   r   hasattrgraddiv_r   r  
all_reducerz   )rY   re   rf  s      rL   rf   MixtureOfExperts.run_backward4  s    ~~*Aq(++ vv)DOO4))44QVV4::4N + !  s   2C-AC
C r  r   c                 
   Uc  0 nU[         R                  :X  a  [        U SUUUS9$ U[         R                  :X  a<  [        U 4SUUUS.UD6nU[        R
                  :X  a  UR                  [        5      nU$ [        SU 35      e)a  
Initializes a :class:`MixtureOfExperts` instance.

Args:
    fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
        any modules with FSDP. If ``RECURSIVE``, then wraps some nested
        modules with FSDP, including the expert and shared layers, but
        not the top-level module. The model may later be wrapped with a
        top-level FSDP external to this method if desired.
    device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
    fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
        forwarded to the FSDP constructor.
    deterministic (bool): Whether to make the model deterministic
        across constructions.
    delay_before_free_ms (int): Delay before resharding expert
        parameters in the forward pass (in ms).
F)r   r   r  r   Tr$  )	rA   rH   r  rI   rN   rQ   r   r   r)  )rz   r  r   r   r   r  r.  s          rL   rk   MixtureOfExperts.init@  s    4 K\111#!1%9+  |555)!1%9+ J  >#>#>>']];7
77GHIIrK   )r  rz   rA  r   r  r   )NFr   )rD   rE   rF   rG   r   r3  r4  rN   r   r   r  rf   rr   rA   r   r   r   r   rk   rJ   r5  r6  s   @rL   r  r    s    2
  2
 2
 )	2

 "2
 2
h0
O 
 15#$%0J  0J$0J )0J d38n-	0J
 0J "0J 0JrK   r  c                      ^  \ rS rSr SSSSS.S\S\\R                     S\S	\S
\4
U 4S jjjjr	S\R                  S\R                  4S jrS rSrU =r$ )MLPit  TFr7   )biaswith_bufferdim_multiplierdimrZ   r  r  r  c                   > [         TU ]  5         [        R                  " XU-  X#S9U l        [        R                  " XQ-  XUS9U l        U(       a'  U R                  S[        R                  " U4US95        g S U l	        g )N)rZ   r  r   r  )
r   r   rs   r   in_projout_projr   rp   randnr   )rY   r  rZ   r  r  r  r  s         rL   r   MLP.__init__u  sf     	yys&:6U		."6QUV  5;;vf+MNDKrK   r  rV   c                     U R                  U5      n[        R                  " U5      nU R                  U5      n[        R                  " U5      nU R                  b  X R                  -   nU$ r   )r  Frelur  r   )rY   r  zs      rL   r  MLP.forward  sQ    LLOFF1IMM!FF1I;;"KKArK   c                     U R                   b4  [        R                  R                  R	                  U R                   5        g g r   )r   rp   rs   rk   normal_r   s    rL   reset_parametersMLP.reset_parameters  s+    ;;"HHMM!!$++. #rK   )r   r  r  r   )rD   rE   rF   rG   r   r   rp   rZ   r4  r   rq   r  r  rJ   r5  r6  s   @rL   r  r  t  s     *.
 ! &
    " %,, / /rK   r  c                   V   ^  \ rS rSrSS.S\S\4U 4S jjjrS\S\S	\S
S 4S jrSr	U =r
$ )MLPStacki  F)with_seq_parallelmlp_dimr  c                   > [        USS9[        U5      [        USS9/nU(       a$  UR                  [        R                  " USS95        [        TU ]  " U6   X l        g )N   )r  Fr  )r  appendrs   	LayerNormr   r   r  )rY   r  r  modulesr  s       rL   r   MLPStack.__init__  sV     *L*	$
 NN2<<e<='"!2rK   tp_meshdp_meshuse_activation_checkpointingrV   c           
         [        SS9[        SS9[        SS9[        SS9[        SS9U R                  (       a  [        [        S5      S9O	[        5       S.nU R                  (       a  [	        SS9US'   [        XUS9  U  HD  n[        U[        R                  5      (       a  M$  U(       a  [        U5        [        U4S	U0UD6  MF     [        U 4S	U0UD6  U $ )
NF)use_local_outputr?   )output_layouts)z	0.in_projz
0.out_projz	1.in_projz
1.out_projz	2.in_projz
2.out_proj)sequence_dim3)device_meshparallelize_planmesh)r%   r'   r  r$   r(   r&   r%  rs   r  r   r   )rY   r  r  r  r   r  rA  s          rL   parallelizeMLPStack.parallelize  s     )%@)5A(%@)5A(%@%% *qB "
 !!$4!$DS!4GWXF&",,//+6"<W<<  	D6w6+6rK   )rD   rE   rF   rG   r   r4  r   r   r  rJ   r5  r6  s   @rL   r  r    sN    BG 
3 
34 
3 
3  '+	 
 rK   r  c                      ^  \ rS rSrSrS
S\S\4U 4S jjjrS\R                  S\
\\R                  \R                  4   \R                  4   4S jrS	rU =r$ )DoubleLineari  z
This can be used for returning multiple outputs from a module
(``use_second_linear=True``) or for having an unused module (``False``).
r  use_second_linearc                    > [         TU ]  5         [        R                  " X5      U l        [        R                  " X5      U l        [        R                  " 5       U l        X l        g r   )	r   r   rs   r   lin1lin2ReLUr  r  )rY   r  r  r  s      rL   r   DoubleLinear.__init__  sA    IIc'	IIc'	GGI	!2rK   r  rV   c                     U R                   (       a@  U R                  U R                  U5      5      U R                  U R                  U5      5      4$ U R                  U R                  U5      5      $ r   )r  r  r  r  rI  s     rL   r  DoubleLinear.forward  sQ     !!99TYYq\*DIIdiil,CCCyy1&&rK   )r  r  r  r  T)rD   rE   rF   rG   rn   r   r4  r   rp   rq   r   ro   r  rJ   r5  r6  s   @rL   r  r    s^    
3C 3D 3 3''	uU\\5<</0%,,>	?' 'rK   r  new_all_gather_into_tensorc              #      #    [         R                  n[         R                  " 5         U [         l         S v   [         R                  " 5         U[         l        g ! [         R                  " 5         U[         l        f = f7fr   )r   all_gather_into_tensorbarrier)r  orig_all_gathers     rL   patch_all_gatherr    sN     11OLLN"<D6&5# 	&5#   1A>A !A>"A;;A>new_foreach_all_gatherc              #   @  #    [         R                  R                  R                  R                  R
                  n[        R                  " 5         U [         R                  R                  R                  R                  l         S v   [        R                  " 5         U[         R                  R                  R                  R                  l        g ! [        R                  " 5         U[         R                  R                  R                  R                  l        f = f7fr   )rp   r  r  _fully_shard_fsdp_param_groupforeach_all_gatherr   r  )r  orig_foreach_all_gathers     rL   patch_foreach_all_gatherr    s      	++==PP  	LLN 
''99L
# 	++==P 	# 	++==P    BDC A	DA
DDnew_foreach_reducec              #   @  #    [         R                  R                  R                  R                  R
                  n[        R                  " 5         U [         R                  R                  R                  R                  l         S v   [        R                  " 5         U[         R                  R                  R                  R                  l        g ! [        R                  " 5         U[         R                  R                  R                  R                  l        f = f7fr   )rp   r  r  r   r  foreach_reducer   r  )r  orig_foreach_foreach_reduces     rL   patch_foreach_reducer
    s      	++==LL   	LLN 
''99H
' 	++==L 	' 	++==Lr  new_reduce_scatter_tensorc              #      #    [         R                  n[         R                  " 5         U [         l         S v   [         R                  " 5         U[         l        g ! [         R                  " 5         U[         l        f = f7fr   )r   r  r  )r  r  s     rL   patch_reduce_scatterr    sO     44LLN!:D9%8" 	%8"r  new_all_reducec              #      #    [         R                  n[         R                  " 5         U [         l         S v   [         R                  " 5         U[         l        g ! [         R                  " 5         U[         l        f = f7fr   )r   r  r  )r  orig_all_reduces     rL   patch_all_reducer    sI     ooOLLN$DO*) 	)r  new_unshardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   unshardr   r  )r  orig_unshards     rL   patch_unshardr  $  P      "))LLLN(N.!- 	!-r  new_reshardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   reshardr   r  )r  r  s     rL   patch_reshardr  1  r  r  new_post_backwardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   post_backwardr   r  )r  orig_post_backwards     rL   patch_post_backwardr   >  sQ      (55LLN#4N :'9$ 	'9$r  new_backwardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   r  r   r  )r!  orig_backwards     rL   *patch_register_post_backward_hook_backwardr$  K  sS      199MLLN,8 )>0=$- 	0=$-r  r  rh   ri   c                     [        U5      S:  a  US   nOSU;   a  US   nO[        SU SU 35      eU" U5        U" U0 UD6$ )Nr   r`   z,Cannot get reduce-scatter output from
args: z	
kwargs: )lenAssertionError)clsr  rw   rh   ri   r`   s         rL   reduce_scatter_with_assertr)  X  sa     4y1}a	V	!;D6F8T
 	
 f///rK   replicated_modulesharded_moduleprefixes_to_ignore.c                    [        UR                  5       UR                  5       SS9 GH  u  u  pEu  pgUnU H  n	UR                  U	S5      nM     U R                  XH5        U R	                  U[
        5        [        U[
        5      (       d   eUR                  UR                  p[        U5      [        S5      [        S5      4:X  a  [        S5      e[        XZU5      nU R                  UR                  5       UR                  5       5        UR                  c  U R                  UR                  5        GM  U R!                  UR                  5        [        UR                  X5      nU R	                  UR                  [
        5        [        UR                  [
        5      (       d   eU R                  UR                  R                  5       UR                  5       5        GM     g )NTr{    r   zmFSDP's (Shard(0), Shard(0)) layout differs from distribute_tensor(), so we cannot check for equality using it)r   r}   replaceassertEqualassertIsInstancer#   r%  r  
placementsro   r$   r'  r"   to_localr  assertIsNoneassertIsNotNone)r(  r*  r+  r,  replicated_namereplicated_paramsharded_namesharded_paramclean_sharded_nameprefixr  r2  sharded_ref_paramsharded_ref_grads                 rL   check_sharded_parityr>  k  s    OR**,'')OJ+-Jl
 *(F!3!;!;FB!G )<]G4-1111(44m6N6Njq58 44 ;  ..>jQ..02C2L2L2NO  (]//0M../,-=-B-BDU]//9-,,g6666**3357G7P7P7RS5OrK   znot-support-multithreadc                   J   ^  \ rS rSr\S 5       rU 4S jrS rS rS r	Sr
U =r$ )FSDPTestMultiThreadi  c                     [         $ r   DEVICE_COUNTr   s    rL   r   FSDPTestMultiThread.world_size      rK   c                 B   > [         TU ]  5         U R                  5         g r   )r   setUp_spawn_threadsrY   r  s    rL   rG  FSDPTestMultiThread.setUp  s    rK   c                      [        U /UQ70 UD6$ r   r.   r  s      rL   r.    FSDPTestMultiThread.run_subtests      D242622rK   c                 @    [         R                  R                  5         g r   rp   _dynamoresetr   s    rL   perThreadSetUp"FSDPTestMultiThread.perThreadSetUp      rK   c                 @    [         R                  R                  5         g r   rP  r   s    rL   perThreadTearDown%FSDPTestMultiThread.perThreadTearDown  rU  rK   rC   )rD   rE   rF   rG   propertyr   rG  r.   rS  rW  rJ   r5  r6  s   @rL   r@  r@    s.     3 rK   r@  c            $         ^  \ rS rSrU 4S jr\S 5       r\S 5       r\S\4S j5       r	\S 5       r
S rS	 rS
 rS r\S 5       r       S)S\R$                  S\S\S\S\\   S\S\\   S\S\S\\\\4      4S jjrSSS\" 5       SSSSSSSSS4S\\   S\S\S\\    S \S\S!\S"\\!   S#\\"   S\\   S$\S%\S\S\S&\\\\4      S\\\\4      4 S' jjr#S(r$U =r%$ )*FSDPTesti  c                 h   > [         TU ]  5         S[        R                  S'   U R	                  5         g )N0TORCH_NCCL_DESYNC_DEBUG)r   rG  osenviron_spawn_processesrI  s    rL   rG  FSDPTest.setUp  s)     14

,-rK   c                     [         $ r   rB  r   s    rL   r   FSDPTest.world_size  rE  rK   c                 >    [         R                  R                  5       $ r   )r   distributed_c10d_get_default_groupr   s    rL   rv   FSDPTest.process_group  s    $$7799rK   rV   c                     grU  rC   r   s    rL   destroy_pg_upon_exitFSDPTest.destroy_pg_upon_exit  s     rK   c                 *    [          U R                   3$ r   )r0   	file_namer   s    rL   init_methodFSDPTest.init_method  s    t~~.//rK   c                 :    U R                  X!R                  5        g r   )r0  r   )rY   r.  r   s      rL   _check_cpu_offloadFSDPTest._check_cpu_offload  s    &<&<=rK   c                 :    U R                  X!R                  5        g r   )r0  backward_prefetch)rY   r.  rt  s      rL   _check_backward_prefetch!FSDPTest._check_backward_prefetch  s    *,H,HIrK   c                 :    U R                  X!R                  5        g r   )r0  forward_prefetch)rY   r.  rx  s      rL   _check_forward_prefetch FSDPTest._check_forward_prefetch  s    )+F+FGrK   c                      [        U /UQ70 UD6$ r   rL  r  s      rL   r.   FSDPTest.run_subtests  rN  rK   c                    U " U5      nXl         X6l        UR                  SS5      n[        SUR                    SUR                   35        [
        R                  R                  5       UR                  :  a4  [        R                  " [        SUR                   3   R                  5         U(       a^  [
        R                  R                  R                  R                  R!                  5       n["        R$                  " SUR                  UUS9  OC["        R$                  " UR&                  [(        [+        UR                  5      UR                   S9   S n
UR                   [0        -  n[2        (       d  [4        (       a  [
        R                  R7                  U5        U/n
["        R8                  " U
S9  [
        R:                  R=                  5         [?        5         URA                  X$5        [
        R:                  R=                  5         ["        R8                  " U
S9  ["        RB                  " 5         g ! [,         a@  n	S	U	R.                  S
   ;   a'  [        R                  " [        S   R                  5        e S n	A	ff = f)Nfake_pgFzdist init r=z, world=z
multi-gpu-fake)backendr   r   store)rn  r  r   r   	recompiler   backend_unavailable)
device_ids)"r   rm  getprintr   rp   acceleratordevice_countsysexitr/   	exit_codetesting	_internalr  r~  	FakeStorer   init_process_grouprn  DISTRIBUTED_BACKENDr   RuntimeErrorrh   rC  r3   r5   set_device_indexr  rQ  rR  r2   run_testdestroy_process_group)r(  r   	test_namerm  piperi   rY   r~  r  er  	device_ids               rL   _runFSDPTest._run  s   9~	"**Y.TYYKx/@AB))+doo=HHZ*T__,= >?IIJ	//;;CCMMO''"#	 '' $ 0 0/"4??3	 
II,	9..y9[

 	
+i&
+""$1  	affQi'$9:DDE		s    -A$H7 AH7 7
J;I<<JNFru   	num_stepsautocastlrfsdp_cpu_offload
save_modelmixed_precisionenable_sharded_grad_scaleruse_pure_fp16sharded_grad_scaler_kwargsc           	         U=(       a    UR                   n[        UR                  5       5      R                  nU
c  0 n
[	        SSU0U
D6n[
        R                  R                  UR                  5       USS9n[        U5       GHg  nUR                  5         [
        R                  R                  [        US9   UR                  R                  [
        R                  " [        5      5      nU	(       d  U(       aW  [        U[         5      (       dB  [        U[
        R"                  5      (       a  UR%                  5       nO['        S U 5       5      nU" U6 nU(       ap  [        U[         5      (       a[  UR(                  [*        ;  aG  UR                  5        H3  nU R-                  UR                  [
        R                  " S5      5        M5     UR                  R/                  UU5      R1                  U5      nS S S 5        UR3                  W5      nU(       d-  U	(       d&  UR4                  [
        R6                  :X  d   S5       eOU	(       a+  U R-                  UR4                  [
        R8                  5        Ok[        U[         5      (       a,  Uc   eU R-                  UR4                  UR:                  5        O*U R-                  UR4                  [
        R6                  5        UR                  R=                  U5        U(       a\  [        U[         5      (       aG  UR                  5        H3  nU R-                  UR                  [
        R                  " S5      5        M5     UR?                  U5        URA                  5         U(       d  GM	  URC                  5       RE                  5        VVs0 s H  u  nnUURG                  5       _M     nnn[I        U5        URK                  U5        GMj     [        U[         5      (       a  URM                  [N        RP                  5        WRS                  5       $ ! , (       d  f       GNA= fs  snnf )	Nenabledg?)r  momentum)r  c              3   @   #    U  H  oR                  5       v   M     g 7fr   )r   )r  r  s     rL   r  4FSDPTest._train_for_several_steps.<locals>.<genexpr>   s     %>1ffhhr  r=   zeloss data type should be float32, as the original                     parameter data type is float32.rC   )*offload_paramsnextr   rZ   r   rp   optimSGDr   	zero_gradampr  r   rA  r[   r%  r   rq   r   ro   r#  r   r0  ra   r   scaler   float32float16param_dtyperf   stepupdater   r   cloner   load_state_dict_assert_stater   IDLEr~   )rY   ru   r  r  r  r  r  r  r  r  r  cpu_offload_paramsmodel_devicesharded_grad_scalerr  r   r_   r`   rf  re   kvr   s                          rL   _train_for_several_steps!FSDPTest._train_for_several_steps  s/    .Q2B2Q2QE,,./66%-)+&/ 
.
2L

  0 0 2rCHy!AOO##K#B..u||K/HI _Zt=T=T!%66 %

 %%>%> > '"5$// //>? #--/((5<<3FG 0 ||,,UF;>>|L- C. ',,T2D"=zzU]]2 52
 !$$TZZ?t,,*666$$TZZ1L1LM$$TZZ?LL%%d+!j&=&=))+A$$QXXu||E/BC ,  $$U+&&(z7<7G7G7I7O7O7QR7Qtq!al7Q
R E"%%j1s "v eT"" 2 23{{}w CBf Ss   9E P3-Q3
Q	r   Tmodel_classr  r   ref_init_fn	num_itersr   rt  r#  rx  use_orig_paramsinit_kwargsc                    U[         R                  :w  d   S5       eUc  0 nSnU R                  R                  5       nUR                  " U R                  [         R                  [
        R                  4SS0UD6nUc-  [        (       a  [        U[        /[        S9nO[        UU/US9nOU" U5      nU(       a  UR                  5       nU R                  UUU
SLUUU
UUUS9	n[        UR                  5       5      nUR                  UUU	U
UUS.5         UR                  " U R                  UUU4SS0UD6n[%        U[&        5      (       d  ['        UU R                  40 UD6nU(       a  UR                  5       nU[
        R(                  :X  a  UR+                  [        5      nUSL=(       a    UR,                  nU=(       a    U[
        R(                  :H  nU=(       a    U[
        R(                  :g  nU(       aI  [.        R0                  " S5      nUR                  5        H  nU R3                  UR0                  U5        M!     U(       a  U R5                  [6        S[         35      O	[9        5       nU   U R                  UUSUUUU
UUUS9
n SSS5        U(       a  gU(       a^  [.        R0                  " S5      nUR                  5        H  nU R3                  UR0                  U5        M!     W R+                  [        5      n [;        U5      n![.        R<                  R?                  UW SS9  U
c  U(       d  U R3                  UU!SSS9  ggg! [         a   n[!        S	U S
[#        U5       35      UeSnAff = f! , (       d  f       N= f)a  
Tests FSDP training against a reference, which defaults to DDP but
may be customized with ``ref_init_fn``.

Args:
    model_class (Type[FSDPTestModel]): A model class that inherits from
        ``FSDPTestModel``, which defines the expected interface.
    fsdp_init_mode (FSDPInitMode): The mode to initialize the
        FSDP-wrapped model. This should not be ``NO_FSDP``.
    ref_init_fn (Optional[Callable]): A callable to invoke that wraps a
        non-wrapped model to construct the reference model, where this
        wrapper should provide data parallel semantics. If ``None``,
        then the callable defaults to the DDP constructor.
z.Expects an FSDP init mode that wraps with FSDPN{Gz?r   T)r  output_device)r  r  r  r  r  r  r  )r   rt  r#  r  rx  r  zInitializing z raised error r=   zOAn FSDP-managed module with parameter CPU offloading enabled has parameters on F)r  r  r  r  r  r  r  r  )check_dtypezFSDP did not match DDP)exact_devicemsg) rA   rH   rv   r   rk   rN   rP   r4   DDPr   r   r  r   r   r  	Exceptionr)  r   r%  r   rQ   r   r  rp   rZ   r0  assertRaisesRegexr  r   r   r  assert_close)"rY   r  r  r   r  r  r  r   rt  r#  r  rx  r  r  r  r  r  r   r  r   ru   	ref_modelref_loss
ddp_paramsr.  r  r  expects_device_errorexpects_cpu_device
cpu_devicer   context	fsdp_lossfsdp_unsharded_paramss"                                     rL   _test_fsdp_parityFSDPTest._test_fsdp_parityV  s   D !5!55 	
<	
5 K!!&&(    ((
 	

 
 x{m;	  4&M	#E*I!(I00$D0(+'A''A 1 

 )..01
*%6%6#2$4#2		

	Y$))"" 	
 # J *d++ j$*<*<LLJ#*J~:::#{3J$D0O[5O5O
 N/>3N3NN 	 N/>3N3NN 	 e,J#..0  z: 1 $ ""%%0M3  	 55!,% /+E++E 6 I    e,J#..0  z: 1![1I /
 ; 	""8YE"J "=%!,	   ,9"K  	Y}[MAxPQWXX	YF Ws$   "M 1M5
M2M--M25
N)rm  r   )r  NFNFFN)&rD   rE   rF   rG   rG  rY  r   rv   r4  rj  rn  rq  ru  ry  r.   classmethodr  rs   rt   r   floatr   r   r   r   r   r   r  r  rT   rA   rN   r   r   r   r  rJ   r5  r6  s   @rL   r[  r[    sI       : : d   0 0>JH3 4% 4%v 15 48+0#?CUyyU U 	U
 U #:.U U ".1U %)U U %-T#s(^$<Ux +/",,8<8<48!& %+0#04?C#g-(g %g )	g
 h'g g g  g $$45g $$45g ".1g g g %)g g  d38n-!g" %-T#s(^$<#g grK   r[  compile_compute_on_modulec                 D   ^ ^^ U 4S jm " S S[         5      mUU4S jnU$ )Nc                     > [         R                  R                  R                  " U 0 UD6  Tb  [	        U S   T5      (       a  U S   R                  5         g g )Nr   )rp   r  r  r   r%  rj  )rh   ri   r  s     rL   !fully_shard_with_compiled_compute=compiled_fsdp_test.<locals>.fully_shard_with_compiled_compute  sS    **D;F;$,
G.1
 1
 GOO1
rK   c                   0    \ rS rSr\" 5       r\" 5       rSrg)*compiled_fsdp_test.<locals>.FullyShardModei  rC   N)rD   rE   rF   rG   r   EAGERCOMPILED_COMPUTErJ   rC   rK   rL   FullyShardModer    s    6rK   r  c                 6   >^  [        T 5      UUU 4S j5       nU$ )Nc                    > [         R                  R                  R                  nT GH  nUTR                  :w  a&  [        5       (       d  [        R                  " SSS9  M:  [         R                  R                  R                  n[         R                  R                  R                  n[         R                  R                  5         UTR                  :X  a  UnO_UTR                  :X  aA  S[         R                  R                  l
        S[         R                  R                  l        TnO[        SU 35      eUT	R                   UR"                  '   T	" U 0 UD6  [         R                  R                  5         UT	R                   UR"                  '   U[         R                  R                  l
        U[         R                  R                  l        GM     g )Nz0Inductor on GPU needs Triton and recent GPU archr   )
stacklevelTr?   z!Need to implement FullyShardMode=)rp   r  r  r   r  r6   warningswarnrQ  configskip_fsdp_hooks	_inductorcompile_threadsr  r  NotImplementedError__globals__rD   )
rh   ri   original_fully_shardmodeoriginal_skip_fsdp_hooksoriginal_compile_threadsfully_shard_patchr  r  funcs
          rL   wrapper6compiled_fsdp_test.<locals>.decorator.<locals>.wrapper  sf   (-(9(9(>(>(J(J &>///
MMJWX +0==+?+?+O+O(+0??+A+A+Q+Q(!!))+>///(<%^<<<;?EMM((8=>EOO**:(I%-;D6B  CT  !5!>!>?d%f%!!))+BV  !5!>!>?7O$$49Q&&6= 'rK   r
   )r  r  r  r  s   ` rL   	decorator%compiled_fsdp_test.<locals>.decorator  s#    	t 	R 
 	RD rK   )r	   )r  r  r  r  s   ` @@rL   compiled_fsdp_testr     s"    " "$L rK   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )
SkipModulei5  c                 X   > [         TU ]  5         [        R                  " SSSS9U l        g N
   Fr  )r   r   rs   r   linrI  s    rL   r   SkipModule.__init__6  s"    99R%0rK   c                 $    U R                  U5      $ r   r  rI  s     rL   r  SkipModule.forward:  s    xx{rK   r  rm   r  r6  s   @rL   r  r  5  s    1 rK   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )NestedLineari>  c                    > [         TU ]  5         U(       a8  [        [        R                  " SSSS9R                  [        5      5      U l        g [        R                  " SSSS9R                  [        5      U l        g r  )r   r   r!   rs   r   r   r   nested_linear)rY   	fsdp_wrapr  s     rL   r   NestedLinear.__init__?  sV    !%biiBU&C&F&F{&S!TD!#2r!>!A!A+!NDrK   c                 $    U R                  U5      $ r   r	  rI  s     rL   r  NestedLinear.forwardF  s    !!!$$rK   r  r  r6  s   @rL   r  r  >  s    O% %rK   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )	SkipModeliJ  c                    > [         TU ]  5         [        R                  " SSSS9R	                  [
        5      U l        [        5       R	                  [
        5      U l        [        [        US9[
        S9U l        g )Nr   Fr  )r
  )r  )r   r   rs   r   r   r   linearr  linear_skipr!   r  r	  )rY   double_nestr  s     rL   r   SkipModel.__init__K  sW    iiBU366{C%<??;7!;/;
rK   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   r  r  r	  rI  s     rL   r  SkipModel.forwardS  s4    KKNQq!rK   r  r  r6  s   @rL   r  r  J  s    
 rK   r  )FT)FFr  )rC   r   )
contextlibr_  ra  r  r{  unittestr  abcr   r   collections.abcr   r   copyr   enumr   r	   	functoolsr   typingr   r   r   r   r   r   rp   torch.distributedr  r   torch.nnrs   torch.nn.functionalr  r  torch.distributed._composabler   torch.distributed.device_meshr   torch.distributed.fsdpr   r   r   r   $torch.distributed.fsdp._common_utilsr   5torch.distributed.fsdp._fully_shard._fsdp_param_groupr   r   "torch.distributed.fsdp._init_utilsr   2torch.distributed.fsdp.fully_sharded_data_parallelr   r   r   *torch.distributed.fsdp.sharded_grad_scalerr   torch.distributed.fsdp.wrapr   r    r!   torch.distributed.tensorr"   r#   r$   !torch.distributed.tensor.parallelr%   r&   r'   r(   r)   r*   torch.nn.parallel.distributedr+   r  *torch.testing._internal.common_distributedr,   r-   r.   r/   $torch.testing._internal.common_utilsr0   r1   r2   r3   r4   r5   torch.utils._tritonr6   rC  r   r  r8   r  r;   rA   rN   rt   rT   r3  r   r   r4  r   r   r   r   r   r   r   r   r   r8  rW  r[  ro  r  r  r  r  r@  r  r  contextmanagerr  r  r
  r  r  r  r  r   r$  r)  ro   r   r>  skipIfr@  r[  r  r  r  r  r  rC   rK   rL   <module>r5     s    	 	 
    # $ "    < <        4 4 
 ? S 
 I R R F F  F H   + K ::**,LK K 99))+LK L4 T BIIs 499$$ >% #99##""2299 2t 2>299 >d >DBII D$ D .Q"- Q"h[J- [J|$7 D]J. ]J@J
m J
Z
? 
.,ryy ,JJ* JJZ/")) /@*r}} *Z'299 '6 6 6 6 
X 
 
" 
X 
 
" 9H 9 9 *X * * .x .  . .x .  . :8 :  : >X >  >0!0 0 	0
 0. +-	 Tyy T II T c3h	 TF 
45/  6&[# [|
2(4. 2j 	%299 	%		 rK   