
    ȅij                     j   S SK r S SKrS SKrS SKrS SKJrJrJrJr  S SK	J
r
JrJrJr  S SKrS SKJr  S SKJs  Js  Jr  S SKJs  Js  Jr  S SKJs  Js  Jr  S SKJr  S SKJr  S SK J!r!  S SK"J#r#  S SK$J%r%J&r&J'r'J(r(J)r)J*r*J+r+  S SK,J-r-J.r.J/r/J0r0  S S	K1J2r2  S S
K3J4r4J5r5J6r6J7r7J8r8J9r9J:r:J;r;  S SK<J=r=  S SK>J?r?  S SK@JArA  S SKBJCrC  \(       a  S SKDJErE  SrF S SKGJHrHJIrI  SrKSrL\M\R                  \R                  4   rO\\R                  \O-     rP\9R                  \0R                  \9R                  \0R                  \9R                  \0R                  \9R                  \0R                  \9R                  \0R                  0rV\9R                  \9R                  /rW\9R                  \9R                  4rX\ S_S\&S\PS\9S\=S-  S\!S-  S\&4S jj5       rY\S\&S\PS\!S\&4S j5       rZ\S\
S\[4S j5       r\\S\!S\[4S j5       r]\S\^S\R                  4S  j5       r_\S!\R                  S\^S\R                  4S" j5       r`S!\R                  S\^S\M\R                  \R                  4   4S# jra\ S_S\&S$\R                  S%\\R:                  R                     S-  S&\\R:                  R                     \\R:                  R                     -  S-  S\&4
S' jj5       rdS&\e\
   S(\[SS4S) jrf\S\&S$\R                  S*\g\R                     S+\^\R                  -  S-  S\&4
S, j5       ri\S\&S$\R                  S\&4S- j5       rj\S\&S\9S-  S.\8S-  S/\5S-  S0\[S1\[S2\^S3\^S\&4S4 j5       rk\S\&S\&4S5 j5       rl\S\&S6\4S7\[S\&4S8 j5       rm\S_S\&S\!S\&4S9 jj5       rn\S\&S\&4S: j5       roS$\R                  S;\e\R                     SS4S< jrp\S\&S=\R                  S+\^\R                  -  S-  S>\\R                  /S4   S-  S?\[S\&4S@ j5       rq\S\&S;\e\R                     S=\R                  4SA j5       rrSB\R                  SC\\R:                  R                     S-  S\g\R                     4SD jrs S_SB\R:                  R                  S%\g\R:                  R                     SE\\R:                  R                     S-  S\g\R:                  R                     4SF jjrtSB\R:                  R                  S%\g\R:                  R                     S\g\u   4SG jrvSB\R                  S\g\u   4SH jrwS$\R                  S*\g\R                     S+\^\R                  -  S-  SS4SI jrxS+\^\R                  -  S-  SJ\^SK\%S\R                  S-  4SL jryS$\R                  S*\g\R                     S%\g\R                     S\M\[\[4   4SM jrzSB\R                  S>\\R                  /S4   S%\g\R                     SS4SN jr{SB\R                  SO\R                  S-  S%\g\R                     SK\%4SP jr|SB\R                  S%\g\R                     S\e\R                     4SQ jr}S$\R                  S*\g\R                     SR\g\R                     SO\R                  S-  SS4
SS jrS;\e\R                     ST\e\R                     SO\R                  S-  SS4SU jrSV rS$\R                  S*\g\R                     SO\R                  S-  SJ\^SK\%S\R                  4SW jrS$\R                  S;\e\R                     S\R                  SS4SX jrSY\e\R                     SS4SZ jrS$\R                  S*\g\R                     S\\R                     4S[ jrS*\g\R                     SS4S\ jrS\94S] jrS\R                  S\GR                  4S^ jrg! \J a    SrF GNf = f)`    N)Callable	GeneratorIterableIterator)Anyno_type_checkOptionalTYPE_CHECKING)default_hooks)
DeviceMesh)_get_default_group)_FSDPDeviceHandle
_FSDPState_get_module_fsdp_state_is_fsdp_flattened!_named_parameters_with_duplicatesclean_tensor_nameTrainingState)_FSDP_USE_FULL_PREC_IN_EVALFlatParameterFlatParamHandleHandleShardingStrategy)_FreeEventQueue)BackwardPrefetch
CPUOffloadFullOptimStateDictConfigFullStateDictConfigMixedPrecisionShardingStrategyStateDictConfigStateDictType)_Policy)DTensorExtensions)_sync_params_and_buffers)is_traceable_wrapper_subclass)RemovableHandleT)deferred_initfakeFi  _fsdp_syncedstateprocess_groupsharding_strategypolicydevice_meshreturnc                 F   Ub  Ub  [        S5      eU[        ;   nU(       a%  Uc  Uc  Uc  [        SU S35      e[        XU5      n O6U(       a  X@l        UR	                  SS9U l        OUb  UO	[        5       U l        U R
                  R                  5       U l        U R
                  R                  5       U l	        U R                  nU(       a  X`R                  R                  5       -  n[        R                  R                  U5      U l        X`R                  -  U l        U $ )NzcCannot pass both process_group and device_mesh at the same time. Please just pass only one of them.zManual wrapping with zA requires explicit specification of process group or device_mesh.r   mesh_dim)
ValueErrorHYBRID_SHARDING_STRATEGIES*_init_process_group_state_for_hybrid_shard_device_mesh	get_groupr+   r   ranksize
world_size_inter_node_pgr   DefaultState_get_gradient_predivide_factor_gradient_predivide_factor_gradient_postdivide_factor)r*   r+   r,   r-   r.   is_hybrid_strategydata_parallel_world_sizes          \/home/james-whalen/.local/lib/python3.13/site-packages/torch/distributed/fsdp/_init_utils.py_init_process_group_staterC   Y   sA     [%<<
 	
 +.HH V^8K '(9': ;S S 
 ?kE !,"-"7"7"7"CE "/!:@R@T  $$))+EJ**//1E$// $8$8$=$=$?? ""AA$	
 
$ 	!#C#CC 
% L    c                    U(       aW  [        U5      (       a/  X l        UR                  SS9U l        UR                  SS9U l        O[        SUR                   35      eUc<  [        5       n[        X0R                  R                  5       5      u  pEX@l        XPl        O7[        U5      (       a  Uu  U l        U l        O[        S[        U5       35      e[        U R                  S9U l        U $ )Nr   r1      z,Expected device_mesh to have ndim=2 but got zmExpected process_group to be passed in as either None or Tuple[dist.ProcessGroup, dist.ProcessGroup] but got r+   )"_is_valid_hybrid_shard_device_meshr6   r7   r;   r+   r3   ndimr   !_init_intra_and_inter_node_groups_device_handledevice_count_is_valid_hybrid_shard_pg_typetype_get_default_comm_hook_state_inter_node_state)r*   r+   r.   default_groupintra_node_groupinter_node_groups         rB   r5   r5      s    -k::!, $/#8#8!#8#DE "-"7"7"7"CE>{?O?O>PQ  
	*,-N//<<>.
* // *-88 9F5E!5GGKMGZF[] 
 ;**E LrD   c                 ~    [        U [        5      =(       a'    [        U 5      S:H  =(       a    [        S U  5       5      $ )N   c              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7fN)
isinstancedistProcessGroup).0pgs     rB   	<genexpr>1_is_valid_hybrid_shard_pg_type.<locals>.<genexpr>   s      JMb
2t0011M   '))rX   tuplelenallrG   s    rB   rM   rM      s:     	=%( 	K!#	KJMJJrD   c                 N    [        U [        5      =(       a    U R                  S:H  $ )NrU   )rX   r   rI   )r.   s    rB   rH   rH      s    k:.H;3C3Cq3HHrD   num_devices_per_nodec                 6    [         R                  " U 5      u  pU$ )a5  
Return a process group across the current node.

For example, given each row is a distinct node:
0  1  2  3  4  5  6  7
8  9 10 11 12 13 14 15
This API would return an intra-node subgroup across
[0, 1, ..., 7] or [8, 9, ..., 15] depending on the process's rank.
For example, rank 3 would get [0, 1, ..., 7].
)rY   new_subgroups)rd   intra_node_subgroup_s      rB   _init_intra_node_process_groupri      s     "//0DErD   global_process_groupc                 d   Sn[         R                  " U 5      n[         R                  " U 5      nXA-  n[         R                  " U 5      U-  n[	        U5       H?  n[	        U5       Vs/ s H	  oX-  -   PM     n	n[         R
                  " XS9n
Xv:X  d  M=  U
nMA     Uc  [        U S35      eU$ s  snf )al  
Return an inter-node process group where each contained rank has the same local rank.

For example, given each row is a distinct node:
0  1  2  3  4  5  6  7
8  9 10 11 12 13 14 15
This API would return inter-node process group [0, 8], [1, 9], [2, 10], and so forth
depending on the process's rank. For example, rank 1 would get [1, 9], rank 5
would get [5, 13].
N)ranksbackendz. expected to assign inter-node pg, but did not)rY   get_backendget_world_sizeget_rankrange	new_groupAssertionError)rj   rd   inter_node_pgsharding_backendr:   	num_nodesmy_local_rank
local_rankiranks_for_inter_groupgrps              rB   _init_inter_node_process_groupr|      s      M''(<=$$%9:J2IMM"67:NNM01
=B9=M!
=M!23=M 	 !
 nn#8S&M 2 oKL
 	
 !
s   'B-c                 .    [        U5      [        X5      4$ )a  
Initialize intra and inter-node process groups and return the ones corresponding to this process's rank.

This function can be used to initialize process groups for ``HYBRID_SHARD`` or
``_HYBRID_SHARD_ZERO2`` in FSDP.
This function assumes each node has an equal number of CUDA-enabled devices.
Returns:
    Tuple[dist.ProcessGroup, dist.ProcessGroup]: Intra and inter-node process group.
)ri   r|   )rj   rd   s     rB   rJ   rJ      s      	'';<&';R rD   moduleignored_modulesignored_statesc                    Ub  Ub  [        S5      eS nUS LnU(       a  [        U5      n[        US5        O/ n[        Ub  [        U5      O/ S5        [        U5      S:  a'  [	        US   [
        R                  5      (       a  UnOUn[        X5      U l        [        UU R                  U5      U l
        [        UU R                  5      U l        U $ )NzfCannot pass both ignored_modules and ignored_states at the same time. Please just pass ignored_states.TFr   )r3   list_check_ignored_statesra   rX   nn	Parameter_get_ignored_modules_ignored_modules_get_ignored_params_ignored_params_get_ignored_buffer_names_ignored_buffer_names)r*   r~   r   r   ignored_parameterspassed_as_ignored_statesignored_states_lists          rB   _init_ignored_module_statesr     s     "~'A:
 	
 -T9">2148 %4%@D!b%	
 !#)!,bll;;!41O1&JE/E
 #<#E LrD   r   c                    [        U 5      S:X  a  gU(       aj  [        S U  5       5      n[        S U  5       5      nU(       d>  U(       d6  [        U  Vs1 s H  n[        U5      iM     sn[        S9n[        SU 35      egg[        S U  5       5      (       d6  [        U  Vs1 s H  n[        U5      iM     sn[        S9n[        SU 35      egs  snf s  snf )	z
Check that the ignored states are uniformly parameters or uniformly modules.

We may remove this check in the future if we permit mixing.
r   Nc              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7frW   )rX   r   r   r[   r*   s     rB   r]   (_check_ignored_states.<locals>.<genexpr>J  s     UnUE2<<88nr_   c              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7frW   rX   r   Moduler   s     rB   r]   r   K  s     SN5*UBII66Nr_   )keyzUignored_states expects all nn.Parameter or all nn.Module list elements but got types c              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7frW   r   r   s     rB   r]   r   T  s     L^E:eRYY//^r_   z>ignored_modules expects nn.Module list elements but got types )ra   rb   sortedrN   reprr3   )r   r   
all_paramsall_modulesr*   sorted_typess         rB   r   r   ?  s     >aUnUU
SNSS+!N"KN54;N"KQUVL**69  #.z L^LLL!N"KN54;N"KQUVL%(  M #L #Ls   C"Cignored_params	device_idc                 b   SnUb7  [        U[        R                  5      (       a  UO[        R                  " U5      nUc  [        X5       H  nUR                  R                  S;   a  M  Uc  UR                  nM0  UR                  R                  UR                  :w  d  MV  [        SUR                   SUR                  R                   35      e   U=(       d    [        R                  R                  5       nUR                  S:X  a  [        S5      e[        R                  " U5      U l
        U $ )a  
Determine device handle used for initializing FSDP.

If a device is specified by ``device_id``,
then returns device handle corresponds to that device type. Otherwise, If the
module is already on a non-CPU device, then the device type is that non-CPU device type.
If the module is on CPU or meta, then the device type is the current accelerator device.
See the :ref:`Accelerators<accelerators>` for details.


This method will be called once ignored parameters was determined, as the device handle maybe needed
for other initialization.
N>   cpumetazLFSDP does not support modules with different device types but got params on z and r   zOFSDP needs a non-CPU accelerator device, but no accelerator device is detected.)rX   torchdevice_get_orig_paramsrN   RuntimeError_C_get_acceleratorr   from_devicerK   )r*   r~   r   r   determined_deviceparams         rB   _init_device_handler   \  s   (  )U\\22 i( 	
  %f=E||  O3 ($)LL!<<$$(9(>(>>&-->-C-C,DE%,,J[J[I\^  > .L1J1J1L!!U*a  -889JKELrD   c                     [        U5      U l        0 nUR                  5        H  u  p4[        U5      nUR                  X#'   M      X l        U $ rW   )_get_buffer_names_buffer_namesnamed_buffersr   dtype_buffer_name_to_orig_dtype)r*   r~   r   buffer_namebuffers        rB   _init_buffer_stater     sT    
 ,F3E
 :<%335'428,,"/  6 (B$LrD   mixed_precisioncpu_offloadlimit_all_gathersuse_orig_paramsbackward_prefetch_limitforward_prefetch_limitc                    U R                   S:X  aU  U[        R                  :w  a0  [        R                  " SU=(       d    [        R
                   S3SS9  [        R                  nO.U[        R                  :X  a  [        R                  " S[        SS9  U=(       d    [        R
                  U l        U=(       d
    [        5       U l	        Ub5  [        R                  R                  S[        U R                  5       35        [        R                  R!                  ["        S	5      S
:H  U l        U=(       d
    ['        5       U l        X@l        XPl        [.        R0                  U l        S U l        [7        5       U l        [:        R<                  " 5       U l        [@        RB                  " U R>                  UU5      U l"        S U l#        0 nXl$        S n	Xl%        / n
Xl&        U $ )NrF   z/FSDP is switching to use `NO_SHARD` instead of z since the world size is 1.rU   
stacklevelzoThe `NO_SHARD` sharding strategy is deprecated. If having issues, please use `DistributedDataParallel` instead.   z'torch.distributed.fsdp.mixed_precision. 1)'r:   r   NO_SHARDwarningswarn
FULL_SHARDFutureWarningr,   r   r   r   r   _log_api_usage_oncestrosenvirongetr   _use_full_prec_in_evalr   r   r   _use_orig_paramsr   IDLEtraining_state_is_rootr   _free_event_queuerY   get_debug_level_debug_levelexec_order_utils_ExecOrderData_exec_order_data_unshard_event_fully_sharded_module_to_handle_handleparams)r*   r,   r   r   r   r   r   r   r   r   r   s              rB   _init_core_stater     s    1 0 9 99MMA$C(8(C(CD E'' 	 -55	.77	7< 	
 0N3C3N3NE+?~/?E"$$5c%:O:O6P5QR	
 	

2B73> 
  $3z|E/,(--EEN-/E--/E-<<E
  E IK#,K) '+GM"$FLLrD   c                 `    / nXl         / nX l        / nX0l        SU l        S U l        S U l        U $ )NT)_root_pre_forward_handles_pre_forward_handles_post_forward_handles_sync_gradients
_comm_hook_comm_hook_state)r*   r   r   r   s       rB   _init_runtime_stater     sE     8:&?#24!535"7 EE!ELrD   backward_prefetchforward_prefetchc                     Xl         X l        U $ rW   )r   r   )r*   r   r   s      rB   _init_prefetching_stater     s     0- LrD   c                     Ub  UR                  5       OS nU(       a+  X R                  :w  a  [        U R                  5      U l        U $ S U l        U $ rW   )_get_root_meshr6   r#   rK   _fsdp_extension)r*   r.   	root_meshs      rB   _init_extensionr      sR    
 1<0G**,TI y$6$66 1%2F2F G
 L !%LrD   c                 ~    [         R                  U l        [        5       n[	        5       U l        Xl        0 nX l        U $ rW   )r!   FULL_STATE_DICT_state_dict_typer   r   _optim_state_dict_config_state_dict_config_unshard_params_ctx)r*   state_dict_configunshard_params_ctxs      rB   _init_state_dict_stater     s;    *::E)<)>%=%?E"057 2LrD   r   c                     U H`  n[        UR                  5      S:X  d  M  SnU R                  5        H  u  pEX%L d  M  Un  O   U(       d  [        S5      e[	        SU S35      e   g)z
Verify if the parameters are accepted by FSDP. The only restriction now
is that the parameter cannot be a scalar tensor (param.shape == []).
r   r   zExpected param_name to be setz/FSDP doesn't support scalar parameters. Change z& to a 1D tensor with numel equal to 1.N)ra   shapenamed_parametersrs   r3   )r~   r   r   
param_namenameparam_s         rB   _verify_managed_paramsr     sw    
 u{{q J & 7 7 9?!%J !: $%DEE$%KM  rD   fully_sharded_moduleparam_init_fnsync_module_statesc                   ^  [        UT R                  U5        [        UT R                  T R                  5      n[        UT R                  T R                  5      u  pgU(       d  U(       a  Ub  [        XT R                  5        OJU(       a#  [        UUT R                  T R                  5        O U(       a  [        R                  " UU 4S jS9  T R                   VV	s1 s H  nUR                  5         H  n	U	iM     M     n
nn	[        UT R                  U
U5        [        UT R                  UT R                  T R                  5      T l        [        [!        UT R                  5      5      n[#        X5        U(       a@  [%        XT R&                  5        T R(                  [*        ;   a  [%        XT R,                  5        [/        T X5        T $ s  sn	nf )zHInitialize a ``FlatParamHandle`` from a module ``fully_sharded_module``.c                 J   > [        U 5      S L =(       a    U TR                  ;  $ rW   )r   r   )	submoduler*   s    rB   <lambda>0_init_param_handle_from_module.<locals>.<lambda>P  s(    '=i'HD'P (8!7!77(8rD   )check_fn)_check_single_device_moduler   _get_device_from_device_idr8   rK   _need_to_materialize_moduler   _materialize_with_param_init_fn_materialize_meta_moduler'   materialize_modulebuffers_move_module_to_device_get_compute_devicecompute_devicer   r   r   _sync_module_params_and_buffersr+   r,   r4   r;   _init_param_handle_from_params)r*   r   r   r   r   device_from_device_idis_meta_moduleis_torchdistX_deferred_initignored_moduler   ignored_buffersmanaged_paramss   `           rB   _init_param_handle_from_moduler  1  s      4e6K6KYW65::u33 3Ne33U5K5K3/N 	5=;T' 1G1G	
 
  ""  		
 
%(( 8	
 $444N$,,.F 	. 	4   	 /

E *+?AVAVWXN/@' %2E2E	
 ""&@@+$e6J6J #5.OL?s   $Gc                    [        U5      S:X  a  g [        UUU R                  [        U R                     U R
                  R                  U R                  R                  U R                  R                  U R                  R                  U R                  U R                  U R                  S9nUR                  5         U R                  (       a  [!        S5      eU R"                  R%                  UR&                  5        X0l        X0R(                  UR*                  '   [,        R.                  " S5      nU R
                  R                  (       a-  UR&                  R.                  U:w  a  UR1                  U5        g g g )Nr   )fsdp_extensionz!Expected state._handle to be Noner   )ra   r   r  SHARDING_STRATEGY_MAPr,   r   offload_paramsr   param_dtypereduce_dtypekeep_low_precision_gradsr+   r   r   shardr   rs   r   append
flat_paramr   _fully_sharded_moduler   r   flat_param_to)r*   r   r   handle
cpu_devices        rB   r  r  v  s'    6{ae556(())**66,,F LLN}}@AA	LL))*MJP))&*F*FGe$J''F,=,=,D,D
,RZ( -S'rD   root_moduler   c           	         Sn Ub  [        U5      O	[        5       nU Ha  n[        U[        R
                  R                  5      (       d  [        US[        U5       3-   5      e[        U5      (       d  MX  [        S5      e   U R                  5        H1  n[        R                  " U5      (       a  M   UR                  U5        M3     U VVs1 s H>  nUR                  5         H&  n[        U[        R                  5      (       a  M$  UiM(     M@     nnnX;   a  [        R                   " SW 3SS9  U R                  5        HJ  n[        U5      n	U	c  M  [#        U	S	5      (       d  [%        S
5      eUR'                  U	R(                  5        ML     U$ ! [         a   n[        US[        U5       3-   5      UeSnAff = fs  snnf )aP  
Check that ``_ignored_modules`` is an iterable of ``nn.Module`` s without any FSDP instances.

Return the modules contained in their module
subtrees as a :class:`set`. Nested FSDP instances are excluded, but their
already-computed ignored modules are included.

``_ignored_modules`` represents the argument passed by the user to FSDP.
z>`ignored_modules` should be an iterable of `torch.nn.Module`s Nzbut got zbut got an iterable with z1`ignored_modules` should not include FSDP moduleszTrying to ignore the top-level module passed into the FSDP constructor itself will result in all parameters being ignored and is not well-supported: rU   r   r   z?Expected optional_fsdp_state to have _ignored_modules attribute)set	TypeErrorrN   rX   r   r   r   r   r3   modulestraversal_utils_composableadd	fsdp_fileFullyShardedDataParallelr   r   hasattrrs   updater   )
r'  r   
msg_prefixignored_root_moduleser~   childr   r  optional_fsdp_states
             rB   r   r     s    RJQ%5%AC !su 	
 '&%((//22J+DT&\N)SSTT!&)) PQQ ' %%'**622 $$V, ( +*F^^%E%!C!CD 	% 	*   %228; 		
 !((*	4Y?*.0BCC$U  ""#6#G#GH + Q  Q
x5E0F/G%HHIqPQ$s#   F 7F?
F?
F<F77F<r   c                    [        5       nU VVs1 s H.  oDR                  5         H  n[        U5      (       a  M  UiM     M0     nnnUR                  U5        Ub5  U Vs1 s H  n[        U5      (       a  M  UiM     nnUR                  U5        U R	                  5        HJ  n[        U5      n	U	c  M  [        U	S5      (       d  [        S5      eUR                  U	R                  5        ML     U$ s  snnf s  snf )z
Return the parameters of the modules in ``ignored_modules`` and the parameters in ``ignored_parameters``.

:class:`FlatParameter` s are excluded from the result.
r   z>Expected optional_fsdp_state to have _ignored_params attribute)	r)  
parametersr   r2  r+  r   r1  rs   r   )
r'  r   r   all_ignored_paramsmpparams_in_ignored_modulesparams_in_ignored_parametersr  r7  s
             rB   r   r     s     36% #!"aLLNqBTUVBWN?  ! 78%)(
)!1CA1FA) 	% (
 	!!">? !((*	4Y?*.0ABB$T  %%&9&I&IJ + -!(
s   'C1
C1 C78C7c           	         [        5       nU VVs1 s H  o3R                  5         H  oDiM     M     nnnUR                  U R                  5        VVs1 s H  u  pdXE;   d  M  [	        U5      iM     snn5        U R                  5        HJ  n[        U5      nUc  M  [        US5      (       d  [        S5      eUR                  UR                  5        ML     U$ s  snnf s  snnf )z6Return the cleaned buffer FQNs in ``ignored_modules``.r   zDExpected optional_fsdp_state to have _ignored_buffer_names attribute)
r)  r  r2  r   r   r+  r   r1  rs   r   )	r'  r   all_ignored_buffer_namesr;  r   buffers_in_ignored_modulesr   r  r7  s	            rB   r   r     s    
 *- ("'1iikFk  " ## (3'@'@'B	
'B#3 +k*'B	
 !((*	4Y?*.0GHH$Z  %++,?,U,UV + $#-"
	
s   "CC
"C
c                 j    U R                  5        VVs1 s H  u  p[        U5      iM     snn$ s  snnf )zrReturn the fully prefixed names of all buffers in the module hierarchy rooted at ``root_module`` as a class:`set`.)r   r   )r'  r   rh   s      rB   r   r     s8     >I=V=V=X=X>;+&=X  s   /c                     [        X5       Vs1 s H  o3R                  iM     nn[        U5      S:X  a)  [        R                  " S5      U;   a  Uc  [	        S5      eg[        U5      S:  a  [	        SU 35      egs  snf )z
Raise an error if ``module`` has original parameters on multiple devices, ignoring the parameters in ``ignored_params``.

Thus, after this method, the
module must be either fully on the CPU or fully on a non-CPU device.
rU   r   NzTTo support a module with both CPU and GPU params, please pass in device_id argument.rF   z;FSDP only supports single device modules but got params on )r   r   ra   r   r   )r~   r   r   r   devicess        rB   r  r    s     *:&)QR)Q||)QGR 7|qU\\%0G;5  
 
W	I'S
 	
 
 Ss   A;r8   device_handlec                 r   U c  g[        U [        R                  5      (       a  U O[        R                  " U 5      nUR                  S:w  ak  UR                  c^  [
        R                  " SU  SU SUR                  5        SUR                   S3	SS	9  [        R                  " UR                  5       5      nU$ )
z
Return a ``torch.device`` for the specified ``device_id``.

Processes ``device_id`` and returns either the corresponding device or
``None`` if ``device_id`` is ``None``.
Nr   z"FSDP got the argument `device_id` z	 on rank zJ, which does not have an explicit index. FSDP will use the current device z6. If this is incorrect, please explicitly call `torch.zk.set_device()` before FSDP initialization or pass in the explicit device index as the `device_id` argument.rU   r   )rX   r   r   rN   indexr   r   current_device)r   r8   rE  r   s       rB   r  r  =  s     	5<<88	ell9>U  {{e 409f 00=0L0L0N/O PCCI;;- P11 	
 m::<=MrD   c                 2   [        [        X5      5      n[        S U 5       5      nU R                  5        H.  nXR;   a  M
  UR	                  SS9 H  nXFR
                  -  nM     M0     U(       + =(       a    [        =(       a    [        S U 5       5      nXG4$ )z
Return if ``module`` has parameters on meta device and if ``module`` is using torchdistX deferred initialization.

At most of the returned bools can
be ``True``. If either is ``True``, then ``module`` needs to be
materialized.
c              3   8   #    U  H  oR                   v   M     g 7frW   )is_metar[   r   s     rB   r]   ._need_to_materialize_module.<locals>.<genexpr>h  s     CN5Ns   Frecursec              3   N   #    U  H  n[         R                  " U5      v   M     g 7frW   )r(   is_fakerL  s     rB   r]   rM  t  s     @U##s   #%)r   r   anyr+  r  rK  _TORCHDISTX_AVAIL)r~   r   r   r  r  r  bufr  s           rB   r  r  [  s     *6BCNCNCCN ^^%	'$$U$3Ckk)N 4 &  	A	A@@@  
 66rD   c                     [        U5      (       d  [        SU S[        U5       35      e[        X5      nU H  nU" U5        M     g )Nz	Expected z to be callable but got )callabler3   rN   _get_modules_to_materialize)r'  r   r   modules_to_materializer~   s        rB   r	  r	  y  sR    
 M""&>tM?R>ST
 	
 9V(f )rD   r  c           	      P   U=(       d$    [         R                  " UR                  5       5      n[        X5      nS n [         R                  " 5          U Ht  n[
        R                  " UR                  SS9UR                  SS95      n[        [        U5      5      S:  nU(       d  MT  UR                  USS9  UR                  5         Mv     S S S 5        g ! , (       d  f       g = f! [         a5  n	[        R                  " S[!        U	5       S[#        U5       S3SS	9  U	eS n	A	ff = f)
NFrN  r   )r   rO  zIUnable to call `reset_parameters()` for module on meta device with error z(. Please ensure that your module oftype z* implements a `reset_parameters()` method.rU   r   )r   r   rH  rW  no_grad	itertoolschainr9  r  ra   r   to_emptyreset_parametersBaseExceptionr   r   r   rN   )
r'  r  r   rE  materialization_devicerX  r~   module_state_iterhas_module_statesr5  s
             rB   r
  r
    s    3 ell$$&7 9VF ]]_0 %.OO%%e%4NN5N1%!
 %(->(?$@1$D!$$OO+A5OQ++- 1 __  !!$Q )L>!KM 		
 s<   C& AC($CC& 
C#C& #C& &
D%00D  D%c                 J   / n[         R                  " U /5      nU 1nU(       a  UR                  5       nUR                  U5        UR	                  5        HA  nXd;  d  M
  [        U5      b  M  Xa;  d  M  UR                  U5        UR                  U5        MC     U(       a  M  U$ rW   )collectionsdequepopleftr!  childrenr   r.  )r'  r   rX  queuevisited_modulesr~   child_modules          rB   rW  rW    s    
 /1{m,E'2mO
%%f-"OO-L3*<8@ 7##L1\* . % "!rD   r  c                   ^ [         R                  " S5      mUGb"  [        R                  " 5       nUR	                  U 5        / n/ nU(       a  UR                  5       nUR                  U4S jUR                  SS9 5       5        UR                  U4S jUR                  SS9 5       5        UR                  5        H5  n[        U[        R                  5      (       a  M$  UR	                  U5        M7     U(       a  M  U V	s/ s H  oU;  d  M
  U	PM     n
n	U V	s/ s H  oU;  d  M
  U	PM     nn	[        XU5        g[        [        X5      S5      nUb  UR                  T:X  a  [!        5         gggs  sn	f s  sn	f )a  
Move ``module`` depending on ``device_from_device_id`` and its current device.

This includes moving ignored modules' parameters.

- If ``device_from_device_id`` is not ``None``, then this moves
``module`` to the device.
- If ``device_from_device_id`` is ``None``, then this does not move
``module`` but warns the user if it is on CPU.

Precondition: ``_check_single_device_module()``.
r   Nc              3   L   >#    U  H  nUR                   T:X  d  M  Uv   M     g 7frW   r   )r[   r   r&  s     rB   r]   )_move_module_to_device.<locals>.<genexpr>  s&      BE<<:- B   $	$FrN  c              3   L   >#    U  H  nUR                   T:X  d  M  Uv   M     g 7frW   rm  )r[   r   r&  s     rB   r]   rn    s&      @F==J. @ro  )r   r   rd  re  r!  rf  extendr9  r  rg  rX   r/  r0  _move_states_to_devicenextr   _warn_cpu_init)r~   r   r  r  rh  r   r  curr_moduler  r<  params_to_movebufs_to_mover   r&  s                @rB   r  r    sV   $ e$J( /:.?.?.AV%'&(--/K
 MM (33E3B 
 NN )11%1@ 
 )113	!)Y-O-OPPLL+ 4! e& &,GV/F!VG#*G7a.F7G~=RS!&94@EU\\Z7 8 HGs   	E3E3	E8)E8r  c                 \   [        U 5      S:X  a  [        U5      S:X  a  g[        U 5      S:  a  U S   R                  nO[        U5      S:  a  US   R                  n[        R                  " S5      nUb  U  Hn  n[        R                  " 5          UR	                  U5      Ul        UR                  b*  UR                  R	                  U5      UR                  l        SSS5        Mp     U H  nUR	                  U5      Ul        M     gWU:X  a  [        5         gg! , (       d  f       M  = f)z
Move states to the specified device.

Precondition: ``_check_single_device_module()`` and module's parameters and
buffers have been materialized if needed.
r   Nr   )ra   r   r   rZ  todatagradrt  )r   r  r  rH  r&  r   r   s          rB   rr  rr    s     6{aCLA-
6{Q))	W	 **e$J( E"XX&;<
::)&+jjmm4I&JEJJO ! 
 F ))$9:FK 	:	% 
& !s   AD
D+	c                  .    [         R                  " SSS9  g )Nam  The passed-in `module` is on CPU and will thus have FSDP's sharding initialization run on CPU, which may be slower than on GPU. We recommend passing in the `device_id` argument for FSDP to move `module` to GPU for the sharding initialization. `module` must also be on GPU device to work with the `sync_module_states=True` flag since that requires GPU communication.rU   r   )r   r    rD   rB   rt  rt    s    MM	1 rD   c                    [        [        X5      S5      nUb'  UR                  R                  S:w  a  UR                  nO$[        R                  " UR                  5       5      nUb  Xb:w  a  [        SU SU SU 35      eU$ )a  
Determine and return this FSDP instance's compute device.

If the module is already on a non-CPU device, then the compute device is that non-CPU
device. If the module is on CPU, then the compute device is the current
device.

Since this method should be called after materializing the module, any
non-CPU device should not be meta device. For now, the compute device is
always a CUDA or CUDA-like device with its explicit index.

Precondition: ``_check_single_device_module()`` and
``_move_module_to_device()``.
Nr   z4Inconsistent compute device and `device_id` on rank z: z vs )rs  r   r   rN   r   rH  r3   )r~   r   r  r8   rE  r   r  s          rB   r  r  %  s    * !&94@EU\\..%7m&B&B&DE(^-TB4&d#8"9;
 	
 rD   c                    / nU R                  5        H  n[        U[        S5      (       a  M  [        U[        S5        UR	                  5       n[        U5      (       a@  UR                  5       u  pgU Vs/ s H  n[        XX5      PM     n	nUR                  U	5        M  UR                  U5        M     U Ht  n
U
R	                  5       n[        U5      (       a@  UR                  5       u  pgU Vs/ s H  n[        X5      PM     nnUR                  U5        Mc  UR                  U5        Mv     [        U5        [        UU[        SS9  gs  snf s  snf )z
Synchronize module states (i.e. parameters ``params`` and all not-yet-synced buffers) by broadcasting from rank 0 to all ranks.

Precondition: ``sync_module_states == True`` and ``self.process_group`` has
been set.
FTr   )srcN)r  getattrFSDP_SYNCEDsetattrdetachr%   __tensor_flatten__rq  r!  +_check_module_states_for_sync_module_statesr$   PARAM_BROADCAST_BUCKET_SIZE)r~   r   r+   module_statesr   detached_bufferattrsrh   attrinner_buffersr   detached_paraminner_paramss                rB   r  r  H  s#    )+M.."v{E22FK.$mmoO,_== +==?LQ RED!?E R$$]3$$_5 # (88%88:HEFKLedGN9eLL  .  0  0>#	 !S Ms   5E
/Er  c                 X    U (       a#  [        S U  5       5      (       a  [        S5      eg g )Nc              3   f   #    U  H'  oR                   [        R                   " S 5      :H  v   M)     g7f)r   N)r   r   )r[   tensors     rB   r]   >_check_module_states_for_sync_module_states.<locals>.<genexpr>w  s#      ;He,,=s   /1zThe module has CPU parameters or buffers when `sync_module_states=True`, which requires them to be on GPU. Please specify the `device_id` argument or move the module to GPU before passing it to FSDP.)rR  r3   )r  s    rB   r  r  t  s<      ;H   C
 	
}rD   c              #      #    U R                  5       n  [        U5      nX1;  a  [        U5      (       d  Uv   M&  ! [         a     gf = f7f)a,  
Return an iterator over the original parameters in ``module``.

The iterator does not return
the parameters in ``ignored_params``, any ``FlatParameter`` s (which may be
present due to nested FSDP wrapping), or any original parameters already
flattened (only relevant when ``use_orig_params=True``).
N)r9  rs  r   StopIteration)r~   r   	param_genr   s       rB   r   r     sT      !!#IOE*3Ee3L3L   s    A'; 
AAAAc           	          [        U 5       HH  u  p#X1;  d  M  [        U5      (       a  M  [        SU SUR                  5        SUR                   35      e   g)a  
Check that original parameters in ``fsdp_module`` have been flattened.

The flattened parameters are made
invisible to ``named_parameters()`` for the module hierarchy rooted at
``fsdp_module``. This should be called as a sanity check after flattening
the wrapped module's parameters.
z Found an unflattened parameter: z;  N)r   r   r   r9   	__class__)fsdp_moduler   r   r   s       rB   _check_orig_params_flattenedr    sY     ?{K
&/A%/H/H2:,b::<.%//!24  LrD   c                 j    U [         R                  :X  a  [        R                  $ [        R                  $ rW   )r   r   r   allreduce_hookreduce_scatter_hook)r,   s    rB   _get_default_comm_hookr    s3      0 9 99 	$$ ..rD   c                 *    [         R                  " U S9$ )NrG   )r   r<   rG   s    rB   rO   rO     s     %%MBBrD   rW   )rd  r[  r   r   collections.abcr   r   r   r   typingr   r   r	   r
   r   torch.distributeddistributedrY   (torch.distributed.fsdp._exec_order_utilsfsdp_exec_order_utilsr   'torch.distributed.fsdp._traversal_utils_traversal_utilsr,  2torch.distributed.fsdp.fully_sharded_data_parallelfully_sharded_data_parallelr/  torch.nnr   (torch.distributed.algorithms._comm_hooksr   torch.distributed.device_meshr   "torch.distributed.distributed_c10dr   $torch.distributed.fsdp._common_utilsr   r   r   r   r   r   r   "torch.distributed.fsdp._flat_paramr   r   r   r   %torch.distributed.fsdp._limiter_utilsr   torch.distributed.fsdp.apir   r   r   r   r   r   r    r!   torch.distributed.fsdp.wrapr"   &torch.distributed.tensor.parallel.fsdpr#   torch.distributed.utilsr$   torch.utils._python_dispatchr%   torch.utils.hooksr&   rS  
torchdistxr'   r(   ImportErrorr  r  r`   rZ   HybridShardProcessGroupTypeProcessGroupTyper   r   SHARD_GRAD_OPHYBRID_SHARD_HYBRID_SHARD_ZERO2r  r4   #NO_RESHARD_AFTER_FORWARD_STRATEGIESrC   r5   boolrM   rH   intri   r|   rJ   r   r   r   r   r   r)  r   r   r   r   r   r   r   r   r   r  r  r   r   r   r   r   r  r  r  r	  r
  rW  Tensorr  rr  rt  r  r  r  r   r  r  r<   rO   r}  rD   rB   <module>r     s
     	  C C > >    C C A A F F  B 4 A    B	 	 	 0 D < F 1 . 0 #D$5$5t7H7H$HI D--0KKL 
 5>>!7!B!B""$:$H$H!!#9#F#F((*@*T*T  !!(( 
 ""((' #  &*00#0 (0 dN	0
 d"0 0 0f ((#( ( 	( (V # $   IJ I4 I I  ARAR   "++"" 
" "J++ 4d///0&  ++II+ ehhoo.5+ UXX//0uxx 
	+ + +\I9=	: --II- %- U\\!D(	-
 - -` II  " ??'$.? $d*? d"	?
 ? ? !?  ? ? ?D    		'	 	 		 	 : J *   *   299 d2<<6H T ( AA))A U\\!D(A RYYK-.5	A
 A A AH ))) ))) )>::uxx/$6: 	^:@ ?C"")" !!3!34t;" 				"J$$)$ 	X$@299 S 
II
%
 U\\!D(
 
	
<U\\!D(
 % \\D	<7II7%7 ^7 4:	7<RYYK-. ^ 
	## <<$.# ^# %	#L""-0^"	"))_",3II3%3 &3 !<<$.	3
 
3l%,, !<<$. 
	@	II% !<<$. 	
 % \\F)II)) $$) 
	)X

%

	

II% bll,% 
(.> C$$CCw#  s   ^& &^21^2