
    ȅi4                     ~   S SK r S SKJr  S SKJr  S SKrS SKJr  S SK	J
r
  S SKJr  S SKJr  S SKJr  S SKJr  S S	KJr  S S
KJr  S SKJrJrJrJrJrJrJr  S SKJ r J!r!  S SK"J#r#J$r$  S SK%J&r&  S SK'J(r(  S SK)J*r*J+r+J,r,  S SK-J.r.  S SK/J0r0  S SK1J2r2  S SK3J4r4  \5\6\7\\8   S-  \\8   4   4   r9S/r:S*S\8S\6S\64S jjr; S+S\Rx                  S-  S\4S jjr=S\R|                  S\?4S jr@ S*S\S\\8   S\6S\R|                  4S  jjrAS!\S\7\9\Rx                  S-  4   4S" jrB " S# S$\5      rC S+S%\S&\6S'\(S(\!S-  S\4
S) jjrDg),    N)Sequence)cast)_get_device_module)ShardedTensor)TensorProperties)Shard)ChunkShardingSpec)unflatten_state_dict)DefaultLoadPlanner)BytesStorageMetadataChunkStorageMetadataMetadataMetadataIndexSTATE_DICT_TYPEr   TensorStorageMetadata)LoadPlanLoadPlanner)_create_read_items create_read_items_for_chunk_list)load_state_dict)StorageReader)_element_wise_add_element_wise_sub_normalize_device_info)_get_default_group)_create_chunk_sharded_tensor)_remote_device)DTensor!load_sharded_optimizer_state_dictglobal_rankdevice_typereturnc                     US:X  a  g[        U5      nUR                  5       (       a  [        XUR                  5       -  5      $ g)Ncpu)r   is_availabler   device_count)r    r!   device_modules      `/home/james-whalen/.local/lib/python3.13/site-packages/torch/distributed/checkpoint/optimizer.py_gen_rank_devicer)   8   sI    e&{3M!!##%}'A'A'CC
 	
     pgc                    [         R                  R                  U 5      R                  nU c>  [	        [         R
                  " 5       5       Vs/ s H  nSU S[        X!5       3PM     nnOM[	        U R                  5       5       Vs/ s H)  nSU S[        [         R                  " X5      U5       3PM+     nn[        S[        [        [        [        -     U5      S9$ s  snf s  snf )Nrank:/r   dim
placements)distdistributed_c10d_get_pg_default_devicetyperangeget_world_sizer)   sizeget_global_rankr	   r   listr   str)r+   pg_device_typeidxr1   s       r(   _create_colwise_specr>   C   s     **AA"EJJN	z T0023
3 C5*3?@A3 	 

 RWWY'
' C5*4+?+?+H.YZ[' 	 
 ^c12J? 


s   C0C#valc                    [        U 5      [        L a  [        U R                  5       5      S:X  a  g[        U R                  5       S   R                  5      [        L a  g[        U R                  5       S   R                  5      [
        L a  [        S5      e g[        U 5      [
        L aC  [        U R                  5      [
        L d  [        U R                  5      [        L a  [        S5      eg)Nr   FTz1Cannot handle DTensor nested inside ShardedTensorzCannot handle nested DTensor)r5   r   lenlocal_shardstensorr   
ValueError_local_tensor)r?   s    r(   _is_nested_tensorrF   W   s    CyM!s!"a'  "1%,,->  "1%,,-8PQQ 9 	 
cg	S7*d33D3D.E.V788r*   propsr8   c           	      P   US:X  a2  [        [        R                  [        U5      R	                  5       5      nO.[        R                  " U[        U5      R	                  5       5      n[        R
                  " UU R                  U R                  U R                  U R                  US9$ )Nr$   )r8   dtypelayoutrequires_grad
pin_memorydevice)
r   torchrM   r   current_deviceemptyrI   rJ   rK   rL   )rG   r8   r!   rM   s       r(   _alloc_tensorrQ   f   s     eell$6{$C$R$R$TU+K8GGI
 ;;kk||))## r*   
state_dictc                    0 nSnU R                  5        H  u  p4SUR                  5       4X'   [        U5      (       d  M+  [        UR	                  5       5      S:X  d  [        S5      e[        U[        5      (       d  [        S5      eUR	                  5       S   nUR                  R                  UR                  R                  4X'   UR                  R                  nM     UU4$ )a  
Load the right TP slice of the optimizer state.

This is not easy since the per-tensor slicing can't be inferred from checkpoint metadata.
We take advantage of the model state_dict producing a sliced ST to figure out what we need to load.
This is pretty fragile and it might be easier for FSDP to compute this info for us.
Returns a dictionary where keys are the same of the state_dict and the value is a tuple of
(offset, size) for the current rank TP slice.
N.B. The state_dict *MUST* come from FSDP.sharded_state_dict.
N   z%Cannot handle ST with multiple shardsz$Can only handle nested ShardedTensorr   )itemsr8   rF   rA   rB   AssertionError
isinstancer   metadatashard_offsetsshard_sizesrC   _process_group)rR   specsdp_pgkeyvalueshards         r(   _get_state_dict_2d_layoutra   z   s     #%E&*E &&(
EJJL)
U##u))+,1$%LMMe]33$%KLL&&(+E,,**EJ LL//E ) 	 r*   c                      ^  \ rS rSr% \\\4   \S'   \\S'   \\S'   S\\	\
\   4   SS4U 4S jjrS\4S	 jrS
\S\R                   4U 4S jjrSrU =r$ )_ReaderWithOffset   translationrR   rX   fqn_to_offsetr"   Nc                 j   > [         TU ]  5         Xl        [        0 5      U l        0 U l        0 U l        g N)super__init__rf   r   rX   rR   re   )selfrf   	__class__s     r(   rj   _ReaderWithOffset.__init__   s.    * r*   c           	         / n0 U l         U R                  R                  5        GH  u  p#U R                  R                  U   n[        U[        5      (       d  U[        X$U5      -  nME  X R                  ;  a  U[        X$U5      -  nMe  U R                  U   n[        UR                  5       5      S:X  d  [        S5      eUR                  5       S   n[        [        R                  " [        UR                  R                   U5      5      [        R                  " UR                  R"                  5      S9/n[%        U['        [(        U5      U5      nU H  n	U	R*                  R,                  c  [        S5      e[/        U	R*                  R,                  U5      n
[0        R2                  " U	R*                  [        R                  " U
5      S9nXR                   U	R*                  '   M     X-  nGM     [5        U5      $ )NrT   z Expected exactly one local shardr   )offsetssizesz"dest_index.offset must not be None)offset)re   rR   rU   rX   state_dict_metadatarW   r   r   rf   rA   rB   rV   r   rN   Sizer   rY   rZ   r   r   r   
dest_indexrq   r   dataclassesreplacer   )rk   requestsfqnobjmdrq   original_shardlocal_chunksreqsrioriginal_offsetoriginal_indexs               r(   create_local_plan#_ReaderWithOffset.create_local_plan   s   --/HC2237Bc=11.s<<,,,.s<<'',Fs'')*a/$%GHH --/2N$!JJ).*A*A*O*OQWX  **^%<%<%H%HI	L 4T/4lD
 ==''/()MNN"3BMM4H4H&"Q!,!4!4MM%**_*E" 3A  /  HM 0N !!r*   indexc                 T   > [         TU ]  U R                  R                  X5      5      $ rh   )ri   lookup_tensorre   get)rk   r   rl   s     r(   r   _ReaderWithOffset.lookup_tensor   s$    w$T%5%5%9%9%%GHHr*   )rf   rX   rR   re   )__name__
__module____qualname____firstlineno__dictr   __annotations__r   r   r;   r   intrj   r   r   rN   Tensorr   __static_attributes____classcell__)rl   s   @r(   rc   rc      sm    m]233d3+=&> 4 *"8 *"XI= IU\\ I Ir*   rc   model_state_dictoptimizer_keystorage_readerplannerc                 8   UR                  5       n[        U 5      u  pV[        R                  R	                  U5      R
                  n[        U5      nUce  / n	[        [        R                  " 5       5       H6  n
[        XzUR                  5       -  5      nU	R                  SU
 SU 35        M8     [        SU	S9nO[        U5      n0 n0 nUR                  R                  5        GH  u  nnUR                   U   nUS   U:w  a  M!  [#        U[$        5      (       a  SX'   M<  UR&                  R)                  5       S:X  a%  [+        UR,                  UR&                  U5      X'   M  Ucl  [/        [+        UR,                  UR&                  U5      [        R0                  " 5       [        R                  " 5       UR                  5       [3        5       S9X'   M  US	   nUR5                  USUR&                  45      S   n[7        UR,                  R8                  UR,                  R:                  UR,                  R<                  UR,                  R>                  UR,                  R@                  S
9nURC                  [D        RF                  " U5      U5      n/ n[        R0                  " U5      nURH                   Hi  n[K        [L        URN                  5      RQ                  5       U:w  a  M1  UR                  [S        [+        UR,                  URT                  U5      US95        Mk     [V        RX                  " UUUS9nUU;   a(  UU   S   b  [K        [Z        [\           UU   S   5      X'   UX'   GM     [_        UUUb  [a        U5      OUS9  [c        XR                   5      nU$ )a3  
Load a state_dict in conjunction with FSDP sharded optimizer state.

This is the current recommended way to checkpoint FSDP.
>>> # xdoctest: +SKIP
>>> import torch.distributed.checkpoint as dist_cp
>>> # Save
>>> model: torch.nn.Model
>>> optim_params = model.parameters()
>>> optim = torch.optim.SGD(optim_params, lr=0.01)
>>> # Save
>>> with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
>>>     state_dict = {
>>>         "optimizer": FSDP.optim_state_dict(model, optim),
>>>         "model": model.state_dict()
>>>     }
>>>     dist_cp.save_state_dict(
>>>         state_dict=optim_state,
>>>         storage_writer=dist_cp.FileSystemWriter("checkpoint"),
>>>         planner=dist_cp.DefaultSavePlanner(),
>>>     )
>>>
>>> # Load
>>> with FSDP.state_dict_type(model_tp, StateDictType.SHARDED_STATE_DICT):
>>>     model_state_dict = model_tp.state_dict()
>>>     checkpoint = {
>>>         "model": model_state_dict
>>>     }
>>>     dist_cp.load_state_dict(
>>>         state_dict=checkpoint,
>>>         storage_reader=dist_cp.FileSystemReader(checkpoint_file),
>>>         planner=dist_cp.DefaultLoadPlanner(),
>>>     )
>>>     model.load_state_dict(checkpoint["model_state"])
>>>
>>>     optim_state = dist_cp.load_sharded_optimizer_state_dict(
>>>         model_state_dict,
>>>         optimizer_key="optimizer",
>>>         storage_reader=dist_cp.FileSystemReader("checkpoint"),
>>>     )
>>>
>>>     flattened_osd = FSDP.optim_state_dict_to_load(
>>>        model, optim, optim_state["optimizer"]
>>>     )
>>>
>>>     optim.load_state_dict(flattened_osd)
Nr-   r.   r   r/   z
<bytes_io>rT   )rank
world_sizenum_devices_per_noder+      )rI   rJ   rK   memory_formatrL   )rC   rX   )process_group)rR   r   r   )2read_metadatara   r2   r3   r4   r5   r   r6   r7   r   r&   appendr	   r>   rr   rU   planner_datarW   r   r8   numelrQ   
propertiesr   get_rankr   r   ShardTensorPropertiesrI   rJ   rK   r   rL   build_metadatarN   rs   shards_metadatar   r   	placementr   r   rZ   r   +_init_from_local_shards_and_global_metadatar   r   r   rc   r
   )r   r   r   r   rX   layout_specsr]   dp_pg_device_typer'   r1   idevice_infosharding_specrR   rf   r^   r_   key_pathspec_key
alloc_sizer   st_mdrB   current_rankshard_mdsts                             r(   r   r      sJ   j ++-H34DEL--DDUKPP&'89M}
t**,-A0!}'A'A'C#CK aS+78	 .
 *aJG,U3 #%J.0M2288:
U((-A;-'e122*JO ::"+  %**.?JO ]:e..

<MN]]_..0%2%?%?%A%'JO  {H%))(T5::4FGJJ.&&,,''..#..<<#..<< ++66J "00J1GTEL==/L!11(:(:;@@BlR##,!,,h.B.BDU  "*	 2 JJe5B <'L,B1,E,Q%)(3-h9OPQ9R%S" JOq ;v %494E!-07	 &j2G2GHJr*   )cudarh   )Eru   collections.abcr   typingr   rN   torch.distributeddistributedr2   torch._utilsr   +torch.distributed._shard.sharded_tensor.apir   0torch.distributed._shard.sharded_tensor.metadatar   r   -torch.distributed._shard.sharded_tensor.shardr   :torch.distributed._shard.sharding_spec.chunk_sharding_specr	   )torch.distributed.checkpoint._nested_dictr
   ,torch.distributed.checkpoint.default_plannerr   %torch.distributed.checkpoint.metadatar   r   r   r   r   r   $torch.distributed.checkpoint.plannerr   r   ,torch.distributed.checkpoint.planner_helpersr   r   .torch.distributed.checkpoint.state_dict_loaderr   $torch.distributed.checkpoint.storager   "torch.distributed.checkpoint.utilsr   r   r   "torch.distributed.distributed_c10dr   #torch.distributed.fsdp._shard_utilsr   torch.distributed.remote_devicer   torch.distributed.tensorr   r   r;   tupler   STATE_DICT_2D_LAYOUT__all__r)   ProcessGroupr>   r   boolrF   rQ   ra   rc   r    r*   r(   <module>r      s    $     + E @ X J K   G K > 
 B L : , Cx}t';Xc]'J!KKL 
 (
# C S  $(D (5<< D   FL#+C=?B
\\(  
!2!2T!99: F:I* :IB #'	N%NN "N 4	N
 Nr*   