
    ȅiH                        S SK r S SKrS SKJr  S SKJrJr  S SKrS SKJ	s  J
r  S SKJ	s  Js  Jr  S SKJr  S SKJr  S SKJr  S SKJr  S SKJr  S S	KJr  S S
KJr  S SKJrJ r J!r!J"r"J#r#  \ RH                  " \%5      r&S\S\'4S jr( " S S5      r) S,S\S\S\\!   S\*S\+\+\,S4   \+\,S4   4   4
S jjr-\S\,S\,S\,S\#\-  S\,S\*S\+\,\R\                  S-  4   4S j5       r/\S\R\                  S\,4S j5       r0 S,S\S\S \1\,   S-  S\\!   S\*S\+\+\,S4   \+\,S4   4   4S! jjr2\Rf                  Rh                  r5S"\R\                  S\S\\!   S\+\1\,   \1\,   4   4S# jr6S$\Rn                  S\S\\!   S\Rn                  4S% jr8S&\Rr                  Rt                  S'\\;   S\4S( jr<S)\S\S\\!   S\+\,S4   4S* jr=S\Rn                  4S+ jr>g)-    N)Sequence)Anycast)
LazyString)	ShapeType)maybe_run_for_local_tensor)
DeviceMesh)redistribute_cost)DTensorSpec)OpSchema)_StridedShardPartial	Placement	ReplicateShardschemareturnc                     SU  S3$ )Nz%Implicit redistribution occurred for z/ while ExplicitRedistributionContext was active )r   s    Y/home/james-whalen/.local/lib/python3.13/site-packages/torch/distributed/tensor/_utils.py#_format_implicit_redistribution_msgr      s    26(:ijj    c                   |    \ rS rSrSr\R                  " 5       rSS\S\4S jjr	\
S\S\S\4S	 j5       rS
 rS rSrg)ExplicitRedistributionContext    a   
Within this context manager, DTensor will refuse to perform implicit redistribution,
instead raising an error.  Manual calls to ``redistribute()`` are required wherever a redistribution
must occur to avoid erroring.  This can be used to ensure that the user is aware of all redistribution.

Note: it is easier to use this mode on just the forward pass of a typical DTensor program, as the backwards pass
may contain implicit redistribution calls that are not visible to the user and difficult to replace with manual
calls.  Redistribution during backward can be made explicit by writing `autograd.Function`s that are no-op
during forward and perform a manual redistribution during backwards.

enable (bool) if False, disables the context manager. Can be used nested inside an enabled region.

strict (bool) if True, triggers on any redistribution.  If False, only triggers on redistributions that perform
communication.

mode (str) Determines what happens when ExplicitRedistributionContext triggers:
"raise": raises an exception, "warn" issues a warning
enablestrictc                 X    Xl         X l        US;  a  [        SU 35      eUS:H  U l        g )N)raisewarnzInvalid mode r   )_enable_strictRuntimeError_raise_on_redistribution)selfr   r   modes       r   __init__&ExplicitRedistributionContext.__init__6   s3    ((tf566(,%r   src_specdst_specredistribution_msgc                    [        U R                  SS 5      =n(       ao  SnUR                  (       a"  UR                  (       a  SnO[	        X5      S:*  nU(       d2  UR
                  (       a  [        U5      e[        R                  U5        g g g )N_activeTFr   )	getattr_localr!   r"   r
   r$   r#   loggerwarning)clsr)   r*   r+   instancealloweds         r   observe_redistribution4ExplicitRedistributionContext.observe_redistribution=   sv     szz9d;;8;G###G/CqHG44&'9::NN#56	  <r   c                 p    [        [        R                  SS 5      U l        U [        R                  l        U $ )Nr-   )r.   r   r/   _prevr-   )r%   s    r   	__enter__'ExplicitRedistributionContext.__enter__Q   s-    :AA9dS
7;%,,4r   c                 B    U R                   [        R                  l        g N)r8   r   r/   r-   )r%   exc_typeexc_valexc_tbs       r   __exit__&ExplicitRedistributionContext.__exit__V   s    7;zz%,,4r   )r!   r8   r$   r"   N)TFr   )__name__
__module____qualname____firstlineno____doc__	threadinglocalr/   boolr'   classmethodr   r   r5   r9   r@   __static_attributes__r   r   r   r   r       sc    & __F8t 8D 8 77 7 '	7 7&
Br   r   global_shapemesh
placementsskip_offset.c                     SnUR                  5       (       d  SU4$ [        XR                  UR                  5       X#5      $ )a  
Compute the local tensor shape and the global offsets into the original tensor
of a DTensor on its current global rank. This is useful for checkpointing purpose.

Example:
global_tensor = [[0,  1,  2,  3,  4], sharded on mesh (DP=2, TP=2) with (Shard(1), Shard(1))
                 [10, 11, 12, 13, 14]]

This table shows the return value of local_shape and global_offset for each rank.
(`local_tensor` is for illustration only).

Note how the first coordinate of global_offset is always 0, corresponding to tensor dim 0 being replicated.

Rank        local_tensor        local_shape     global_offset
-------------------------------------------------------------
0           [[0, 1],            (2, 2)          (0, 0)
             [10, 11]]

1           [[2],               (2, 1)          (0, 2)
             [12]]

2           [[3],               (2, 1)          (0, 3)
             [13]]

3           [[4],               (2, 1)          (0, 4)
             [14]]

Args:
    global_shape (ShapeType): The global shape of the DTensor.
    mesh (:class:`DeviceMesh`): The device mesh this DTensor is distributed on.
    placements (Sequence[:class:`Placement`]]): The placements of the DTensor.
    skip_offset (bool): If True, skip computing the global offsets and return an empty
        tuple for global_offset. This can improve performance when only the local shape
        is needed. Defaults to False.

Return:
    local_shape: the shape of the DTensor's _local_tensor on the current rank.
    global_offset: a tuple of offsets for each dimension of the global tensor shape,
    identifying how this shard fits into the global tensor in each dimension. If
    skip_offset is True, this will be an empty tuple.

r   )r   )_is_current_rank_part_of_mesh&_compute_local_shape_and_global_offsetshapeget_coordinate)rL   rM   rN   rO   empty_offsets        r   %compute_local_shape_and_global_offsetrV   Z   sE    ` L--//l##1jj$"5"5"7 r   curr_local_sizemesh_dim_sizerank	placementzero_global_offsetc                    U UUS.n[        U[        5      (       a  SUS'   UR                  " S0 UD6u  pU(       a  US 4$ US:X  a  U[        R                  " XUS-   5      4$ [        U[
        5      (       aF  [        U[        5      (       d1  [        U	[        5      (       d   e[        R                  " XU-   5      n
O-[        U	[        5      (       d   e[        R                  " U	5      n
Uc  X4$ XU
   4$ )N)rW   
num_chunksrY   Freturn_first_offsetr      r   )	
isinstancer   _local_shard_size_and_offsettorcharanger   intlisttensor)rW   rX   rY   rZ   previous_offsetsr[   rO   kwargs
shard_sizeshard_offsetsindexs              r   _get_shard_size_and_offsetsrl      s     +#F
 )]++(-$% ) F F P PJ4Q5<<(:QR<RSSS)U##Jy-,P,P-----]J,FG-....]+  E222r   offsetsc                     [        U S   5      $ )Nr   )rd   )rm   s    r   _get_first_offsetro      s    wqz?r   
mesh_shapemy_coordinatec           
         [        U 5      n0 n[        U5       H  u  px[        U[        [        45      (       d  M"  UR
                  n	X	   n
U	[        U5      :  d   SU	 S[        U5       35       eUR                  U	5      nUc   e[        XY   X   X'   UUU
U5      u  pXU	'   XU	'   M     U(       a  [        U5      S4$ S/[        U 5      -  nUR                  5        H  u  p[        U5      X'   M     [        U5      [        U5      4$ )a  
Suppose you have a full tensor with size global_shape, and you have sharded
it according to placements for mesh_shape.  This function returns, for a
specific coordinate my_coordinate in the device mesh:

    - The size of your local shard WITHOUT padding (i.e., if you have
      an uneven split, your size might be smaller than the other entries
      in your dim), and

    - Where the data for your shard begins, in the full tensor.

This function is fairly simple if your tensor is evenly sharded; the complication
is around uneven splits.  There is also some complication for handling StridedShard,
which changes the order you should apply sharding.

Args:
    global_shape (ShapeType): The global shape of the tensor.
    mesh_shape (ShapeType): The shape of the device mesh.
    my_coordinate (Optional[list[int]]): The coordinate of the current rank in the device mesh.
    placements (Sequence[Placement]): The placements of the DTensor.
    skip_offset (bool): If True, skip computing the global offsets and return an empty
        tuple for global_offset. This can improve performance when only the local shape
        is needed. Defaults to False.

Returns:
    tuple: A tuple containing:
        - local_shape (tuple[int, ...]): The shape of the local shard on the current rank.
        - global_offset (tuple[int, ...]): The offsets for each dimension identifying where
          this shard begins in the global tensor. If skip_offset is True, this will be an
          empty tuple.
Sharding dim  greater than tensor ndim r   r   )re   	enumerater`   r   r   dimlengetrl   tupleitemsro   )rL   rp   rq   rN   rO   local_shapeshard_dim_to_global_offsetsmesh_dimrZ   	shard_dimr[   rg   ri   rj   global_offsetglobal_offsetss                   r   rR   rR      s<   N |$K #%(4)e]%;<<MM	)43{++ 	
I;&@[AQ@RS	
+ 7::9E((($?" #%
!
 ",I1>I.)  5* [!2%%C#l++M%@%F%F%H!	#4^#D  &Iu]333r   global_tensorc           	         [        U R                  5       5      n[        U R                  5       5      n[        U5       GH  u  pVUR                  U5      nUR	                  5       (       a  [        [        U5      nUR                  S:  a  [        SU 35      eUR                  n	U	[        U5      :  d   SU	 S[        U5       SU S35       eX9   n
X-  S:X  d   SU
 SU 35       eX-  X9'   [        [        U5      5       H"  nX:w  d  M
  XK   XI   U-  :  d  M  XK   U-  XK'   M$     M  [        U[        [        45      (       a  GM  [        S	[        U5       S
35      e   X44$ )a'  
Compute the local size and stride of a DTensor from the given global tensor info.

For example, if we have a global tensor with size (4, 8, 4) and stride (32, 1, 8).
If the DTensor placements are [Shard(2)] and world_size is 2;
then the local size is (4, 8, 2) and stride is (16, 1, 8).

Args:
    tensor (:class:`torch.Tensor`):
        Global tensor which DTensor will distribute
    mesh (:class:`DeviceMesh`):
        Object which describes the mesh topology
        of devices for the DTensor.
    placements (Sequence[:class:`Placement`]):
        The attribute of the DTensor that describes its layout
        on the mesh topology.

Returns:
    local_shape: A List of int which specifies the size of the local tensor.
    local_stride: A List of int which specifies the stride of the local tensor.
r   zOShard placements should have negative dims normalized in the user-facing APIs: rs   rt   z for placement number .zGlobal dim z not divisible by mesh size zplacement type z not supported!)re   sizestrideru   is_shardr   r   rv   AssertionErrorrw   ranger`   r   r   r#   type)r   rM   rN   r{   local_strideidxrZ   rX   shard_placementr~   global_dim_sizeis               r   compute_local_tensor_infor     s   4 }))+,K,,./L#J/		#"5)4O""Q&$--<,=?  (++Is;// 	{*DSEUDV W((+uA//
 *4O"2a7 o..J=/Z7 &5%EK" 3|,-N$<+B]+RR&2o&FLO . I	7';<<i0AQRR= 0@ $$r   rS   c                   ^ ^^ [        T5      S:w  a  [        S5      e[        T5      TR                  :w  a%  [        S[        T5       STR                   S35      e[	        TS   [
        5      (       a  T $ [	        TS   [        5      (       a  [        U4S j5       nU" T 5      n[        TR                  5       5       Vs/ s H!  n[        R                  " XDR                  S9PM#     nn[        R                  " XdT5        [        UU 4S	 j5       nU" XF5      n[        T 5      n	XTS   R                   '   [        R"                  " U	5      $ [        S
[%        TS   5       S35      es  snf )a  
Compute the global size of a DTensor from the given local tensor shape,
the mesh and placements. Different from `compute_global_tensor_info`,
which assumes sharding is even, this util allgathers local shards' shapes
from all ranks and thus can support uneven sharding.
NOTE: Currently this function only supports 1D mesh.

Args:
    shape (:class:`torch.Size`):
        Shape of the local tensor
    mesh (:class:`DeviceMesh`):
        Object which describes the mesh topology
        of devices for the DTensor.
    placements (Sequence[:class:`Placement`]]):
        The attribute of the DTensor that describes its layout
        on the mesh topology.

Return:
    tensor_shape: Shape of the global DTensor.
r_   z>compute_global_tensor_shape only supports 1 placement for now.z/Expected one placement per mesh dim, but found z placements and z mesh dims.r   c                 T   > [         R                  " [        U 5      TR                  S9$ )Ndevice)rb   rf   re   device_type)rS   rM   s    r   _create_local_shape_tensor?compute_global_tensor_shape.<locals>._create_local_shape_tensorw  s    <<UD4D4DEEr   r   c                    > SnTS   R                   n[        [        T	5      5       Vs/ s H  oDU:w  d  M
  UPM     nnU HE  n[        R                  " X   Xe   5      (       d  [        S5      eUR                  5       nX'U   -  nMG     U$ s  snf )Nr   z?Non-sharded dimensions should have identical size across ranks.)rv   r   rw   rb   equalr#   tolist)
r{   gathered_shaped_tensorssharded_dim_sumr~   d
other_dimsshape_tensorshape_tensor_listrN   rS   s
           r   "_validate_and_compute_global_shapeGcompute_global_tensor_shape.<locals>._validate_and_compute_global_shape  s    O"1))I%*3u:%6I%6y.!%6JI 7{{;#:L<TUU&Y  %1$7$7$9!Y#?? !8 #" Js
   	BBzPlacement type z not supported.)rw   NotImplementedErrorndimr#   r`   r   r   r   r   r   rb   
empty_liker   funcolall_gather_inplacere   rv   Sizer   )
rS   rM   rN   r   r{   _r   r   r   rL   s
   ```       r   compute_global_tensor_shaper   Q  sq   . :!!L
 	
 :$))#Z))9$))KQ
 	

 *Q-++	JqM5	)	)	#	F 
$	F 17 499;'#
' [1C1CD' 	  #
 	!!"9M	#	# 
$	# =
 E{*9Z]&&'zz,''!d:a=12/B
 	
7#
s    (E&op_callargsc                 f   U H  n[        U[        R                  [        45      (       a  UR                  s  $ [        U[
        [        45      (       d  MS  [        U5      S:  d  Md  [        US   [        R                  [        45      (       d  M  US   R                  s  $    [        SU  S35      e)zy
Find the device mesh object from args.
It returns None if no mesh is found.
NOTE: we can optimize this search if needed
r   z+Cannot find device mesh from args for op : r   )	r`   dtensorDTensorr   device_meshre   ry   rw   
ValueError)r   r   args      r   try_find_mesh_from_argsr     s     cGOO[9::??"sT5M**C13q6GOO[#ABBq6%%%  B7)1M
NNr   global_stridec                   ^ ^ S/[        T 5      -  m[        U5       H{  u  p4UR                  5       (       d  M  [        [        U5      R
                  n[        [        T 5      5       H-  nT U   T U   :  d  M  TU==   UR                  U5      -  ss'   M/     M}     [        U U4S j[        [        T 5      5       5       5      $ )z
Compute the stride of a local tensor shard, given the global stride of the DTensor.
NOTE: Currently this function is assuming the DTensor is evenly shardable.
r_   c              3   :   >#    U  H  nTU   TU   -  v   M     g 7fr<   r   ).0r   r   stride_divisorss     r   	<genexpr>'compute_local_stride.<locals>.<genexpr>  s$      8Q1aOA..8Qs   )	rw   ru   r   r   r   rv   r   r   ry   )r   rM   rN   mesh_idxpr   jr   s   `      @r   compute_local_strider     s     cC..O ,::<<UA""A 3}-. #mA&66#A&$))H*==& / -  8=c->P8Q  r   c                 $   [        U [        R                  5      (       a  U $ [        U [        5      (       a  U /nOA[	        U 5      S:X  a'  [        U S   [
        5      (       a  [        U S   5      nO[        U 5      n[        R                  " U5      $ )z
Unify variable types of size argument to torch.Size
Acceptable types include:
    int, Sequence[int], Tuple[int], Tuple[Sequence[int]],
    or torch.Size
r_   r   )r`   rb   r   rd   rw   r   re   )r   
torch_sizes     r   normalize_to_torch_sizer     sn     $

##$V
	TaJtAw99$q']
$Z
::j!!r   )F)?loggingrG   collections.abcr   typingr   r   rb   )torch.distributed._functional_collectivesdistributed_functional_collectivesr   torch.distributed.tensor._apirf   _apir   torch._loggingr   torch._prims_commonr   torch.distributed._local_tensorr   torch.distributed.device_meshr	   *torch.distributed.tensor._collective_utilsr
   &torch.distributed.tensor._dtensor_specr   #torch.distributed.tensor._op_schemar   (torch.distributed.tensor.placement_typesr   r   r   r   r   	getLoggerrB   r0   strr   r   rI   ry   rd   rV   Tensorrl   ro   re   rR   _C#_DTensor_compute_global_tensor_infocompute_global_tensor_infor   r   r   _ops
OpOverloadobjectr   r   r   r   r   r   <module>r      s     $   : : / / % ) F 4 H > 8  
		8	$k kS k7B 7B| 	77
7 #7 	7
 5c?E#s(O+,7t 333 3 }$	3 3 3 3t##$3 3B u||    O4O4O4 9t#O4 #	O4
 O4 5c?E#s(O+,O4d #XXII =%<<=%
=% #=% 49d3i 	=%@H
::H
'H
5=i5HH

ZZH
VOZZ""O*26*:OO*$.<DY<O
38_,"UZZ "r   