
    ȅi$                        % S SK r S SKrS SKrS SKrS SKrS SKJr  S SKJr  S SK	J
r
  S SKJrJr  S SKrS SKJs  Jr  S SKJs  Js  Jr  S SKJr  S SKJr  S SKJrJrJrJr  S S	KJ r   S S
K!J"r"J#r#J$r$J%r%J&r&  S SK'J(r(  \RR                  " \*5      r+Sq,\-S-  \.S'   \ R^                  S'S\-4S jj5       r0 " S S\5      r10 r2\3\4\Rj                  \    \4   S4   \.S'   S\ S\SS4S jr6S(S jr7 " S S5      r8 S)S\S\S\-S-  S\9\1   4S jjr:\
 S)S\S\S\-S-  S\9\1   4S jj5       r;SSS.S \Rx                  S!\S"\S#\-S\-S-  S\Rx                  4S$ jjr= " S% S&\R|                  R~                  5      r@g)*    N)defaultdict)Sequence)cache)cast
NamedTuple)_are_we_tracingone_step_redistribute_cost)DTensorSpec
ShardOrderShardOrderEntry
TensorMeta)
DeviceMesh)_StridedShardPartial	Placement	ReplicateShard)get_active_debug_mode#_FORCE_MIN_COST_REDISTRIBUTION_PLANenabledc              #   8   #    [         nU q  Sv   Uq g! Uq f = f7f)u  
Context manager to control the redistribution planning strategy for DTensor operations.

This context manager allows you to choose between two algorithms for computing the
sequence of collective operations needed to redistribute a DTensor from one placement
to another:

- **Graph-based**: Uses Dijkstra's algorithm to find the minimum-cost path
  through all possible placement transformations. This approach considers the global
  cost of all collective operations and finds the optimal sequence. Best for complex
  redistribution patterns where reducing communication cost and memory overhead is critical.

- **Greedy**: Uses a heuristic approach that makes locally optimal choices
  at each step. This is faster to compute but may not produce the globally optimal
  transformation sequence. Best for simple redistribution patterns or when planning
  speed is more important than optimal communication.

**Default Behavior (without this context manager):**

When this context manager is NOT used, the algorithm selection follows this priority:

1. **Non-default shard orders**
   → Always use graph-based algorithm (required for correctness)

2. **Explicit `use_graph_based_transform` parameter** to `_gen_transform_infos_non_cached`
   → Use the specified algorithm (True = graph-based, False = greedy)

3. **No explicit parameter** (default case)
   → Use greedy algorithm for faster planning

**Behavior with this context manager:**

This context manager overrides the default selection by setting the global flag
`_FORCE_MIN_COST_REDISTRIBUTION_PLAN`, which takes precedence over the explicit
`use_graph_based_transform` parameter (but not over non-default shard order requirements).

**Cache Considerations:**

The redistribution planner caches transform info for performance via the `@cache`
decorator on `_gen_transform_infos`. If you need to change the algorithm selection
for the same input specs, clear the cache using `_gen_transform_infos.cache_clear()`
to ensure the new setting takes effect and doesn't reuse cached results from a
previous run.

Args:
    enabled (bool): If True, forces the use of the graph-based algorithm.
                   If False, forces the use of the greedy algorithm.
                   Default: True
N)r   )r   	old_values     `/home/james-whalen/.local/lib/python3.13/site-packages/torch/distributed/tensor/_redistribute.py use_min_cost_redistribution_planr   ,   s'     j 4I*1'8.7+i+s   	 c                   D    \ rS rSr% \\S'   \\\4   \S'   \\   \S'   Sr	g)_TransformInfoi   mesh_dimsrc_dst_placementslogical_shape N)
__name__
__module____qualname____firstlineno__int__annotations__tupler   list__static_attributes__r"       r   r   r   i   s!    Mi2339r,   r   DTensorRedistributePlanner_planner_cachedevice_meshdtensor_metareturnc                     [        5       (       a  [        X5      $ [        R                  " U 5      U4nU[        ;  a  [        X5      nU[        U'   [        U   $ )a  
Factory function to get or create a DTensorRedistributePlanner instance.
This function provides transparent caching of planner instances based on
device mesh and dtensor meta. Multiple calls with the same parameters
will return the same cached instance for better performance.
Args:
    device_mesh: The device mesh for the planner
    dtensor_meta: TensorMeta of the DTensor to redistribute
Returns:
    A DTensorRedistributePlanner instance (potentially cached)
)r   r-   weakrefrefr.   )r/   r0   	cache_keyplanners       r   get_redistribute_plannerr7   w   sR     )+DD[)<8I&,[G$+y!)$$r,   c                  ,    [         R                  5         g)z8Clear the cache of DTensorRedistributePlanner instances.N)r.   clearr"   r,   r    clear_redistribute_planner_cacher:      s    r,   c                      \ rS rSrSr\R                  " SSS9 " S S5      5       rS r\	S\
\\\   4   S	\4S
 j5       r\	S\S	\
\\\   4   4S j5       r\	 S&S\S\\   S\\S4   S\S-  S	\4
S jj5       rS\S\S	S4S jr  S'S jrS\\S4   S\S	\
S\4   4S jrS\S\S	\S   4S jrSSS\S\\S4   S	\\   4S  jrS!\S"\S\\S4   S	\\   4S# jrS!\S"\S	\\   4S$ jr S%r!g)(r-      ac  
This class is used to plan the collective calls to transform the local shard
of the DTensor from its current spec to the target spec.
Suppose there are N tensor dimensions and M mesh dimensions, the total
possible state size will be (N+2)*M*M!.
Note: Use get_redistribute_planner() factory function instead of direct
instantiation for automatic caching.
T)frozenslotsc                       \ rS rSr% \\S4   \S'   \\S'   \R                  " SSSSS9r
\S-  \S'   S	 rS
 rS rS\4S jrS\4S jrS\S\4S jrSrg)$DTensorRedistributePlanner.DistState   .
placementstensor_dim_to_mesh_dimNF)defaultinitreprcompare_hashc                 X    [         R                  " U R                  U R                  5      $ N)r   format_shard_order_strrB   rC   selfs    r   __str__,DTensorRedistributePlanner.DistState.__str__   s%    55++ r,   c                 "    U R                  5       $ rJ   )rN   rL   s    r   __repr__-DTensorRedistributePlanner.DistState.__repr__   s    <<>!r,   c                 N    [         R                  U SU R                  5       5        g )NrH   )object__setattr___compute_hashrL   s    r   __post_init__2DTensorRedistributePlanner.DistState.__post_init__   s"    ""$r,   r1   c                 T    U R                   b  U R                   $ U R                  5       $ rJ   )rH   rV   rL   s    r   __hash__-DTensorRedistributePlanner.DistState.__hash__   s#    !%!74::QT=O=O=QQr,   c                 D    [        U R                  U R                  45      $ rJ   )hashrB   rC   rL   s    r   rV   2DTensorRedistributePlanner.DistState._compute_hash   s$    OO// r,   otherc                     [        U[        R                  5      (       d  gU R                  UR                  :w  a  gU R                  U R
                  4UR                  UR
                  4:H  $ )NF)
isinstancer-   	DistStaterH   rB   rC   )rM   r_   s     r   __eq__+DTensorRedistributePlanner.DistState.__eq__   sd    e%?%I%IJJzzU[[(++   ,, r,   r"   )r#   r$   r%   r&   r)   r   r(   r   dataclassesfieldrH   r'   rN   rQ   rW   rZ   rV   rT   boolrc   r+   r"   r,   r   rb   r@      sw    )S.)) **'--u5%
sTz 	
		"		Rc 	R	3 		 	4 	r,   rb   c                 j   ^  [        U[        [        -  5      (       a  [        U 4S jU 5       5      $ U$ )z<Convert a nested list structure to a nested tuple structure.c              3   F   >#    U  H  nTR                  U5      v   M     g 7frJ   )	_to_tuple).0itemrM   s     r   	<genexpr>7DTensorRedistributePlanner._to_tuple.<locals>.<genexpr>   s     <!$--!s   !)ra   r*   r)   )rM   xs   ` r   rj   $DTensorRedistributePlanner._to_tuple   s*    a&&<!<<<r,   ro   r1   c                 T    [        S [        U R                  5       5       5       5      $ )zConvert dict to ShardOrderc              3   ^   #    U  H#  u  pU(       d  M  [        U[        U5      S 9v   M%     g7f))
tensor_dim	mesh_dimsN)r   r)   )rk   keyvalues      r   rm   ADTensorRedistributePlanner._dict_to_ShardOrder.<locals>.<genexpr>   s*      
/
 DOseElC/s   --)r)   sorteditems)ro   s    r   _dict_to_ShardOrder.DTensorRedistributePlanner._dict_to_ShardOrder   s)      
$QWWY/
 
 	
r,   c                 x    [        [        5      nU  H$  n[        UR                  5      XR                  '   M&     U$ )z1Convert ShardOrder to dict with tensor dim as key)r   r*   rt   rs   )ro   tensor_mesh_dim_dictentrys      r   _ShardOrder_to_dict.DTensorRedistributePlanner._ShardOrder_to_dict   s7      +40E59%//5J !1!12 ##r,   Nmeshtransform_infossrc_placement.src_shard_orderc                 B   [        U5      U R                  :X  d   eUc  [        R                  " U5      n[	        U5      n[
        R                  U5      n[
        R                  [        U5      U5      nU/nU H  nUR                  u  pU	R                  5       (       a6  U	R                  nX;   a  [        X[   5      S:  d   eX[   R                  5         U
R                  5       (       a2  U
R                  nX;  a  / X\'   X\   R                  UR                  5        XUR                  '   [
        R                  [        U5      [
        R                  U5      5      nUR                  U5        M     SR!                  U Vs/ s H  n[#        U5      PM     sn5      $ s  snf )a  
Generate a string representation of the sequence of state transitions
(placements and shard orders) as described by the given transform_info.

Args:
    mesh: The DeviceMesh used for the redistribution.
    transform_infos: A sequence of _TransformInfo objects describing each
        transformation step.
    src_placement: The initial tuple of Placement objects.
    src_shard_order: (Optional) The initial ShardOrder representing
        the mapping of tensor dimensions to mesh dimensions. If None,
        the default shard order is computed from src_placement and mesh.

Returns:
    A string showing the sequence of DistState transitions, separated by '->'.
r   z->)lenndimr   compute_default_shard_orderr*   r-   r   rb   r)   r    is_sharddimpopappendr   rz   joinstr)r   r   r   r   cur_placementshard_order_dict	cur_state
state_listtransform_infosrc_dim_placementdst_dim_placementsrc_dimdst_dim	new_statess                  r   stringify_transform_infos4DTensorRedistributePlanner.stringify_transform_infos   s   . =!TYY...")EEmTO]+5II
 /88- /
	 

 .N3A3T3T0 ))+++///C8H8Q4RUV4VV )--/ ))+++//202$- )001H1HI5F.1122<<m$*>>?OPI i(% .& yy*5*Q#a&*5665s    Fr/   r0   c                     Xl         UR                  5       (       d   eUc   eX l        [        UR                  5      U l        U R                  5         g)z
Initialize DTensorRedistributePlanner.

Args:
    device_mesh: The device mesh for this planner
    dtensor_meta: TensorMeta of the DTensor to redistribute
N)r/   _is_current_rank_part_of_meshr0   r   shapetensor_dimensionsetup_cost_callbacks)rM   r/   r0   s      r   __init__#DTensorRedistributePlanner.__init__!  sP     '88::::'''( #L$6$6 7!!#r,   c                 ^   ^ ^ S[         R                  S[        4U 4S jjmU4S jnUT l        g)z
Set up the cost function for different collective operations.
Uses communication time estimation based on actual tensor sizes and
mesh topology for accurate cost modeling.
stater1   c                 l   > [        TR                  U R                  TR                  U R                  S9$ )N)r   rB   tensor_metashard_order)r   r/   rB   r0   rC   )r   rM   s    r   state_to_specFDTensorRedistributePlanner.setup_cost_callbacks.<locals>.state_to_spec=  s6     %% ++ --!88	 r,   c                 4   > [        T" U 5      T" U5      5      $ rJ   r	   )	src_state	dst_stater   s     r   cost_functionFDTensorRedistributePlanner.setup_cost_callbacks.<locals>.cost_functionG  s    -i(-	*B r,   N)r-   rb   r   r   )rM   r   r   s   ` @r   r   /DTensorRedistributePlanner.setup_cost_callbacks4  s-    	-77			
 +r,   rB   tensor_mesh_dim_tupler@   c                 x   0 n[         R                  U5      nU R                  U R                  U5      U5      nU H  nUR                  n[        U R                  5       H  nXx:X  a  M
  XG   R                  5       n	XH   R                  U	5        [        U5      n
[        U5      X'   U R                  U R                  U
5      [         R                  U5      5      nU R                  UU5      X;'   XG   R                  U	5        XH   R                  5         M     M     U H  nUR                  nXG   R                  5       n	[        U5      n
[        5       X'   U R                  U R                  U
5      [         R                  U5      5      nXG   R                  U	5        U R                  UU5      X;'   M     [        U5       Hh  u  p[        U[         5      (       d  M  [        U5      n
[        5       X'   U R                  U R                  U
5      U5      nU R                  UU5      X;'   Mj     [        U5       H  u  p[        U[        5      (       d  M  [        U R                  5       H  n[        U5      n
[        U5      X'   XH   R                  U5        U R                  U R                  U
5      [         R                  U5      5      nU R                  UU5      X;'   XH   R                  5         M     M     [        U5       H  u  p[        U[         5      (       d  M  [        U R                  5       H  n[        U5      n
[        U5      X'   XH   R                  U5        U R                  U R                  U
5      [         R                  U5      5      nU R                  UU5      X;'   XH   R                  5         M     M     [        U5       Hh  u  p[        U[        5      (       d  M  [        U5      n
[!        5       X'   U R                  U R                  U
5      U5      nU R                  UU5      X;'   Mj     U$ rJ   )r-   r   rb   rj   rs   ranger   r   r   r*   r   rz   r   r   	enumeratera   r   )rM   rB   r   all_next_stater}   cur_dist_stater~   src_tensor_dimdst_tensor_dimmove_mesh_dimnew_placements
dist_statesrc_mesh_dim	placementr   s                  r   get_next_state)DTensorRedistributePlanner.get_next_stateN  s    ` MO9MM! 
 NN:&!
 +E"--N"'(=(=">!3 !5 D H H J$4;;MJ!%j!105n0E-!^^NN>2.BB,
 .2-?-?".*
 %4;;MJ$488:+ #? +: +E"--N0@DDFM!*-N,5KN)~.*>>?STJ !077F)-););*N& +" (1'<#Li11!*-N+4;N(~.0EJ *.););*N& (= $-Z#8Hi33"'(=(=">!%j!1+0+@($4;;HE!^^NN>2.BB,
 .2-?-?".* %488: #? $9, $-Z#8Hi11"'(=(=">!%j!1+0+@($4;;HE!^^NN>2.BB,
 .2-?-?".* %488: #? $9, $-Z#8Hi33!*-N'.yN$~.0EJ *.););*N& $9 r,   r   r   c                    SSK nSnSXAU/4/n[        5       nU(       a  UR                  U5      u  pxpX:X  a  U
$ X;   a  M)  UR                  U	5        U R	                  U	R
                  U	R                  5      nUR                  5        H.  u  pX;  d  M  X}-   nX/-   nUS-  nUR                  X^XLU45        M0     U(       a  M  [        SU SU 35      e)z
Find the min cost path from src_state to dst_state using Dijkstra's
algorithm.

Args:
    src_state: The source state
    dst_state: The destination state

Returns:
    A list of states representing the min cost path from src_state to
    dst_state
r   N   zNo path found from src_state z to dst_state )
heapqsetheappopaddr   rB   rC   ry   heappushAssertionError)rM   r   r   r   counterpqvisitedcost_current_statepathnext_states
next_statetransition_costnew_costnew_paths                   r   find_min_cost_path-DTensorRedistributePlanner.find_min_cost_path  s     	  i[12 	 %+0==+<(D])'KK&--((-*N*NK 0;/@/@/B+
,#5H#l2HqLGNN2'x'PQ 0C b" +I;nYKP
 	
r,   r   full_tensor_shapec           	      V   [        U5      nUR                   H  nUR                  nUR                  n[	        U5      S:  d   eU H[  nX:X  a  M
  [
        R                  " XF   U R                  R                  US9U R                  R                  U5      5      S   n	XU'   M]     M     U$ )Nr   r   )
r*   rC   rs   rt   r   r   local_shard_size_and_offsetr/   size_sym_get_coordinate)
rM   r   r   r   new_logical_shaper~   rs   rt   mdimnew_sizes
             r   get_logical_shape,DTensorRedistributePlanner.get_logical_shape9  s     !!2355E))JIy>A%%%!# <<%1$$))4)8$$88> 	
 19*- "	 6 ! r,   src_specdst_specc           
         [        S UR                   5       5      (       a.  [        R                  " UR                  UR                  5      u  pEOUR                  nUR
                  n[        S UR                   5       5      (       a.  [        R                  " UR                  UR                  5      u  pgOUR                  nUR
                  nUb  Uc  [        S5      eU R                  XE5      nU R                  Xg5      n	/ n
U R                  X5      n[        R                  " U5       H  u  pUR                  UR                  :w  d  M!  Sn[        [        UR                  UR                  5      5       HR  u  nu  nnUU:w  d  M  US:w  a  [        S5      eUnU R                  XU5      nU
R                  [!        UUU4US95        MT     M     U
$ )Nc              3   B   #    U  H  n[        U[        5      v   M     g 7frJ   ra   r   rk   r   s     r   rm   RDTensorRedistributePlanner.generate_graph_based_transform_infos.<locals>.<genexpr>W        
BUYJy-00BU   c              3   B   #    U  H  n[        U[        5      v   M     g 7frJ   r   r   s     r   rm   r   b  r   r   zRedistribution of _StridedShard placement is only supported for _StridedShard that can be converted to ordered Shard placements. Full _StridedShard redistribution support is not yet implemented.z@Multiple mesh_dims are different between cur_state and nxt_stater   r    r!   )anyrB   r   &_normalize_placements_into_shard_orderr   r   NotImplementedErrorrb   r   	itertoolspairwiser   zipr   r   r   r   )rM   r   r   r   src_placementsr   dst_placementsdst_shard_orderr   r   r   
state_pathr   	nxt_stateupdate_mesh_dimr   r   nxt_placementr!   s                      r   $generate_graph_based_transform_infos?DTensorRedistributePlanner.generate_graph_based_transform_infosO  s     
BJBUBU
 
 
 BB'' ,NO &00N&22O 
BJBUBU
 
 
 BB'' ,NO &00N&22O"o&=%T 
 NN>C	NN>C	02,,YB
$-$6$6z$B I##y';';;"$@I	,,i.B.BCA<H<}m %5*b0"0 b#  +3(,(>(>%1B) (..*)84A=3Q.;A	 %C0 r,   c           	         [        UR                  5      nU/n/ nU R                  R                  S:X  a8  UR	                  [        SUR                  S   UR                  S   4US95        U$ [        UR                  5       H  u  pgXF   n[        U[        5      (       a  X`R                  R                  S-
  :  a  U R                  R                  US9n	UR                  XR                     U	U R                  R                  U5      5      u  p[        U5      nXUR                  '   UR	                  U5        M  M  UR	                  U5        M     [        UR                  5      n[        UR                  5      nUR                  S:  Ga  [        [!        [#        U5      5      5       H  nX   nX   n[        U[        5      (       a  UR                  n/ / nn[        [%        X5      5       H`  u  nu  nnXo:  a    OTUR'                  U5      (       a  UR	                  U5        UR'                  U5      (       d  MO  UR	                  U5        Mb     UU:w  a
  [)        5       nUU:w  d  M  UR	                  [        UUU4XO   S95        UX'   M     [        [%        X5      5       H3  u  nu  nnUU:w  d  M  UR	                  [        UUU4XO   S95        UX'   M5     U$ )ax  
Generate the transform infos from the source placements to the target placements.

To transform from source to target placement it might have multiple steps, i.e. it
might decompose Si -> Sj into Si -> R -> Sj.
This would detect if there're mis-aligned/nested shardings between src/dst placements.
E.g. Suppose the redistribution to perform is (Shard(0), Shard(0)) -> (Replicate(), Shard(0)),
in this case Shard(0) -> Shard(0) for mesh dimension 1 actually needs resharding, because in
the former is a nested-sharding of a tensor already already sharded dimension 0, whereas
the latter is the first sharding on tensor dimension 0.
r   r   r   r   )r*   r   r/   r   r   r   rB   r   ra   r   r   _local_shard_size_and_offsetr   r   
num_shardsreversedr   r   r   r   r   )rM   r   r   initial_logical_shapemesh_dims_to_logical_shaper   isrccurrent_logical_shapemesh_dim_sizelocal_shard_sizer   r   current_placementstarget_placementsr   currenttarget	shard_dimcurrent_mesh_shardingtarget_mesh_shardingr   ps                          r   generate_greedy_transform_infos:DTensorRedistributePlanner.generate_greedy_transform_infos  s   $ !%X^^ 4&;%<"02  A% ""(0(;(;A(>@S@STU@V'W"7 #"
   3 34FA$>$A!#u%%'',,q00$($4$4$9$91$9$EM*-*J*J-gg6%((<<Q?+'$
 )--B(C%1Acgg..556GH 1 +112GH 5( "("5"56 !4!45" %U3/A+B%CD,6*4 fe,, !'

IBDb+?)%..B&	6Aq =!::i00188;::i00077:& -0DD
 "+f$#**&%-07/@*D*N 4:&0G EP ,5"6,
'H'w & &&"!),3V+<&@&J 06",,
 r,   )r   r/   r0   r   rJ   r1   N)"r#   r$   r%   r&   __doc__re   	dataclassrb   rj   staticmethoddictr'   r*   r   rz   r   r   r   r   r)   r   r   r   r   r   r   floatr   r   r   r   r   r  r+   r"   r,   r   r-   r-      s    $d3. . 4.` 
tCcN3 

 
 
 $z $d3S	>.B $ $ 
 .2	6767!.167 Y^,67 $d*	67
 
67 67p$$ !$ 
	$&+	+4u)S.)u  *u 
4e;	<	ur0
"0
/80
	4	50
d!9! ! !c?	!
 
c!,@@ @ !c?	@
 
n	@Duu u 
n		ur,   r   r   use_graph_based_transformc                 z   U R                   nU R                  nUR                  nUb  Uc   e[        S XE4 5       5      (       + nUSL a  SnO[        b  [        nOUc  SnU R                  c   e[        UU R                  5      nU(       a  UR                  XU R                  5      nU$ UR                  X5      nU$ )Nc              3   N   #    U  H  n[         R                  " U5      v   M     g 7frJ   )r   is_default_device_order)rk   orders     r   rm   2_gen_transform_infos_non_cached.<locals>.<genexpr>  s%      $7E 	++E227s   #%TF)	r/   r   allr   r   r7   r   r   r  )	r   r   r  r/   r   r   has_non_default_orderdrpr   s	            r   _gen_transform_infos_non_cachedr  	  s    
 &&K**O**O &?+FFF
 !$ $%7$ ! 
 $$(!	,	8$G!	"	*$)!+++
"C !BB

  ==hQr,   c                     [        XU5      $ rJ   )r  )r   r   r  s      r   _gen_transform_infosr!  1  s     +5 r,   F)async_opr  local_tensorcurrent_spectarget_specr"  c                ~   UR                   UR                   :w  a  [        S5      eU nUR                   nUR                  5       (       d  U $ [        5       (       a  [	        XU5      nO[        XU5      n[        5       nUbR  UR                  U UR                  UR                  [        R                  UUUR                  UR                  5      5      O[        R                  " 5       n	U	   U GH  n
U
R                  nU
R                  u  pUR!                  US9nX:X  a  U nM6  US:X  a  U nM@  UR#                  5       (       a  UR%                  5       (       a$  ['        [(        U5      nUR+                  XU5      nGOUR-                  5       (       a.  ['        [.        U5      nUR1                  XXR2                  5      nGO[5        SU SU S35      eUR-                  5       (       a  ['        [.        U5      nUR%                  5       (       a%  ['        [(        U5      nUR7                  XUU5      nGO8UR#                  5       (       a#  UR9                  XXR;                  U5      5      nGO UR-                  5       (       d
   SU 35       e['        [.        U5      nUR<                  UR<                  :w  a)  UR?                  U UUU
R2                  UR<                  5      nOUR%                  5       (       ax  UR#                  5       (       a#  ['        [(        U5      nURA                  XU5      nO@UR-                  5       (       a  [5        SU SU S35      eX:w  a  [C        S	U S
U S35      eU nU(       d/  [E        U[F        RH                  5      (       a  URK                  5       nUn GM     SSS5        U$ ! , (       d  f       U$ = f)z
This redistribute the local tensor (torch.Tensor) from the current DTensorSpec to
the target DTensorSpec, which involves the necessary collective calls to transform
the local shard of the DTensor from its current spec to the target spec.
z)Cross device mesh comm not supported yet!Nr   r   zredistribute from z to z not supported yetz,Current placement should be shard but found z&Redistribution from one partial type (z) to another (z) is unsupported.)&r   r   r   r   r  r!  r   record_redistribute_callsrB   r-   r   r   
contextlibnullcontextr   r    r   is_replicate
is_partialr   r   _reduce_valuer   r   _to_replicate_tensorr!   RuntimeError_reduce_shard_value_replicate_to_shardr   r   _to_new_shard_dim_partition_valuer   ra   funcolAsyncCollectiveTensorwait)r#  r$  r%  r"  r  new_local_tensorr/   r   
debug_moderedistribute_contextr   r  r  r	  
num_chunkspartial_speccurrent_placementtarget_placement
shard_specs                      r   redistribute_local_tensorr>  <  s    K,,,!"MNN###K4466 9'@
 /'@
 '(J ! 	,,##""&@@''((		
	
 ##%   
-N''A,??OG$))1)5J #/ Q $0 ""$$%%''#'#9L'3'A'A$1($ %%''(,UG(<%'8'M'M$16R6R($ ',WIT&AST  ""#'v#6 %%''#'#9L'3'G'G$16F($ ))++'7'K'K$16U6UVW6X($ #++-- FwiP- "&eW!5J!~~)9)=)==+5+G+G('*88,00,( ""$$''))#'#8L'3'D'D$1($ %%''&,WIT&AST  (,DWI^\b[cctu  (4$
 &">">! ! $4#8#8#: +Lc . 
f g 
	f s   )J:N--
N<c                       \ rS rSr\   SSSS\S\\S4   S\S	\	R                  S-  S
\	R                  S-  4S jj5       r\SS j5       rSrg)Redistributei  Ninputdtensor.DTensorr/   rB   .r"  forward_dtypebackward_dtypec           
         X@l         X`l        UR                  R                  U l        Ubs  XQR                  R                  :w  aZ  UR                  R                  US9n[        UUR                  R                  [        UR                  UR                  5       US9S9nOUR                  nUR                  nXl        UR                  U:w  a  [        X#UR                  S9n	[        XxXS9n
OUn
Un	[        R                   " U
U	UR"                  S9$ )Ndtyper   striderG  r   rB   r   r   r"  requires_grad)r"  rD  _local_tensorrG  original_dtypetor   _specrB   r   r   rI  r$  r   r>  dtensorDTensorrN  )ctxrA  r/   rB   r"  rC  rD  r#  r$  r%  outputs              r   forwardRedistribute.forward  s     +"0066$:M:M:S:S)S ..111FL&  ;;11&++ <<>'L !..L ;;L'""j0%\5M5MK /KF
 "F&K --
 	
r,   c           
         U R                   nU R                  nU R                  =(       d    U R                  nXAR                  R
                  :w  a  UR                  R                  US9n[        UR                  R                  UR                  R                  [        UR                  UR                  5       US9S9n[        UR                  UR                  UR                  S9nOUR                  nUR                  n/ n[        UR                  UR                  5       Hp  u  pUR!                  5       (       d  UR#                  5       (       a0  U	R%                  5       (       a  UR'                  [)        5       5        M_  UR'                  U	5        Mr     [        UR                  [+        U5      UR                  S9n[-        UUUUS9n
U
R
                  U R                  :w  a  U
R                  U R                  5      n
[        UR                  [+        U5      [        UR                  UR                  5       U
R
                  S9S9n[.        R0                  " U
UUR2                  S9nUS S S S S 4$ )NrF  rH  rJ  )rB   r   rL  rK  rM  )r$  r"  rD  rP  rO  rG  rQ  r   rR  r/   rB   r   r   rI  r   r   r   r*  r+  r   r   r)   r>  rS  rT  rN  )rU  grad_outputprevious_specr"  rD  r#  r$  normalized_placementsr  r	  rV  specoutput_dtensors                r   backwardRedistribute.backward  s3   ((<<++As/A/A66<<<&4477n7ML& &&22&,,77&%++&--/(L ("..(33(44M '44L&,,L 24"<#:#:M<T<TUOG  ""g&:&:&<&<&BSBSBUBU%,,Y[9%,,V4	  V $%%23%11
 +	
 <<3---YYs112F%%'("!''"))+ll
 !%33
 
 	
r,   r"   )FNN)rZ  rB  )r#   r$   r%   r&   r  r   r)   r   rg   torchrG  rW  r_  r+   r"   r,   r   r@  r@    s     ,0-13
 !3
  	3

 )S.)3
 3
 {{T)3
 d*3
 3
j Q
 Q
r,   r@  )Tr  rJ   )Ar(  re   r   loggingr3   collectionsr   collections.abcr   	functoolsr   typingr   r   ra  )torch.distributed._functional_collectivesdistributed_functional_collectivesr3  torch.distributed.tensor._apitensor_apirS  r   *torch.distributed.tensor._collective_utilsr
   &torch.distributed.tensor._dtensor_specr   r   r   r   $torch.distributed.tensor.device_meshr   (torch.distributed.tensor.placement_typesr   r   r   r   r   torch.utils._debug_moder   	getLoggerr#   loggerr   rg   r(   contextmanagerr   r   r.   r  r)   ReferenceTyper7   r:   r-   r*   r  r!  Tensorr>  autogradFunctionr@  r"   r,   r   <module>ry     s        # $  #  : : / / E Q  <  : 
		8	$ 48 #TD[ 7 98d 98 98xZ   	'



+Z
78 " %%% "%4
p	 p	l .2%%%  $d{% 
.	%P  .2  $d{ 
.	  -1G,,GG G
 G  $d{G \\GTI
5>>** I
r,   