
    ȅi                        S SK r S SKrS SKrS SKJrJrJrJr  S SKrS SK	J
r  S SKJ
s  Jr  S SKJr  S SKJr  S SKJr  SSKJr   S SKJr   S S	KJr     \\!\"   \!\!\"      \RF                  \\$S\"4   \RJ                  4   r& S r'SS\RP                  S\"S\&S\)4S jjr*SS\RP                  S\)S\&S\)4S jjr+ SS\RP                  S\"S\&S\)S\RP                  4
S jjr, SS\RP                  S\"S\&S\)4S jjr- SS\RP                  S\)S\"S\&S\)4
S jjr. SS\RP                  S\)S\"S\&S\)4
S jjr/ SS\!\RP                     S\)S\&S\)S\!\RP                     4
S jjr0 SS\!\RP                     S\&S\)S\!\RP                     4S jjr1 SS \!\RP                     S\)S\!\"   S\&S\)S\!\RP                     4S! jjr2S" r3 SS\RP                  S#\!\"   S-  S$\!\"   S-  S\&S\)S\RP                  4S% jjr4 SS\RP                  S#\!\"   S-  S$\!\"   S-  S\&S\)S\RP                  4S& jjr5S'\RP                  4S( jr6S) r7\Rp                  Rs                  S*\6\7S+9  S'\RP                  4S, jr:S- r;\Rp                  Rs                  S.\:\;S+9  S'\RP                  4S/ jr<S0 r=\Rp                  Rs                  S1\<\=S+9  S'\RP                  4S2 jr>S3 r?\Rp                  Rs                  S4\>\?S+9  S'\RP                  4S5 jr@S6 rA\Rp                  Rs                  S7\@\AS+9  S8\!\RP                     4S9 jrBS: rC\Rp                  Rs                  S;\B\CS+9  S8\!\RP                     4S< jrDS= rE\Rp                  Rs                  S>\D\ES+9  S8\!\RP                     4S? jrFS@ rG\Rp                  Rs                  SA\F\GS+9   SS\RP                  SB\!\"   S\&S\)S\RP                  4
SC jjrH " SD SE\RP                  5      rI SS\&S\)S\$\)\!\"   \"4   4SF jjrJSS\&S\)S\RJ                  4SG jjrK " SH SI\R                  R                  5      rN\Rp                  R                  SJSKSLSM9SN\RP                  S\RP                  4SO j5       rP\PR                  SN\RP                  S\RP                  4SP j5       rRS'\RP                  4SQ jrSSR rT\PRs                  \S\TS+9  S\U4SS jrVS\RP                  4ST jrW\ R                  SSU\U4SV jj5       rYSW rZSX r[SY r\SZ r]S[ r^S\ r_S] r`S^ raS_ rbS` rcSa rdSb reSc rfSd rgSe rhSf riSg rjSh rkSi rl\Rp                  R                  SjSk5      rn\nR                  Sl\]Sm5        \nR                  Sn\bSm5        \nR                  So\aSm5        \nR                  Sp\dSm5        \nR                  Sq\^Sm5        \nR                  Sr\gSm5        \nR                  Ss\hSm5        \nR                  St\iSm5        \nR                  Su\jSm5        \nR                  Sv\kSm5        \nR                  Sw\lSm5        \nR                  Sx\fSm5        \nR                  Sy\\Sm5        \nR                  Sz\cSm5        \R                  R                  R                  \R                  R                  RN                  R                  5        \R                  R                  R                  \R                  R                  RN                  5        \Rp                  R                  S{S|5      rv\Rp                  R                  S{Sk5      rw/ S}Qrx\R                  \z   r{\x H]  r|\|S \|R                  S~5       r~\" \S\~ 35      r\vGR                  \|\GR                  GR                  S9  \wR                  \~\S5        M_          SS\RP                  S\RP                  S\US\)S\"4
S jjr     SS\RP                  SN\RP                  S\)S\US\"S\)4S jjr\GR                  GR                  S\GR                  GR                  S\GR                  GR                  S\GR                  GR                  S\GR                  GR                  S\GR                  GR                  S\GR                  GR                  S\GR                  GR                  S0r    SS\RP                  S\)S\US\)4S jjr     SS\RP                  SN\RP                  S\)4S jjr   SS\!\RP                     S\RP                  S\)4S jjrS SKJrJrJrJrJ+rJ4rJ.r  \\\\\\\\\\\\\\0rg! \ a
    S SKJr   GNf = f! \ a    \R@                  " S
SS9  S r G	N	f = f)    N)AnycastTYPE_CHECKINGUnion)_maybe_view_chunk_cat)
DeviceMesh)get_proxy_mode   )_functional_collectives_impl)tree_map_only)is_dynamo_compilingzdUnable to import torchdynamo util `is_torchdynamo_compiling`, so won't support torchdynamo correctly   
stacklevelc                      g)NF r       c/home/james-whalen/.local/lib/python3.13/site-packages/torch/distributed/_functional_collectives.pyis_torchdynamo_compilingr      s    r   zdist.tensor.DeviceMeshc                 T    [         R                  R                  R                  U 5      $ )z
Wait on a tensor returned by the collectives ops.

Waiting follows device semantics, which means blocking on CPU and synchronizing streams on CUDA.
)torchops_c10d_functionalwait_tensor)tensors    r   r   r      s     99%%11&99r   selfsrcgrouptagc                     [        X#5      n[        R                  R                  R	                  XU5      n[        U5      $ )a  
Broadcasts the tensor to all processes in the given process group.

Args:
    src (int): Source rank
    group (ProcessGroup or List[int]): The process group to work on.
    tag (str, optional): A unique identifier for the collective. Default: empty string
)_resolve_group_namer   r   r   	broadcast_maybe_wrap_tensor)r   r   r   r   
group_namer   s         r   r"   r"      s5     %U0JYY''11$ZHFf%%r   reduceOpc                     [        X#5      n[        R                  R                  R	                  XR                  5       U5      n[        U5      $ )a  
Reduces the tensor data across all machines in such a way that all get
the final result.

The input tensor is left unmodified.

Group can be one of:
    List[int]: ranks participating in the collective.
    List[List[int]]: 2D mesh of ranks taking part of this collective in MPMD.
    ProcessGroup: Will perform a collective using the ranks and tag of the PG.
    DeviceMesh: Do a SPMD collective over all ranks of the mesh
    (DeviceMesh, int): Do a MPMD collective over one dimension of the DeviceMesh

:: N.B. If you pass a PG or a 1D list to perform a MPMD collective, the compiler won't be able to recover
that information and perform collective algebraic optimization. Use other forms of input for that.
)r!   r   r   r   
all_reducelowerr#   )r   r%   r   r   r$   r   s         r   r'   r'      s<    " %U0JYY''2249I:VFf%%r   
gather_dimreturnc                 `   U R                  5       (       d  [        S5      e[        X#5      n[        R                  " U5      n[
        R                  R                  R                  XU5      n[        U5      nUS:w  a1  [        U[        5      (       a  UR                  5       n[        XuU5      nU$ )a  
Gather tensor data across from all machines and concatenate over ``gather_dim``.

Note that it currently only supports gather_dim = 0.

The input tensor is left unmodified.
Group can be one of:
    List[int]: ranks participating in the collective.
    List[List[int]]: 2D mesh of ranks taking part of this collective in MPMD.
    ProcessGroup: Will perform a collective using the ranks and tag of the PG.
    DeviceMesh: Do a SPMD collective over all ranks of the mesh
    (DeviceMesh, int): Do a MPMD collective over one dimension of the DeviceMesh

:: N.B. If you pass a PG or a 1D list to perform a MPMD collective, the compiler won't be able to recover
that information and perform collective algebraic optimization. Use other forms of input for that.
z/Tensor must be contiguous for all_gather_tensorr   )is_contiguousAssertionErrorr!   c10d_get_group_size_by_namer   r   r   all_gather_into_tensorr#   
isinstanceAsyncCollectiveTensorwaitr   r   r)   r   r   r$   
group_sizer   ress           r   all_gather_tensorr7      s    , NOO$U0J--j9JYY''>>*F V
$CQ c011((*C#CZ@Jr   c                 l   [        X#5      n[        R                  " U5      n[        R                  R
                  R                  XU5      n[        R                  U5      nUS:w  aM  [        U[        5      (       a  UR                  5       n[        R                  " [        R                  " XuSS9US9nU$ )a$  
Gather tensor data across from all machines and concatenate over ``gather_dim``.

Note that it currently only supports gather_dim = 0.

This function is the same as all_gather_tensor but will propagate the
backwards gradient across workers.

See all_gather_tensor for more details on usage.
r   dim)r!   r.   r/   r   r   _c10d_functional_autogradr0   _FromTorchTensorapplyr1   r2   r3   catchunkr4   s           r   all_gather_tensor_autogradr@      s      %U0J--j9JYY00GG*F 
 
 
(CQ c011((*CiiC;LJr   scatter_dimc                    [        X45      n[        R                  " U5      nU R                  U5      U-  S:w  a!  [	        SU R                  S5       SU S35      eUS:w  a+  [
        R                  " XUS9n[
        R                  " U5      n [
        R                  R                  R                  U UR                  5       UU5      n[        U5      n	U	$ )a  
Reduces the tensor data across all machines in such a way that all get
the final result, then scatter the results to corresponding ranks.


The input tensor is left unmodified.
Group can be one of:
    List[int]: ranks participating in the collective.
    List[List[int]]: 2D mesh of ranks taking part of this collective in MPMD.
    ProcessGroup: Will perform a collective using the ranks and tag of the PG.
    DeviceMesh: Do a SPMD collective over all ranks of the mesh
    (DeviceMesh, int): Do a MPMD collective over one dimension of the DeviceMesh
:: N.B. If you pass a PG or a 1D list to perform a MPMD collective, the compiler won't be able to recover
that information and perform collective algebraic optimization. Use other forms of input for that.
r   input dimension 0 (" must be a multiple of group_size )r9   )r!   r.   r/   sizer-   r   r?   r>   r   r   reduce_scatter_tensorr(   r#   
r   r%   rA   r   r   r$   r5   tensor_listr   r6   s
             r   rG   rG      s    , %U0J--j9Jyy
*a/!$))A,/QR\Q]]^_
 	
 akk$Dyy%YY''==	F V
$CJr   c                    [        X45      n[        R                  " U5      nU R                  U5      U-  S:w  a   [	        SU R                  S5       SU 35      eUS:w  a+  [
        R                  " XUS9n[
        R                  " U5      n [
        R                  R                  R                  U UR                  5       UU5      n[        R                  U5      n	U	$ )a`  
Reduces the tensor data across all machines in such a way that all get
the final result, then scatter the results to corresponding ranks.

This function is the same as reduce_scatter_tensor but will propagate the
backwards gradient across workers.

Currently only the "sum" reduceOp is supported.

See reduce_scatter_tensor for more details on usage.
r   rC   rD   r9   )r!   r.   r/   rF   r-   r   r?   r>   r   r;   rG   r(   r<   r=   rH   s
             r   reduce_scatter_tensor_autogradrK   %  s    & %U0J--j9Jyy
*a/!$))A,/QR\Q]^
 	
 akk$Dyy%YY00FF	F 
 
 
(CJr   c                     [        X#5      n[        R                  R                  R	                  U UR                  5       U5      n[        [        [        U5      5      $ )a  
Reduces a list of tensors across all machines in such a way that all get
the final result.

The all tensors in the input list are left unmodified.

Group can be one of:
    List[int]: ranks participating in the collective.
    List[List[int]]: 2D mesh of ranks taking part of this collective in MPMD.
    ProcessGroup: Will perform a collective using the ranks and tag of the PG.
    DeviceMesh: Do a SPMD collective over all ranks of the mesh
    (DeviceMesh, int): Do a MPMD collective over one dimension of the DeviceMesh

:: N.B. If you pass a PG or a 1D list to perform a MPMD collective, the compiler won't be able to recover
that information and perform collective algebraic optimization. Use other forms of input for that.
)	r!   r   r   r   all_reduce_coalescedr(   listmapr#   )r   r%   r   r   r$   rI   s         r   rM   rM   M  sM    & %U0J)),,AAK
 &455r   c                     [        X5      n[        R                  " U5      n[        R                  R
                  R                  U UU5      n[        [        [        U5      5      $ )a  
Gather a list of tensors across from all machines.

Note that it currently only supports gather_dim = 0.

The input tensor is left unmodified.
Group can be one of:
    List[int]: ranks participating in the collective.
    List[List[int]]: 2D mesh of ranks taking part of this collective in MPMD.
    ProcessGroup: Will perform a collective using the ranks and tag of the PG.
    DeviceMesh: Do a SPMD collective over all ranks of the mesh
    (DeviceMesh, int): Do a MPMD collective over one dimension of the DeviceMesh

:: N.B. If you pass a PG or a 1D list to perform a MPMD collective, the compiler won't be able to recover
that information and perform collective algebraic optimization. Use other forms of input for that.
)
r!   r.   r/   r   r   r    all_gather_into_tensor_coalescedrN   rO   r#   )r   r   r   r$   r5   rI   s         r   rQ   rQ   i  sV    & %U0J--j9J)),,MMK
 &455r   inputsc                    [        X45      n[        R                  " U5      n[        U5      [        U 5      :w  a$  [	        S[        U5       S[        U 5       S35      e[        [        X 5      5       H{  u  nu  pU	R                  U5      U-  S:w  a&  [	        SU SU	R                  U5       SU SU 35      eUS:w  d  MN  [        R                  " XUS	9n
[        R                  " U
5      X'   M}     [        R                  R                  R                  U UR                  5       UU5      n
[        [!        ["        U
5      5      $ )
a  
Reduces a list of tensors across all machines in such a way that all get
the final result, then scatter the results to corresponding ranks.

The input tensors are left unmodified.
Group can be one of:
    List[int]: ranks participating in the collective.
    List[List[int]]: 2D mesh of ranks taking part of this collective in MPMD.
    ProcessGroup: Will perform a collective using the ranks and tag of the PG.
    DeviceMesh: Do a SPMD collective over all ranks of the mesh
    (DeviceMesh, int): Do a MPMD collective over one dimension of the DeviceMesh

:: N.B. If you pass a PG or a 1D list to perform a MPMD collective, the compiler won't be able to recover
that information and perform collective algebraic optimization. Use other forms of input for that.
zLength of scatter_dim (z) must equal length of inputs (rE   r   zinput dimension z (rD   z for tensor at index r9   )r!   r.   r/   lenr-   	enumerateziprF   r   r?   r>   r   r   reduce_scatter_tensor_coalescedr(   rN   rO   r#   )rR   r%   rA   r   r   r$   r5   idxr:   r   rI   s              r   rW   rW     sC   , %U0J--j9J
;3v;&%c+&6%77VWZ[aWbVccde
 	
 (K(@A]c;;sj(A- "3%r&++c*:);;]^h]ii~  @C  D  E  !8++fcBK))K0FK B )),,LL	K &455r   c                    [        U [        R                  R                  5      (       d  [	        S[        U 5       35      e[        R                  R                  U R                  5       [        R                  R                  5      (       a  gU R                  n[        UR                  5      S:  a?  UR                  S   nUR                  S L=(       a    UR                  R                  (       + $ g )Nz$Expected torch._ops.OpOverload, got Fr   )r1   r   _ops
OpOverloadr-   type_C%_dispatch_has_kernel_for_dispatch_keynameDispatchKeyCompositeImplicitAutograd_schemarT   	arguments
alias_infois_write)tgtschema	first_args      r   _is_view_opri     s    c5::0011CDI;OPP xx55
E%%??  [[F
6q $$Q'	##4/U	8L8L8U8U4UU !r   output_split_sizesinput_split_sizesc                    Ub%  [        S U 5       5      (       d  [        SU 35      eUb%  [        S U 5       5      (       d  [        SU 35      e[        X45      n[        R                  " U5      nUb  Uc)  Uc  Ub  [        S5      eU R
                  S   U-  /U-  nUn[        R                  R                  R                  U UUU5      n[        U5      $ )a  
Each process splits input tensor and then scatters the split list
to all processes in a group. Then concatenate the received tensors from all
the processes in the group and return single output tensor.

Group can be one of:
    List[int]: ranks participating in the collective.
    List[List[int]]: 2D mesh of ranks taking part of this collective in MPMD.
    ProcessGroup: Will perform a collective using the ranks and tag of the PG.
    DeviceMesh: Do a SPMD collective over all ranks of the mesh
    (DeviceMesh, int): Do a MPMD collective over one dimension of the DeviceMesh

:: N.B. If you pass a PG or a 1D list to perform a MPMD collective, the compiler won't be able to recover
that information and perform collective algebraic optimization. Use other forms of input for that.
c              3   b   #    U  H%  n[        U[        [        R                  45      v   M'     g 7fNr1   intr   SymInt.0rF   s     r   	<genexpr>$all_to_all_single.<locals>.<genexpr>  '      
>PdJtc5<<011>P   -/2All output_split_sizes must be int or SymInt, got c              3   b   #    U  H%  n[        U[        [        R                  45      v   M'     g 7frn   ro   rr   s     r   rt   ru     %     WEVT:dS%,,$788EVrw   1All input_split_sizes must be int or SymInt, got ^output_split_sizes and input_split_sizes must either be specified together or both set to Noner   )allr-   r!   r.   r/   shaper   r   r   all_to_all_singler#   r   rj   rk   r   r   r$   r5   r   s           r   r   r     s   , % 
>P
 
 
 !DEWDXY  $WEVWWW CDUCVW  %U0J--j9J!%6%>"*/@/H 9  #jjmz9:ZG.YY''99	F f%%r   c                    Ub%  [        S U 5       5      (       d  [        SU 35      eUb%  [        S U 5       5      (       d  [        SU 35      e[        X45      n[        R                  " U5      nUb  Uc)  Uc  Ub  [        S5      eU R
                  S   U-  /U-  nUn[        R                  R                  R                  U UUU5      n[        R                  U5      $ )z2
Same as all_to_all_single but supports autograd.
c              3   b   #    U  H%  n[        U[        [        R                  45      v   M'     g 7frn   ro   rr   s     r   rt   -all_to_all_single_autograd.<locals>.<genexpr>  rv   rw   rx   c              3   b   #    U  H%  n[        U[        [        R                  45      v   M'     g 7frn   ro   rr   s     r   rt   r     rz   rw   r{   r|   r   )r}   r-   r!   r.   r/   r~   r   r   r;   r   r<   r=   r   s           r   all_to_all_single_autogradr     s    % 
>P
 
 
 !DEWDXY  $WEVWWW CDUCVW  %U0J--j9J!%6%>"*/@/H 9  #jjmz9:ZG.YY00BB	F !!&))r   grad_outputc                     U$ )z
Backward for wait_tensor: identity (no-op).
Wait is just a synchronization primitive, so gradient flows through unchanged.

Args:
    ctx: Context object
    grad_output: Gradient from downstream operations

Returns:
    Gradient unchanged (identity)
r   ctxr   s     r   wait_tensor_backwardr   ,  s
     r   c                     g)z
Setup context for wait_tensor backward.
Args:
    ctx: Context object to save state for backward
    inputs: Tuple of (tensor,)
    output: Output from forward pass
Nr   r   rR   outputs      r   wait_tensor_setup_contextr   ;  s     r   z_c10d_functional::wait_tensor)setup_contextc                     U R                   nU R                  nUS:w  a  [        SU S35      e[        R                  R
                  R                  UR                  5       X25      n[        U5      SS4$ )aS  
Backward for all_reduce: all_reduce with same reduce_op.
Forward aggregates tensors, backward aggregates gradients.

Args:
    ctx: Context object
    grad_output: Gradient from downstream operations

Returns:
    Tuple of (grad_input, grad_group_name, grad_reduce_op)
    grad_group_name and grad_reduce_op are None (not differentiable)
sumz8all_reduce backward only supports 'sum' reduction, got ''N)	r$   	reduce_opRuntimeErrorr   r   r   r'   
contiguousr   )r   r   r$   r   r   s        r   all_reduce_backwardr   M  sv     JIEFykQRS
 	

 YY''22 )F vd**r   c                 D    Uu  p4nXPl         UR                  5       U l        g)z
Setup context for all_reduce backward.
Args:
    ctx: Context object to save state for backward
    inputs: Tuple of (input, reduce_op, group_name)
    output: Output from forward pass
Nr$   r(   r   )r   rR   r   inputr   r$   s         r   all_reduce_setup_contextr   i  s!     $* EjNOO%CMr   z_c10d_functional::all_reducec                     U R                   nU R                  n[        R                  R                  R                  UR                  5       SUU5      n[        U5      SS4$ )a  
Backward for all_gather_into_tensor: reduce_scatter with sum.

Forward gathers tensors from all ranks, backward scatters gradients back
with sum reduction.

Args:
    ctx: Context object with group_name and group_size
    grad_output: Gradient from downstream operations

Returns:
    Tuple of (grad_input, grad_group_size, grad_group_name)
    grad_group_size and grad_group_name are None (not differentiable)
r   N)r$   r5   r   r   r   rG   r   r   )r   r   r$   r5   r   s        r   all_gather_into_tensor_backwardr   }  s[     JJ YY''== 	F vd**r   c                 &    Uu  p4nXPl         X@l        g)z
Setup context for all_gather_into_tensor backward.

Args:
    ctx: Context object to save state for backward
    inputs: Tuple of (input, group_size, group_name)
    output: Output from forward pass
Nr$   r5   )r   rR   r   r   r5   r$   s         r   $all_gather_into_tensor_setup_contextr     s     %+!EzNNr   z(_c10d_functional::all_gather_into_tensorc                    U R                   nU R                  nU R                  nUS:w  a  [        SU S35      e[        R
                  R                  R                  UR                  5       UU5      n[        U5      SSS4$ )a  
Backward for reduce_scatter_tensor: all_gather.

Forward reduces and scatters tensors to ranks, backward gathers gradients
from all ranks.

Args:
    ctx: Context object with group_name, group_size, and reduce_op
    grad_output: Gradient from downstream operations

Returns:
    Tuple of (grad_input, grad_reduce_op, grad_group_size, grad_group_name)
    grad_reduce_op, grad_group_size, grad_group_name are None (not differentiable)
r   zCreduce_scatter_tensor backward only supports 'sum' reduction, got 'r   N)
r$   r5   r   r   r   r   r   r0   r   r   )r   r   r$   r5   r   r   s         r   reduce_scatter_tensor_backwardr     s     JJI EQR[Q\\]^
 	

 YY''>> F
 vdD00r   c                 P    Uu  p4pVX`l         XPl        UR                  5       U l        g)z
Setup context for reduce_scatter_tensor backward.

Args:
    ctx: Context object to save state for backward
    inputs: Tuple of (input, reduce_op, group_size, group_name)
    output: Output from forward pass
Nr$   r5   r(   r   )r   rR   r   r   r   r5   r$   s          r   #reduce_scatter_tensor_setup_contextr     s&     06,EjNNOO%CMr   z'_c10d_functional::reduce_scatter_tensorc                     U R                   nU R                  nU R                  n[        R                  R
                  R                  UR                  5       UUU5      n[        U5      SSS4$ )a  
Backward for all_to_all_single: all_to_all with reversed split sizes.

Forward does all-to-all with specified split sizes, backward reverses them.

Args:
    ctx: Context object with group_name, output_split_sizes, and input_split_sizes
    grad_output: Gradient from downstream operations

Returns:
    Tuple of (grad_input, grad_output_split_sizes, grad_input_split_sizes, grad_group_name)
    All except grad_input are None (not differentiable)
N)	r$   rj   rk   r   r   r   r   r   r   )r   r   r$   rj   rk   r   s         r   all_to_all_single_backwardr     sl     J//-- YY''99 	F vdD00r   c                 2    Uu  p4pVX`l         X@l        XPl        g)z
Setup context for all_to_all_single backward.

Args:
    ctx: Context object to save state for backward
    inputs: Tuple of (input, output_split_sizes, input_split_sizes, group_name)
    output: Output from forward pass
N)r$   rj   rk   )r   rR   r   r   rj   rk   r$   s          r   all_to_all_single_setup_contextr     s"     @F<E0N/-r   z#_c10d_functional::all_to_all_singlegrad_outputsc                 0   U R                   nU R                  nUS:w  a  [        SU S35      e[        R                  R
                  R                  U Vs/ s H  oDR                  5       PM     snUU5      n[        [        [        U5      5      SS4$ s  snf )a  
Backward for all_reduce_coalesced: all_reduce each gradient.

Forward aggregates tensors, backward aggregates gradients.

Args:
    ctx: Context object with group_name and reduce_op
    grad_outputs: Gradients from downstream operations (one per input tensor)

Returns:
    Tuple of (grad_inputs..., grad_reduce_op, grad_group_name)
    grad_reduce_op and grad_group_name are None (not differentiable)
r   zBall_reduce_coalesced backward only supports 'sum' reduction, got 'r   N)r$   r   r   r   r   r   rM   r   rN   rO   r   )r   r   r$   r   r   grad_inputss         r   all_reduce_coalesced_backwardr     s     JIEPQZP[[\]
 	

 )),,AA5AB\k			!\BK
 [+./t<<	 	Cs   Bc                 D    Uu  p4nXPl         UR                  5       U l        g)z
Setup context for all_reduce_coalesced backward.

Args:
    ctx: Context object to save state for backward
    inputs: Tuple of (tensor_list, reduce_op, group_name)
    output: Output from forward pass
Nr   )r   rR   r   rI   r   r$   s         r   "all_reduce_coalesced_setup_contextr   7  s!     *0&KJNOO%CMr   z&_c10d_functional::all_reduce_coalescedc                    U R                   nU R                  n[        R                  R                  R                  U Vs/ s H  oDR                  5       PM     snSUU5      n[        [        [        U5      5      SS4$ s  snf )a  
Backward for all_gather_into_tensor_coalesced: reduce_scatter each gradient.

Forward gathers tensors from all ranks, backward scatters gradients back
with sum reduction.

Args:
    ctx: Context object with group_name and group_size
    grad_outputs: Gradients from downstream operations (one per input tensor)

Returns:
    Tuple of (grad_inputs..., grad_group_size, grad_group_name)
    grad_group_size and grad_group_name are None (not differentiable)
r   N)
r$   r5   r   r   r   rW   r   rN   rO   r   )r   r   r$   r5   r   r   s         r   )all_gather_into_tensor_coalesced_backwardr   L  su     JJ )),,LL5AB\k			!\B	K [+./t<< 	Cs    A?c                 &    Uu  p4nXPl         X@l        g)z
Setup context for all_gather_into_tensor_coalesced backward.

Args:
    ctx: Context object to save state for backward
    inputs: Tuple of (tensor_list, group_size, group_name)
    output: Output from forward pass
Nr   )r   rR   r   rI   r5   r$   s         r   .all_gather_into_tensor_coalesced_setup_contextr   h  s     +1'KZNNr   z2_c10d_functional::all_gather_into_tensor_coalescedc                 J   U R                   nU R                  nU R                  nUS:w  a  [        SU S35      e[        R
                  R                  R                  U Vs/ s H  oUR                  5       PM     snUU5      n[        [        [        U5      5      SSS4$ s  snf )a  
Backward for reduce_scatter_tensor_coalesced: all_gather each gradient.

Forward reduces and scatters tensors to ranks, backward gathers gradients
from all ranks.

Args:
    ctx: Context object with group_name, group_size, and reduce_op
    grad_outputs: Gradients from downstream operations (one per input tensor)

Returns:
    Tuple of (grad_inputs..., grad_reduce_op, grad_group_size, grad_group_name)
    grad_reduce_op, grad_group_size, grad_group_name are None (not differentiable)
r   zMreduce_scatter_tensor_coalesced backward only supports 'sum' reduction, got 'r   N)r$   r5   r   r   r   r   r   rQ   r   rN   rO   r   )r   r   r$   r5   r   r   r   s          r   (reduce_scatter_tensor_coalesced_backwardr   }  s     JJI E[\e[ffgh
 	

 )),,MM5AB\k			!\BK
 [+./tTBB	 	Cs   !B c                 P    Uu  p4pVX`l         XPl        UR                  5       U l        g)z
Setup context for reduce_scatter_tensor_coalesced backward.

Args:
    ctx: Context object to save state for backward
    inputs: Tuple of (tensor_list, reduce_op, group_size, group_name)
    output: Output from forward pass
Nr   )r   rR   r   rI   r   r5   r$   s          r   -reduce_scatter_tensor_coalesced_setup_contextr     s&     6<2KJNNOO%CMr   z1_c10d_functional::reduce_scatter_tensor_coalescedsrc_dstc                 Z   [        X#5      u  pEn[        R                  " XEU5      nS/U-  nS/U-  n	[        U5       H_  u  pU
[        R
                  " U5      :X  a  U R                  5       X'   U[        R
                  " U5      :X  d  MM  U R                  5       X'   Ma     [        XXU5      $ )a  
Permutes the elements of the tensor according to the given source/destination pairs. `src_dst` should
be defined such that src_dst[m] == n means m sends to n.

Group can be one of:
    List[int]: ranks participating in the collective.
    List[List[int]]: 2D mesh of ranks taking part of this collective in MPMD.
    ProcessGroup: Will perform a collective using the ranks and tag of the PG.
    DeviceMesh: Do a SPMD collective over all ranks of the mesh
    (DeviceMesh, int): Do a MPMD collective over one
r   )_expand_groupr.   #_find_or_create_pg_by_ranks_and_tagrU   distget_ranknumelr   )r   r   r   r   tranksetr5   local_pgrj   rk   r   dsts               r   permute_tensorr     s    " +56A
77JOHz)j(g&$--))%)ZZ\"$--))&*jjl#	 ' T7HQTUUr   c                      \ rS rSr% Sr\R                  \S'   \\S'   SS/r	\
S\R                  4S j5       rS rS r\
S 5       r SS
\S\S	-  4S jjrS\4S jrS rS\R                  4S jrS r\SS j5       rS rSrg	)r2   i  a  
A Tensor wrapper subclass that is used to trigger a call to wait
prior to first use of the underlying tensor.
Use it inside functional collective pytorch wrappers like the following:
def functional_collective(self, group, tag):
    tag, rankset, group_size = _expand_group(group, tag)
    tensor = torch.ops.c10d_functional.{collective}(self, tag, rankset, group_size)
    return _maybe_wrap_tensor(tensor)
elem	completedc                    [         R                  R                  U UR                  5       UR	                  5       UR                  5       UR                  UR                  UR                  UR                  S9nXl
        SUl        U$ )N)stridesstorage_offsetdtypelayoutdevicerequires_gradF)r   Tensor_make_wrapper_subclassrF   strider   r   r   r   r   r   r   )clsr   rs      r   __new__AsyncCollectiveTensor.__new__  sm    LL//IIKKKM..0**;;;;,, 0 	
 r   c                     S/S 4$ )Nr   r   r   s    r   __tensor_flatten__(AsyncCollectiveTensor.__tensor_flatten__  s    x~r   c                 >    U R                  5       R                  5       $ rn   )trigger_waittolistr   s    r   r   AsyncCollectiveTensor.tolist  s      "))++r   c                 >    Ub  [        S5      eU S   n[        U5      $ )Nz5meta must be None for AsyncCollectiveTensor unflattenr   )r-   r2   )inner_tensorsmeta
outer_sizeouter_strider   s        r   __tensor_unflatten__*AsyncCollectiveTensor.__tensor_unflatten__  s/     G  V$$T**r   Nexpected_metadataexpected_typec                 J    U[         R                  La  g U R                  5       $ rn   )r   r   r   )r   r   r   s      r   #__coerce_same_metadata_as_tangent__9AsyncCollectiveTensor.__coerce_same_metadata_as_tangent__  s"     ,  ""r   r*   c                 *    SU R                  5        S3$ )NzAsyncCollectiveTensor(rE   )r   r   s    r   __repr__AsyncCollectiveTensor.__repr__  s    '(9(9(;'<A>>r   c                 x    U R                   (       d  [        U R                  5      nSU l         U$ U R                  $ NT)r   r   r   )r   outs     r   r   "AsyncCollectiveTensor.trigger_wait  s-    ~~dii(C!DNJ99r   c                 ,    [        U R                  5      $ rn   )r   r   r   s    r   r3   AsyncCollectiveTensor.wait  s    499%%r   c                     U R                   $ )zOThis method enables  _functional_collectives_impl to test if a tensor is an ACS)r   r   s    r   _get_acs_underlying_tensor0AsyncCollectiveTensor._get_acs_underlying_tensor  s    yyr   c                   ^ U[         R                  R                  R                  R                  L a&  U" US   R
                  US   5      n[        U5      nU$ [        U5      mS[        4U4S jjnS[         R                  4S jn[        [        Xs5      n	[        [        Xt5      n
U" U	0 U
D6nT(       a  [        [         R                  X5      nU$ )Nr   r
   ec                 J   > T(       d  U R                  5       $ U R                  $ rn   )r   r   )r   
is_view_ops    r   unwrap8AsyncCollectiveTensor.__torch_dispatch__.<locals>.unwrap)  s    ~~''66Mr   c                 \    [        U [        5      (       a  [        S5      e[        U 5      nU$ )NzICannot wrap an AsyncCollectiveTensor inside another AsyncCollectiveTensor)r1   r2   r-   )r   r6   s     r   wrap6AsyncCollectiveTensor.__torch_dispatch__.<locals>.wrap/  s1    !233$_  (*CJr   )
r   r   atenviewdefaultr   r2   ri   r   r   )r   functypesargskwargsr6   wrapper_resr   r   unwrapped_argsunwrapped_kwargsr   r   s               @r   __torch_dispatch__(AsyncCollectiveTensor.__torch_dispatch__  s    599>>&&... tAw||T!W-C/4K &
	+ 		ELL 	 ''<fK()>O N7&67 d8C
r   c                 >    U R                  5       R                  5       $ rn   )r3   numpyr   s    r   r
  AsyncCollectiveTensor.numpyD  s    yy{  ""r   )r   rn   )r   N)__name__
__module____qualname____firstlineno____doc__r   r   __annotations__bool	__slots__staticmethodr   r   r   r   r   r\   r   strr   r   r3   r   classmethodr  r
  __static_attributes__r   r   r   r2   r2     s     ,,O%I5<<  , + + DH#!$#59D[#?# ?&ell & $ $L#r   r2   c           	         [         (       a  S nS nOS nS n[        U [        5      (       a  [        U S   [        5      (       ab  U" U 5      n/ nSnU HN  nUR                  U5        US:w  a)  U[	        U5      :w  a  [        SU S[	        U5       35      e[	        U5      nMP     GOU" U 5      n[	        U5      nGO[        U [        R                  5      (       aB  [        R                  " U 5      n[	        U5      nU=(       d    [        R                  " U 5      nGOG[        U [        5      (       al  U R                  S	:w  a  [        S
5      eU R                  5       n[        R                  " U5      n[	        U5      nU=(       d    [        R                  " U5      nO[        U [        5      (       a  [	        U 5      S:X  a  [        U S   [        5      (       at  [        U S	   [         5      (       a\  U S   n	U S	   n
U	R                  U
5      n[        R                  " U5      n[	        U5      nU=(       d    [        R                  " U5      nO[        S5      e[        S5      eXU4$ )a%  
_expand_group desugars the different RANK_TYPES types into a canonical format that is traceable.

By having this be part of the explicit eager codepath, we avoid having to specialize behavior inside
torchdynamo and can still interoperate with processgroup objects or other untraceable forms.
c                 >    [        [        [        [              U 5      $ rn   r   rN   rp   xs    r   cast_listlistint'_expand_group.<locals>.cast_listlistintY  s    T#Y++r   c                 0    [        [        [           U 5      $ rn   r  r  s    r   cast_listint#_expand_group.<locals>.cast_listint\  s    S	1%%r   c                     U $ rn   r   r  s    r   r  r  c      Hr   c                     U $ rn   r   r  s    r   r   r!  f  r#  r   r   z$group sizes must be identical found z and r
   JOnly 1D mesh is supported, pass in (DeviceMesh, int) together if mesh > 1Dr   1Invalid tuple for group must be (DeviceMesh, int)z[Invalid type for group, must be one of List, Processgroup, DeviceMesh or (DeviceMesh, int).)r   r1   rN   extendrT   
ValueErrorr   ProcessGroupget_process_group_ranksr.   _get_group_tagr   ndimr-   	get_grouptuplerp   )r   r   r  r   nested_listr   r5   rspgdmeshr:   s              r   r   r   M  s    }	,	&		 %eAh%%*51KGJ!r"#
c"g(=$>zl%PSTVPWyY  !W
 " #5)GWJ	E4,,	-	-..u5\
/T((/	E:	&	&::? \  __..r2\
,T((,	E5	!	!J!O58Z0058S))!HE(C%B2226GWJ0,,R0CPQQi
 	
 *%%r   c                 ,   [        U [        R                  5      (       a  U R                  $ [        U [        5      (       a  [        [        R                  U 5      $ [        U [        5      (       a*  U R                  S:w  a  [        S5      eU R                  S   $ [        U [        5      (       ac  [        U 5      S:X  aI  [        U S   [        5      (       a1  [        U S   [        5      (       a  U S   nU S   nUR                  U   $ [        S5      e[        U [         5      (       aU  [#        5       (       d  [$        R&                  " S[(        SS9  [        R*                  " [        [         [           U 5      U5      $ [        S	[-        U 5       S
U  35      e)z3
Given group in RANK_TYPES, return the group name.
r
   r&  r   r   r'  zThe combination of ranks + tag as process group identifier has been deprecated. Please switch to using ProcessGroup, DeviceMesh, or group name instead.   r   zUnsupported group type: z, )r1   r   r*  r$   r  r   r.   	GroupNamer   r-  r-   _dim_group_namesr/  rT   rp   r)  rN   r   warningswarnFutureWarning$_resolve_group_name_by_ranks_and_tagr\   )r   r   r3  r:   s       r   r!   r!     sU    %**++	E3		
 DNNE**	E:	&	&::? \  %%a((	E5	!	!J!O58Z0058S))!HE(C))#..PQQ	E4	 	 '))MMI  88d3i9OQTUU3DK=5'JKKr   c                       \ rS rSrSr\S\R                  S\R                  4S j5       r\S\R                  S\R                  4S j5       r	Sr
g	)
r<   i  za
_FromTorchTensor allows autograd to propagate from a normal Tensor to an
AsyncCollectiveTensor.
r   r*   c                     [        U5      $ rn   )r#   )r   r   s     r   forward_FromTorchTensor.forward  s    
 "%((r   r   c                     U$ rn   r   r   s     r   backward_FromTorchTensor.backward  s    r   r   N)r  r  r  r  r  r  r   r   r>  rA  r  r   r   r   r<   r<     s_    
 )||) 
) ) 5<< ELL  r   r<   z'_c10d_functional::_wrap_tensor_autogradr   z(Tensor input) -> Tensor)mutates_argsrg   r   c                     [        U 5      $ )aO  
Custom op that allows autograd to propagate
from a normal Tensor to an AsyncCollectiveTensor.

This is the low-level implementation. Users should call _maybe_wrap_tensor directly.

Args:
    input: Input tensor to wrap in AsyncCollectiveTensor

Returns:
    AsyncCollectiveTensor wrapping the input (or wait_tensor result if tracing)
)r2   r   s    r   _wrap_tensor_autogradrF    s    $ !''r   c                 .    [         R                  " U 5      $ )z(
Meta kernel for _wrap_tensor_autograd.
r   
empty_likerE  s    r   _rJ    s    
 E""r   c                     U$ )a  
Backward for _wrap_tensor_autograd: identity (no-op).

The wrapping is just for async optimization, gradients flow through unchanged.

Args:
    ctx: Context object (unused)
    grad_output: Gradient from downstream operations

Returns:
    Gradient unchanged (identity)
r   r   s     r   _wrap_tensor_autograd_backwardrL    s
     r   c                     g)z
Setup context for _wrap_tensor_autograd backward.

Args:
    ctx: Context object to save state for backward (nothing to save)
    inputs: Tuple of (input,)
    output: Output from forward pass
Nr   r   s      r   #_wrap_tensor_autograd_setup_contextrN  
  s     r   c                  N   [        5       (       a  g[        R                  R                  [        R                  R                  R
                  5      b  g[        R                  R                  [        R                  R                  R                  5      (       a  g[        5       S L$ r   )
r   r   r]   _get_dispatch_mode_TorchDispatchModeKeyFAKE&_dispatch_tls_is_dispatch_key_includedr`   PythonDispatcherr	   r   r   r   _are_we_tracingrU    ss    !!xx""588#A#A#F#FGSxx66--  4''r   c                 L    [        5       (       a  [        U 5      $ [        U 5      $ rn   )rU  r   rF  r   s    r   r#   r#   *  s!    4   &&r   valuec              #   f  #    [         R                  R                  R                  5       n [         R                  R                  R	                  U 5        Sv   [         R                  R                  R	                  U5        g! [         R                  R                  R	                  U5        f = f7f)a  
Context manager to temporarily set whether inflight collectives are allowed as torch.compile graph inputs.
Common use case is when the collective is issued in eager (with `async_op=True`) but waited in compiled region:
```
def all_reduce_eager(x):
    y = x * x
    req = dist.all_reduce(y, op=dist.ReduceOp.SUM, async_op=True)
    return y


@torch.compile(fullgraph=True)
def all_reduce_wait_compiled(y):
    torch.ops.c10d_functional.wait_tensor(y)
    return y * y


x = torch.ones(1280, 1280, device="cuda") + self.rank
# the context manager ensures that `wait_tensor(y)` will wait on the correct work object
with allow_inflight_collective_as_graph_input_ctx():
    y = all_reduce_eager(x)
    z = all_reduce_wait_compiled(y)
```
With this context manager, when a collective is called, under the hood the work object of the collective
will be registered in the work registry, and the wait_tensor() in compiled region called on
the output tensor of the collective will wait on the correct work object.
N)r   r]   _distributed_c10d)_allow_inflight_collective_as_graph_input-_set_allow_inflight_collective_as_graph_input)rW  previouss     r   ,allow_inflight_collective_as_graph_input_ctxr]  0  sv     8 xx))SSUH
""PPQVW""PP	
""PP	
s   )B1-B *B1+B..B1c                     [        U R                  5       5      n[        U5      S:X  a  UR                  U5        OUS==   U-  ss'   U R	                  U5      nU$ Nr   )rN   rF   rT   append	new_empty)r   r5   out_size
out_tensors       r   _make_all_gather_out_tensorrd  W  sL    EJJL!H
8}
#z!*Jr   c                 D    U  Vs/ s H  n[        XC5      PM     sn$ s  snf rn   rd  )r   r   r   r5   r   s        r   &_all_gather_into_tensor_coalesced_metarg  a  s     @DE1'6EEEs   c                 .    [         R                  " U 5      $ rn   rH  r   r  s     r   _broadcast_metarj  f      D!!r   c                 .    [         R                  " U 5      $ rn   rH  ri  s     r   _all_reduce_metarm  j  rk  r   c                 .    [         R                  " U 5      $ rn   rH  ri  s     r   _wait_tensor_metaro  n  rk  r   c                     [        X5      $ rn   rf  )shardr   r   r5   s       r   _all_gather_into_tensor_metarr  r      &u99r   c                 p    [        U R                  5       5      nUS==   U-  ss'   U R                  U5      $ r_  rN   rF   ra  )r   r   r   r   r5   rb  s         r   _reduce_scatter_tensor_metarv  v  s/    EJJL!HQKJK??8$$r   c                 Z    U  Vs/ s H  n[         R                  " U5      PM     sn$ s  snf rn   rH  )r   r  r   s      r   _all_reduce_coalesced_metarx  |  s%    )-.AEQ...s    (c                     U $ rn   r   inpr  s     r   _all_reduce__metar|        Jr   c                     U $ rn   r   rz  s     r   _broadcast__metar    r}  r   c                     U $ rn   r   )rR   r  s     r   _all_reduce_coalesced__metar    s    Mr   c                 J   ^ U4S jnU  Vs/ s H
  oe" U5      PM     sn$ s  snf )Nc                 v   > [        U R                  5       5      nUS==   T-  ss'   U R                  U5      nU$ r_  ru  )r   rb  rc  r5   s      r   mk_out_tensor<_reduce_scatter_tensor_coalesced_meta.<locals>.mk_out_tensor  s5    

%
"__X.
r   r   )rR   r%   r   r   r5   r  r   s       `  r   %_reduce_scatter_tensor_coalesced_metar    s'     '--fM!f---s    c                     Uc  U R                  U R                  5       5      $ U H  n[        R                  " US:  5        M     [	        U R                  5       5      n[        U5      US'   U R                  U5      $ r_  )ra  rF   r   _checkrN   r   )r   rj   rk   r  r  srb  s          r   _all_to_all_single_metar    sg     !uzz|,,#ALLa  $

%,-x((r   c                    [        X5      $ rn   rf  )r   r5   r$   r   s       r   '_all_gather_into_tensor_out_native_metar    rs  r   c                     [        X5      $ rn   rf  )r   r5   r$   s      r   #_all_gather_into_tensor_native_metar    rs  r   c                 F    U  Vs/ s H  n[        X1U5      PM     sn$ s  snf rn   )r  )rR   r5   r$   r   s       r   -_all_gather_into_tensor_coalesced_native_metar    s0     E 	,EzJ     c                 p    [        U R                  5       5      nUS==   U-  ss'   U R                  U5      $ r_  ru  )r{  r   r5   r$   r~   s        r   "_reduce_scatter_tensor_native_metar    s/    E	!HH==r   c                p    [        U R                  5       5      nUS==   U-  ss'   U R                  U5      $ r_  ru  )r{  r   r5   r$   r   r~   s         r   &_reduce_scatter_tensor_out_native_metar    s1     E	!HH==r   c           	      F    U  Vs/ s H  n[        XAX#5      PM     sn$ s  snf rn   )r  )rR   r   r5   r$   r{  s        r   ,_reduce_scatter_tensor_coalesced_native_metar    s0    
 C 	+3:R  r  r   IMPLr'   Metaall_reduce_rM   all_reduce_coalesced_r   all_gather_into_tensor_outr0   rQ   rG   reduce_scatter_tensor_outrW   r   r"   
broadcast_c10d_functionalDEF)	zObroadcast(Tensor self, int src, str tag, int[] ranks, int group_size) -> TensorzUall_reduce(Tensor self, str reduceOp, str tag, int[] ranks, int group_size) -> Tensorzcall_reduce_coalesced(Tensor[] self, str reduceOp, str tag, int[] ranks, int group_size) -> Tensor[]z"wait_tensor(Tensor self) -> TensorzTall_gather_into_tensor(Tensor shard, str tag, int[] ranks, int group_size) -> Tensorzball_gather_into_tensor_coalesced(Tensor[] input, str tag, int[] ranks, int group_size) -> Tensor[]zareduce_scatter_tensor(Tensor input, str reduceOp, str tag, int[] ranks, int group_size) -> Tensorzpreduce_scatter_tensor_coalesced(Tensor[] inputs, str reduceOp, str tag, int[] ranks, int group_size) -> Tensor[]zall_to_all_single(Tensor input, SymInt[]? output_split_sizes, SymInt[]? input_split_sizes, str tag, int[] ranks, int group_size) -> Tensor(rJ  )tagsra   output_tensorinput_tensorasync_opc                     U(       a  [        S5      eU=(       d    [        R                  R                  nUc  [        S5      eU R	                  [        XX$5      5      $ N@Can't remap async version of inplace op to functional collectivegroup cannot be None)r-   r   r   WORLDcopy_r7   )r  r  r   r  r   r)   s         r   all_gather_tensor_inplacer    sW     N
 	
 %TZZ%%E}34405VWWr   r   r   opc           	          U(       a  [        S5      eU=(       d    [        R                  R                  nUc  [        S5      eU R	                  [        XXSU5      5      $ r  )r-   r   r   r  r  rG   )r   r   r  r   r  rA   r   s          r   reduce_scatter_tensor_inplacer  &  sW     N
 	
 %TZZ%%E}344<<-eSQRRr   avgproductminmaxbandborbxorr   c                     U(       a  [        S5      eU=(       d    [        R                  R                  nUc  [        S5      eU R	                  [        XX$5      5      $ r  )r-   r   r   r  r  r'   )r   r  r   r  r   s        r   all_reduce_inplacer  G  sT     N
 	
 %TZZ%%E}344<<
6u:;;r   c           	          U(       a  [        S5      eU=(       d    [        R                  R                  nUc  [        S5      eU R	                  [        UUUUU5      5      $ r  )r-   r   r   r  r  r   )r   r   rj   rk   r   r  r   s          r   all_to_all_inplacer  Z  sf     N
 	
 %TZZ%%E}344<<	
 r   rI   c                 $  ^ U(       a  [        S5      eTR                  5       S:w  a%  [        U4S jU  5       5      (       d  [        S5      eU=(       d    [        R                  R
                  nUc  [        S5      e[        TSX$5      n/ nSnU  HV  nUR                  5       S:H  n	U	(       a  SOUR                  S5      n
U	(       a  XW   OXWXz-    nUR                  U5        Xz-  nMX     [        X5       H  u  pUR                  U5        M     U $ )Nr  r   c              3   h   >#    U  H'  oR                  S 5      TR                  S 5      :H  v   M)     g7f)r   N)rF   )rs   r   r   s     r   rt   %all_gather_inplace.<locals>.<genexpr>  s$     $V+QVVAY&++a.%@+s   /2z7Remapping variable size all_gather is not yet supportedr  r
   )r-   r:   r}   r   r   r  r7   rF   r`  rV   r  )rI   r   r   r  r   r   output_splitsoffsetr   	is_scalart_offsetr   r   r   s    `            r   all_gather_inplacer  w  s     N
 	
 zz|q$V+$V!V!VVWW%TZZ%%E}344vq%5F MFEEGqL	!1qvvay )fnvv?P/QS!  3		# 4r   )_all_gather_base_reduce_scatter_base
all_gatherr0   r'   r   rG   ) )T)NFr  r   )r   NFr   r  )r   NFr  )NNNFr  )NFr  )
contextlibsysr8  typingr   r   r   r   r   torch.distributeddistributedr   "torch.distributed.distributed_c10ddistributed_c10dr.   torch._utilsr   torch.distributed.device_meshr   "torch.fx.experimental.proxy_tensorr	   r  r   fun_col_impltorch.utils._cxx_pytreer   ImportErrortorch.utils._pytreetorch.compilerr   r   	Exceptionr9  rN   rp   r*  r/  r6  
RANK_TYPESr   r   r  r"   r'   r7   r@   rG   rK   rM   rQ   rW   ri   r   r   r   r   libraryregister_autogradr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r2   r   r!   autogradFunctionr<   	custom_oprF  register_fakerJ  rL  rN  r  rU  r#   contextmanagerr]  rd  rg  rj  rm  ro  rr  rv  rx  r|  r  r  r  r  r  r  r  r  r  r  Librarylib_implimplfxnodehas_side_effectr   r   r   
legacy_liblegacy_lib_implops_defsmodulesr  	my_moduleop_defindexop_namegetattrbackend_impldefineTagpt2_compliant_tagr  r  ReduceOpSUMAVGPRODUCTMINMAXBANDBORBXORREDUCE_OP_TO_STRr  r  r  r  legacy_all_gather_baser  legacy_reduce_scatter_baser  legacy_all_gatherr0   legacy_allgatherlegacy_allreducelegacy_all_to_all_singlelegacy_reducescattertraceable_collective_remapsr   r   r   <module>r     s    
  2 2    1 1 . 4 = :25

N#J
 IcO	
"C
'(NN
::&ELL &s &: &C &&U\\ &S & &# &4 	&
,,&& & 
	&
 \\&Z 	
,,  
	L (
,,(( ( 	(
 
(` %
,,%% % 	%
 
%R LN6
u||
6(+64>6EH6	%,,6: =?6
u||
6%/6696	%,,6D -6-6-6 c-6 	-6
 
-6 
%,,-6dV* 2&
,,2&S	D(2& Cy4'2& 	2&
 
2& \\2&t '*
,,'*S	D('* Cy4''* 	'*
 
'* \\'*^5<<    #+   +%,, +8
&   "*   +ell +8    .#6   1U\\ 1D&   -"5   1 18.   )1   =T%,,5G =>&   ,!4   =ellAS =8    8-@   CU\\@R CD&   7,?    	V
,,V#YV V 
	V
 \\V>q#ELL q#h
M& M&# M&uS$s)S=P7Q M&`+Lz +L +LT^^ +L\u~~.. $ -%  
( (%,, (
(  $$#U\\ #ell # %#U\\  	  ' '"5 ( ( (' ' #
 #
 #
LF
""":%/.
)::   ==  !3V< l,f 5 m. 7 $&@& I %'BF K m. 7  "I6 	&(KV T &1

 	%'I6 R !G 	%0

 	!#:F C k?F 3 l,f 5   eii88DDLL M   eii88DD E ]]""#4e<
--''(96B
 KK!	FQc*+G<1WI7Lf599#>#>?,0KL	  X<<X,,X 	X
 
X X. 
SLLS<<S 	S
 S S 
S, 	MMuMMuMM9MMuMMuMMMMuMM	  
<LL<< 	<
 
<, 
LL<< 
@ "ell#"LL"
 
"J   /7(0) =5 m4  2112  MMn
s$   d+ d> +d;:d;>ee