
    ȅiB              	          S SK r S SKrS SKrS SKrS SKJr  S SKJrJr  S SK	r	S SK
r
S SKJs  Jr  S SKJr  S SKJr  SSKJr  SSKJrJrJr  SS	KJr  \R4                  " \5      r " S
 S\5      r " S S\5      r\ R>                  S\4S j5       r S\!S\4S jr"S\RF                  S\4S jr$S6S\
RJ                  S\&S\&4S jjr'S6S\
RJ                  S\&S\&4S jjr(S\RF                  S\&4S jr)S\RF                  S\&4S jr* " S S\5      r+ " S S\5      r, " S S\5      r-S /S!//r.S"/S"//S#/S#//S$/S%///r// S&Q/ S'Q/ S(Q/ S)Q/r0S\\1   4S* jr2S+\&S,\&S-\S\14S. jr3S\RF                  S\14S/ jr4S0\
Rj                  Rl                  S\&4S1 jr7S0\
Rj                  Rl                  S\&4S2 jr8  S7S0\
Rj                  Rl                  S3\\&   S4\9S\14S5 jjr:g)8    N)IntEnum)AnyOptional)	size_hint)normalize_function   )ir)get_dtype_sizesnode_args_kwargssympy_product)Vc                   (    \ rS rSrSrSrSrSrSrSr	g)		NCCL_COLL   r   r             N)
__name__
__module____qualname____firstlineno__
ALL_REDUCE
ALL_GATHERREDUCE_SCATTER
ALL_TO_ALLUNSUPPORTED__static_attributes__r       W/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/comm_analysis.pyr   r      s    JJNJKr   r   c                   $    \ rS rSrSrSrSrSrSrg)NVIDIA_GPU_TYPE   r   r   r   r   r   N)	r   r   r   r   VOLTAAMPEREHOPPER	BLACKWELLr   r   r   r    r"   r"      s    EFFIr   r"   returnc                    ^  [         R                  R                  R                  [         R                  R                  R                  5      =(       d    Sm ST ;   a  [
        R                  $ ST ;   a  [
        R                  $ ST ;   a  [
        R                  $ [        U 4S jS 5       5      (       a  [
        R                  $ [
        R                  $ )N V100A100H100c              3   ,   >#    U  H	  oT;   v   M     g 7fNr   ).0gpugpu_infos     r    	<genexpr>get_gpu_type.<locals>.<genexpr>/   s     A(@H_(@   )B100B200B300)torchutilscollect_envget_gpu_inforunr"   r$   r%   r&   anyr'   )r2   s   @r    get_gpu_typer?   &   s    {{&&33EKK4K4K4O4OPVTVH$$$	8	%%%	8	%%%	A(@A	A	A((( %%%r   kernel_namec                   ^  T c   eST ;   a  [         R                  $ ST ;   a  [         R                  $ ST ;   a  [         R                  $ [	        U 4S jS 5       5      (       a  [         R
                  $ [         R                  $ )N
all_reduce
all_gatherreduce_scatterc              3   ,   >#    U  H	  oT;   v   M     g 7fr/   r   )r0   commr@   s     r    r3   7get_collective_type_from_kernel_name.<locals>.<genexpr>>   s     H-GT[ -Gr5   )
all_to_allalltoall)r   r   r   r   r>   r   r   )r@   s   `r    $get_collective_type_from_kernel_namerJ   6   su    """{"###		$###	[	('''	H-GH	H	H###$$$r   nodec                     [        U [        R                  5      (       d  [        SU  35      eU R                  nUc   e[        U5      $ )Nz!node is not a collective kernel: )
isinstancer	   _CollectiveKernel
ValueErrorpython_kernel_namerJ   )rK   names     r    get_collective_typerR   D   sJ    dB0011<TFCDD""D/55r   sizefallbackc                     [        U 5      n[        U[        R                  5      (       a  [	        U5      $ [
        R                  R                  R                  X!S9$ )NrT   )	r   rM   sympyIntegerintr   graphsizevarsr   )rS   rT   numels      r    get_ir_node_size_numelr]   M   sE    $E%''5z77%%e%??r   c                 d    [         R                  " [        R                  U S5      n[	        X!S9nU$ )Nr   rV   )	functoolsreduceoperatormulr   )rS   rT   r\   results       r    get_fx_node_size_numelrd   T   s)    X\\43Eu0FMr   c                     SnU R                    HF  n[        UR                  R                  5      nX[	        UR                  R
                  5      -  -  nMH     U$ )Nr   )inputsr]   layoutrS   r
   dtype)rK   sz_bytesinpr\   s       r    get_collective_input_size_bytesrk   Z   sJ    H{{&szz7N3::+;+;<<<  Or   c                     [        U [        R                  5      (       a:  [        U [        R                  5      (       d  SSKJn  U" U R                  S   5      $ [        SU  35      e)Nr   _get_group_size_by_namezUnsupported collective type: )rM   r	   rN   _WaitKernel"torch.distributed.distributed_c10drn   constant_args	TypeError)rK   rn   s     r    get_collective_group_sizert   b   sQ    $,,--jr~~6V6VN&t'9'9"'=>>7v>??r   c                        \ rS rSrSrSrSrSrg)NCCL_HWp   r   r   r   r   N)r   r   r   r   NVLINKPCINETr   r   r   r    rv   rv   p   s    F
C
Cr   rv   c                       \ rS rSrSrSrSrg)	NCCL_ALGOv   r   r   r   N)r   r   r   r   TREERINGr   r   r   r    r|   r|   v   s    DDr   r|   c                       \ rS rSrSrSrg)
NCCL_PROTO{   r   r   N)r   r   r   r   LLr   r   r   r    r   r   {   s	     
Br   r   g333333@gffffff@g333333?      ?g      @g@)     C@r   gffffff4@)gU@g     6@g      3@)g     a@g     F@g     A@)g     q@g     V@g     Q@c                 R   U R                   nUc   e[        USS5      nUR                  S   nSSKJn  U" U5      n[
        R                  R                  U5      n[
        R                  " SU 35      n[        U5      n[        U 5      u  pSU;   a  U	SS  U	S   -   n	[
        R                  R                  XWS	9 nU" U	0 U
D6n[
        R                  R                  R                  R                  U5        S S S 5        WR                   nUS:  a  g US
-  nU$ ! , (       d  f       N(= f)NrP   r*   ro   r   )_resolve_process_groupzcuda:all_gather_into_tensor_outr   )groupdevice     @@)rK   getattrrr   rq   r   r9   distributedget_rankr   evalr   _time_estimatorops_c10d_functionalwait_tensordefaultestimated_time)snodekernelpy_kernel_namepg_namer   pgrankr   fnargskwargstime_estimatorwest_time_usest_time_mss                  r    /estimate_nccl_collective_runtime_nccl_estimatorr      s"   ZZFV%92>N""2&GI		(B!!**2.D \\E$.)F	n	B$U+LD $~5ABx$q'!				*	*	*	C~		""..66q9 
D !//K Q#K 
D	Cs   :<D
D&tensor_storage_size_bytes
group_sizecollc                    U S-  S-  S-  nSn[         R                  " X-  5      nUnUS::  a  g[        R                  n[        R
                  n[        R                  R                  R                  n	[        R                  R                  R                  n
[        5       nUS::  a  US-
  OSnUS:X  a  UOSn[        U   U   nUS:X  a  U	OU
nSnUU-  n[        UUUS:  d  U[        R                  :X  a  SOS-  5      nU[        R                  :X  a	  SUS-
  -  nOFU[        R                   :X  a	  SUS-
  -  nO)U[        R"                  [        R$                  4;   a  US-
  nSU-  W-  nUU-  nUS	-  n[&        R(                  nU[        R                  :X  a  US:  a  SU-  nO;SnO8U[        R"                  [        R$                  [        R                   4;   a  US-
  n[*        U   U   n[,        U   U   U   n[,        [&        R.                     U   U   nS
nUS:  a  Sn[1        UU5      nUUW-
  U-  UU-  -   -  nUS-  nUU-  nUU-   nUS-  nU$ )a  
Returns estimated NCCL collective runtime in milliseconds (ms).

The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
We aim to estimate the runtime as accurately as possible.

Assumptions:
- only ring algorithm (NCCL_ALGO_RING) is used
- only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used
- 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
- collective is one of: allreduce, reducescatter, allgather
i      r   r   r   g      ?gUUUUUU?r   g    eAg        r   g    .A)mathceilr|   r   r   r   r9   	_inductorconfigintra_node_bwinter_node_bwr?   llMaxBwsminr   r   r   r   r   rv   rx   baseLathwLatrz   max) r   r   r   tensor_storage_size_GBnum_gpus_per_nodenNodesnRanks	nccl_algo
nccl_protobwIntrabwIntercompCapIndexindex2index1llMaxBwbw	nChannelsbusBwnstepsratio	bandwidthbandwidth_GB_per_nsintraHwnInterStepslatencyintraLatinterLatnetOverhead
latency_nstransport_nsnsmss                                    r    %estimate_nccl_collective_runtime_implr      s     7=DtK YYz56FF{ IJ
 oo$$22Goo$$22G>L!Q;VaZAF#q[\aFvv&G aKWBINE !ty/C/C'C9)	UE y###fqj!	%%	%fqj!	)**I,@,@A	A! 6\V#EI#c/ nnGy###A:f*KK	)**I,@,@)BVBVW	Wqj i ,GW~i(4HW[[!),Z8H Kz8[)H$0;3IIIG3J *,??L	
	"B	cBIr   c                 \    [        U 5      n[        U 5      n[        U 5      n[        XU5      $ )  
Returns estimated NCCL collective runtime in nanoseconds (ms).

The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
We aim to estimate the runtime as accurately as possible.

Assumptions:
- only ring algorithm (NCCL_ALGO_RING) is used
- only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used
- 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
- collective is one of: allreduce, reducescatter, allgather
)rk   rt   rR   r   )rK   r   r   r   s       r     estimate_nccl_collective_runtimer   R  s6     !@ E*40Jt$D0!t r   fx_nodec                   ^^ SmU R                   U R                  p![        U5      nUR                  SS5        S[        R
                  S[        4S jmS[        R                  R                  4UU4S jjn[        R                  " [        R                  R                  UX45        U R                  R                  SS5      nTb  [        U[        R
                  5      (       d  g	T" U5      nTU-   $ )
zSEstimate the size of a collective operation in bytes, including inputs and outputs.Nouttr(   c                 `    [        U R                  5       5      [        U R                  5      -  $ r/   )rd   rS   r
   rh   )r   s    r    tensor_bytes1estimate_fx_collective_size.<locals>.tensor_bytesq  s!    %affh/.2IIIr   rj   c                    > U R                   R                  SS 5      n[        U[        R                  5      (       d  g Tc  SmTT" U5      -  mg )Nvalr   )metagetrM   r9   Tensor)rj   inp_valinput_bytesr   s     r    add_inp_bytes2estimate_fx_collective_size.<locals>.add_inp_bytest  sG    ((,,ud+'5<<00 K|G,,r   r   r   )r   r   dictpopr9   r   rY   fxNodepytreetree_map_onlyr   r   rM   )r   r   r   r   
output_valoutput_bytesr   r   s         @@r    estimate_fx_collective_sizer   g  s    K<<&&\F JJudJ J J-588== - - 	 !!%.J*Z"F"F
+L%%r   c                 L    SSK Jn  [        U 5      nU" U 5      (       d  U$ US-  $ )zEstimate the memory footprint of a collective operation in bytes.

This returns the total bytes that need to be live concurrently in memory.
For all_reduce, we divide by 2 since it can be done in-place.
r   )is_all_reduce_tensorr   )#torch._inductor.fx_passes.bucketingr   r   )r   is_all_reducerS   s      r    'estimate_fx_collective_memory_footprintr     s,     'w/D$W--4<419<r   override_sizeuse_nccl_estimatorc                   ^ ^^
^^ SSK Jn  Tc  [        T 5      nOTn[        T R                  [
        5      (       a   e[        T R                  T R                  T R                  SS9nUc   eUu  m
mTS   mU" T5      n[        T R                  [        R                  R                  5      (       d   e[        T R                  R                  5       5      nS[        [           4U
U UUU4S jjnU(       a  U" 5       n	U	b  U	$ [!        XFU5      $ )r   r   rm   T)r   r   normalize_to_only_use_kwargs
group_namer(   c                    >^^ SSK Jn Jn  U" T5      n[        R                  R
                  R                  U5      S:X  a  g U " U5      nUR                  U5      nUR                  (       d  g [        R                  " TT45      u  pVS[        R                  4U4S jjmS[        R                  S[        4S jnS[        S[        4UU4S	 jjmU Vs/ s H  nT" U5      PM     nn[        R                   " XV5      u  pTR"                  n[%        U[        R&                  R(                  5      (       d   e[        R                  R+                  US
9 nU" U	0 U
D6n[        R,                  R.                  R0                  R3                  U5        S S S 5        WR4                  nUS:  a  g US-  nU$ s  snf ! , (       d  f       N-= f)Nr   )_get_pg_default_devicer   faker(   c                 <   > [         R                  " Tc  U OT/UUS9$ )N)rh   r   )r9   empty)rS   rh   r   r   s      r    _tensorVestimate_nccl_collective_runtime_from_fx_node.<locals>._nccl_estimate.<locals>._tensor  s&    ;;%-M? r   sc                 R    [         R                  R                  R                  U SS9$ )Nr   rV   )r   rZ   r[   r   )r   s    r    try_size_hint\estimate_nccl_collective_runtime_from_fx_node.<locals>._nccl_estimate.<locals>.try_size_hint  s"    77##--a!-<<r   ec                 .  > [        U [        R                  R                  5      (       a  T" U R                  S   5      $ [        U [        R
                  5      (       a6  T" [        U R                  5       5      /U R                  U R                  5      $ U $ )Nr   )
rM   r9   r   r   r   r   rd   rS   rh   r   )r  r   to_real_tensors    r    r  ]estimate_nccl_collective_runtime_from_fx_node.<locals>._nccl_estimate.<locals>.to_real_tensor  sg    !UXX]]++%affUm44!U\\** 6qvvx @A177AHHUUHr   )r   r   )rq   r   r   r9   r   distributed_c10dget_backend_get_backendsupports_time_estimater   tree_flattenr   rW   ExprrY   r   tree_unflattentargetrM   _ops
OpOverloadr   r   r   r   r   r   )r   r   r   r   backend	flat_argsflat_args_pytree_specr  a	real_argsreal_kwargsr   r   r   r   r   r   r  r   r   r   r   r   s                   @@r    _nccl_estimateEestimate_nccl_collective_runtime_from_fx_node.<locals>._nccl_estimate  s   	

 $J/--99"=G'+//&)--+1+>+>f~+N(		ELL 		=UZZ 	=C 	=	c 	c 	 	 1::	1^A&		:!'!6!6y!X	^^"ejj334444..R.8NI--AII&&22::1= 9 %33 ?!C' ;
 98s   F1<F66
G)rq   rn   r   rM   r  strr   r   r   r9   r  r  rJ   rQ   r   floatr   )r   r   r   rn   r   opt_args_kwargsr   r   r  r   r   r   r   s   ``        @@@r    -estimate_nccl_collective_runtime_from_fx_noder    s    " K$?$H!$1!'..#....(\\~~%)	O &&&"LD&%J(4Jgnnejj&;&;<<<</0C0C0EFD2HUO 2 2h $&"0!t r   )i   )NT);r_   loggingr   ra   enumr   typingr   r   rW   r9   torch.utils._pytreer:   _pytreer   %torch.fx.experimental.symbolic_shapesr   torch.fx.operator_schemasr   r*   r	   r
   r   r   virtualizedr   	getLoggerr   logr   r"   	lru_cacher?   r  rJ   IRNoderR   SizerY   r]   rd   rk   rt   rv   r|   r   r   r   r   r  r   r   r   r   r   r   r   boolr  r   r   r    <module>r+     s{            $ $ ; 8  C C  ! g  &o & &%c %i %6bii 6I 6@ @s @S @ s S "))  @BII @# @g  
  	
 		  
	 
	 
		,)8huo Be"e03e;De
eZ299  *$& $&3 $&N=UXX]] =s =  $(#bXX]]bC=b b 	br   