
    ȅiR                       S SK Jr  S SKrS SKrS SKrS SKrS SKJrJrJ	r	J
r
  S SKrS SKJr  S SKJr  S SKJr  SSKJr  SS	KJrJr  SS
KJrJr  SSKJr  \(       a  S SKJr  SSKJr  SSK J!r!J"r"  SSKJ#r#  \RH                  " \%5      r&\RN                   " S S5      5       r(\RN                   " S S5      5       r)\RN                   " S S5      5       r*\RN                   " S S5      5       r+      S+S jr,    S,S jr-      S-S jr.          S.S jr/\RN                   " S S5      5       r0        S/S jr1        S0S jr2\RN                   " S  S!5      5       r3        S1S" jr4          S2S# jr5S3S$ jr6S3S% jr7S4S& jr8        S5S' jr9            S6S( jr:\5\6\7/4             S7S) jjr;            S8S* jr<g)9    )annotationsN)OptionalTYPE_CHECKING	TypedDictUnion)	is_fbcode)signpost_event)
OrderedSet   )config)MultiOutputLayout
NoneLayout)get_dtype_sizeis_nonfreeable_buffers)V)Callable)Dep)BaseSchedulerNodeSchedulerBuffer)WeakDepc                  4    \ rS rSr% S\S'   S\S'   S\S'   Srg	)
PeakMemoryResult    list[BaseSchedulerNode]orderintpeak_memorystrmethod N__name__
__module____qualname____firstlineno____annotations____static_attributes__r        P/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/memory.pyr   r       s    ""Kr(   r   c                      \ rS rSr% SrS\S'   SrS\S'   \R                  " \	S9r
S\S'   \R                  " \	S9rS\S	'   SS
 jrSrg)MemoryPlanningInfoForBuffer'   r   r   
size_alloc	size_freedefault_factoryOrderedSet[BaseSchedulerNode]
succ_nodessucc_nodes_for_orderingc                   ^  [         R                  " [        T R                  5      [        T R                  5      :*  U 4S j5        g )Nc                 `   > S[        T R                  5       S[        T R                  5       3$ )NzHsucc_nodes must be a subset of succ_nodes_for_ordering. len(succ_nodes)=z, len(succ_nodes_for_ordering)=)lenr2   r3   selfs   r)   <lambda>;MemoryPlanningInfoForBuffer.__post_init__.<locals>.<lambda>7   s2     "4??344STWX\XtXtTuSvxr(   )torch_checkr6   r2   r3   r7   s   `r)   __post_init__)MemoryPlanningInfoForBuffer.__post_init__4   s1     C(D(D$EEx	
r(   r    N)returnNone)r"   r#   r$   r%   r-   r&   r.   dataclassesfieldr
   r2   r3   r=   r'   r    r(   r)   r+   r+   '   sU    JIs0;0A0A"1J-  >I=N=N">: 
r(   r+   c                      \ rS rSr% SrS\S'   SrS\S'   \R                  " \	S9r
S\S'   \R                  " \	S9rS	\S
'   \R                  " \	S9rS	\S'   Srg)MemoryPlanningInfoForNode<   r   r   indexsizer/   z7OrderedSet[Union[SchedulerBuffer, FreeableInputBuffer]]pred_buffersr1   
pred_nodesr2   r    N)r"   r#   r$   r%   rF   r&   rG   rA   rB   r
   rH   rI   r2   r'   r    r(   r)   rD   rD   <   si    E3ND#M*5 I  1<0A0A"1J-  1<0A0A"1J- r(   rD   c                  ^    \ rS rSr% S\S'   \R                  " \S9rS\S'   SS jr	SS jr
S	rg
)FreeableInputBufferK   r   namer/   r+   
mpi_bufferc                    U R                   $ NrM   r7   s    r)   get_nameFreeableInputBuffer.get_nameR   s    yyr(   c                ,    [        U R                  5      $ rP   )hashrM   r7   s    r)   __hash__FreeableInputBuffer.__hash__U   s    DIIr(   r    N)r?   r   )r?   r   )r"   r#   r$   r%   r&   rA   rB   r+   rN   rR   rV   r'   r    r(   r)   rK   rK   K   s,    
I.9.?.?3/J+ r(   rK   c           
     l   SS jn[         R                  " [        5      n[         R                  " [        5      n[        5       nU  H  nUR                  R
                   H  nUR                  U;   d  M  [        U5      (       a  M'  XGR                     R                  U5        U" U5      XWR                  '   [        U[        5      (       a  UR                  (       a  M  X7R                     R                  U5        M     M     [        5       nU H   n	[        U	[        XY   X9   XI   S95      X'   M"     U$ )z
Create and keep track of all input buffers that can be freed during the program

Returns:
    A dictionary containing all freeable input buffers, keyed by their names.
c                @    [         R                  R                  U 5      $ rP   )r   graphget_dep_size_hint)deps    r)   _dep_size_hint.get_freeable_input_buf.<locals>._dep_size_hintd   s    ww((--r(   )r.   r2   r3   )r\   r   r?   r   )collectionsdefaultdictr
   dictread_writesreadsrM   r   add
isinstancer   is_fakerK   r+   )
nodesgraph_inputsr]   dep_name_to_succ_nodes#dep_name_to_succ_nodes_for_orderingdep_name_to_sizenoder\   name_to_freeable_input_bufdep_names
             r)   get_freeable_input_bufro   Y   s   . 	
+  	
+ ( (,v##))Cxx<'-c22 8AEEdK1?1D$XX.&sG44.xx8<<TB *  BF7/B'*41;(K(U0
", 8 &%r(   c                   ^^^^ SSK Jm  SSKJm  [	        5       m S     SUUUU4S jjjmU R                  5        H!  nUR                  5       T;  d  M  T" U5        M#     T$ )a  
Compute the size of each scheduler buffer, including (1) memory allocated when
it is created and (2) memory deallocated when it is freed.

We specially handle the case of MultiOutputLayout.
Consider the following case:
    buf0 = some_ops_with_multi_outputs(...)
    buf1 = buf0[0] # assume 10 bytes
    buf2 = buf0[1] # assume 20 bytes
In such cases,
    buf0: at creation, 30 bytes allocated, when deleted, 0 bytes freed
    buf1: at creation, 0 bytes allocated, when deleted, 10 bytes freed
    buf2: at creation, 0 bytes allocated, when deleted, 20 bytes freed

When an operation mutates a buffer in-place, the scheduler creates a new buffer name
to track the "before" and "after" states, even though they share the same memory.

The mutated buffer represents a rename with zero allocation and deallocation cost.
During dependency tracking, we transfer dependencies from the mutated name back to
the original buffer, ensuring the original memory is only freed when all aliases
are done.

This handles cases where a buffer has multiple non-overlapping aliases - rather than
trying to assign free costs to individual aliases, we forward all alias dependencies
to the original buffer.

Consider:
    buf0 = op0()
    buf1 = mutation_op_(buf0)
    del buf0
    ...
    op(buf1)
    del buf1

The only memory events are the creation prior to op0, and the deletion following buf1.

Returns:
    A dictionary mapping a scheduler buffer to a tuple of (size_alloc, size_free).
r   )MultiOutput)
OutputNodec                  > U R                  5       [        R                  R                  R                  ;   a  ST	U R                  5       '   g[        U R                  R                  [        5      (       a  ST	U R                  5       '   g[        U R                  R                  [        5      (       a  SnU R                   Hj  n[        UR                  T5      (       a  M   UR                  R                  5        H,  n[        UR                  T5      (       d  M   UT" US5      -  nM.     Ml     U(       a  SOUS4T	U R                  5       '   U$ [        R                  R                  R                  U R                  R                  5       SS9[        U R                  R!                  5       5      -  nU(       a  SOUU4T	U R                  5       '   U$ )N)r   r   r   T)fallback)rR   r   rZ   	schedulermutation_real_namere   rl   layoutr   r   usersget_outputssizevars	size_hint	get_numelr   	get_dtype)
	sched_bufuser_of_MultiOutputLayoutr-   userbufbuf_sizerq   rr   _compute_and_update_buf_sizesched_buf_to_sizes
         r)   r   Gcompute_size_for_scheduler_buffer.<locals>._compute_and_update_buf_size   s    177#4#4#G#GG6<i0023	--z::6<i0023	--/@AAJ!dii4499002C!#((K88"&B3&MM
 3 ( /J7i0023 ww''11((*Q 2 y~~779:;H /H7i0023 Or(   )F)r~   r   r   boolr?   r   )irrq   ru   rr   ra   valuesrR   )name_to_bufr~   rq   rr   r   r   s     @@@@r)   !compute_size_for_scheduler_bufferr      sw    T  %48F GL"?C	 @ !'')	 '88(3	 * r(   c                   [        U5      n[        R                  " [        5      n[        R                  " [        5      nU  Hx  nUR                   He  nXFR
                     R                  U5        [        U[        5      (       a  UR                  (       a  MH  X6R
                     R                  U5        Mg     Mz     [        [        R                  R                  R                  R                  5       5       H"  u  pxX8==   UU   -  ss'   XH==   XG   -  ss'   M$     U H$  n	[!        X)   S   X)   S   X9   XI   S9X   l        M&     g)z
For each SchedulerBuffer, assign its size info and successor nodes.
A buffer's successor nodes determines when a buffer can be freed.
r   r   )r-   r.   r2   r3   N)r   r_   r`   r
   unmet_dependenciesrM   rd   re   r   rf   reversedr   rZ   ru   rv   itemsr+   rN   )
rg   r   r   ri   rj   rl   r\   mutating_buf_namereal_buf_namebuf_names
             r)   1assign_memory_planning_info_for_scheduler_buffersr      s6    :+F
 	
+  	
+ ( **C 09==dCsG,,&xx044T: +  -5	,,224-( 	-1G2
 	
- 	,:/B	
:-  +F(215'1!4-7$G$Q	,
(  r(   c           	        [         R                  " [        5      n0 n[         R                  " [        5      nU  H  n[        S UR                  5        5       5      nXU'   U H  n	XI   R	                  U5        M     UR                  5        H3  n
U
R
                  R                   H  n	Xi   R	                  U
5        M     M5     M     UR                  5        H3  nUR
                  R                   H  n	Xi   R	                  U5        M     M5     [        U 5       He  u  p[        S UR                  5        5       5      nXW   nXG   nUR                  U5        UR                  U5        [        UUXg   XG   US9Ul        Mg     g)zD
Assign to each scheduler node its predecessor and successor nodes.
c              3  b   #    U  H%  nUR                   R                    H  nUv   M	     M'     g 7frP   )rN   r3   ).0buffer	succ_nodes      r)   	<genexpr>Bassign_memory_planning_info_for_scheduler_nodes.<locals>.<genexpr>'  s0       
,#..FF	 F ,s   -/c              3  L   #    U  H  oR                   R                  v   M     g 7frP   )rN   r-   )r   r   s     r)   r   r   ?  s     WDV&**55DV   "$)rF   rG   rH   rI   r2   N)r_   r`   r
   ry   rd   rN   r2   r   	enumeratesumdiscardrD   mpi_node)rg   name_to_fused_noder   rm   node_to_pred_nodesnode_to_succ_nodesnode_to_pred_buffersrl   r2   r   r   freeable_bufferrF   r-   rI   s                  r)   /assign_memory_planning_info_for_scheduler_nodesr     s|    	
+  RT 	
+ 
   
**, 
 


 $.4  $I)--d3 $ &&(F#..99	$/33F; : ) & 6<<>(33>>I +//@ ? ?
 !'WDDTDTDVWW
'-
'-
 	4 4 1-3)/!
 (r(   c                  H    \ rS rSr% S\S'   S\S'   S\S'   S\S'   S\S'   S	rg
)
BufferInfoiQ  z+Union[SchedulerBuffer, FreeableInputBuffer]r   r   r-   r.   
start_stepend_stepr    Nr!   r    r(   r)   r   r   Q  s    77ONOMr(   r   c                  ^ [        U 5       VVs0 s H  u  p4XC_M	     snnm/ n0 n    SU4S jjnUR                  5        He  u  pSn
X;  a  U" U	5      u  pUc   eXU	'   UR                  [        U	U	R                  R
                  U	R                  R
                  SU
5      5        Mg     [        U 5       H  u  p4UR                  5        H  nUR                  5       nSn
X;  a   U" U5      u  pU
S:X  a  Un
XFU'   O	Uc   eXU'   UR                  [        UUR                  R                  UR                  R
                  UU
5      5        M     M     UTU4$ s  snnf )z^
Compute buffer allocation and deallocation sizes and map their
lifetime to the node schedule
c                   > SnS nU R                   R                  nU(       a  U H  nTU   nXQ:  d  M  UnUnM     Uc   eX4$ )N)rN   r2   )r   max_stepmax_step_snoder2   r   stepnode_to_steps         r)   _get_end_step_and_snode8compute_memory_timeline.<locals>._get_end_step_and_snodet  s[     6:^^..
'	#I.?#H%.N	 (
 "---''r(   r   r   )r   z+Union[FreeableInputBuffer, SchedulerBuffer]r?   z'tuple[int, Optional[BaseSchedulerNode]])	r   r   appendr   rN   r.   ry   rR   r-   )rg   rm   graph_outputsr   rl   buf_info_listbuf_to_snode_last_user   r   	input_bufr   end_step_snoder~   r   s                @r)   compute_memory_timeliner   Z  s   " &/u%52%5zt
%52L
 ')M 	 (8(	0(   :??A('>y'I$H!---/=),$$..$$..	
  B$  &
))+I !))+HH,+B9+M(r>#H7;)4)5557E)4  ((33((22 , '4 ,(===M2s   Ec                   [        XU5      u  n  n[        [        U 5      S-   5       Vs/ s H  nSPM     nnU HF  nXVR                  ==   UR                  -  ss'   XVR
                  S-   ==   UR                  -  ss'   MH     SnSn/ n	[        [        U 5      S-   5       H&  n
XU
   -  nU	R                  U5        [        Xx5      nM(     Xy4$ s  snf )z
Given a list of nodes in their execution order, estimate the peak memory, by
keeping track of the liveliness of SchedulerBuffers and FreeableInputBuffers.

Returns:
    int: peak memory
    List[int]: memory usage at each node (or each step).
r   r   )	r   ranger6   r   r-   r   r.   r   max)rg   rm   r   r   _memorybuf_info
max_memory
cur_memorymemories_at_nodests              r)   estimate_peak_memoryr     s     2=M1a
 s5zA~./.Aa.F/ """#x':'::#  1$%););;% "
 JJ3u:>"Qi
  ,0
 #
 **! 0s   Cc                  *    \ rS rSr% S\S'   S\S'   Srg)SNodeMemoryi  r   r-   r.   r    Nr!   r    r(   r)   r   r     s    ONr(   r   c                |   [        XU5      u  p4n[        [        U 5      5       Vs/ s H  n[        SS5      PM     nnU Hk  nXgR                     =R
                  UR
                  -  sl        UR                  S:w  d  M@  XgR                     =R                  UR                  -  sl        Mm     0 n[        U 5       H  u  pXi   X'   M     SnSn/ n[        [        U 5      5       HJ  nXn   R
                  nXn   R                  nX-  nUn[        X5      nUU-  nUnUR                  UU45        ML     UUUU4$ s  snf )a  
Alternative version of estimate_peak_memory, that respects the fact,
that every SchedulerNode has multiple phases:
1. alloc ( outputs )
2. run_kernel
3. dealloc last_use buffers
estimate_peak_memory collapses memory into one value: size_alloc - size_free
While peak memory happens after alloc.

Duplicating the code to not migrate all callsites at once,
In future usages of estimate_peak_memory will migrate to this version.
r   r   )r   r   r6   r   r   r-   r   r.   r   r   r   )rg   rm   r   r   r   r   step_idx_allocfreer   snodes_allocfreeirl   r   r   snodes_curr_memoryr   allocfree
post_alloc	post_frees                      r)   estimate_peak_memory_allocfreer     sZ   . /F=/+M+
 6;3u:5FG5F+a+5FG "../::h>Q>QQ:"001;;x?Q?QQ; "
 U#!3!6 $ JJ3u:"%00!$..

0
d
	!!:y"9:  		 3 Hs   D9c                  ^ ^^  " S S[         5      n " S S[         5      n[        5       m[        5       n[        5       nT  HG  n[        UR                  R
                  5      SS.TU'   TU   S   S:X  d  M6  UR                  U5        MI     [        UR                  5       5      [        UR                  5       5      -    H?  n	S[        U	R                  R                  5      U	R                  5       U;   a  S	OS-   0Xi'   MA     [        S
 UR                  5        5       5      n
SnU HG  nX;   a  XU   R                  R                  -  nM%  X;   d  M,  XU   R                  R                  -  nMI     [        X5      nX-
  mT  H  nUR                  R                   H4  n	Xi   S   S	:X  d  M  TU   S==   U	R                  R                  -  ss'   M6     UR!                  5        H4  n	Xi   S   S:X  d  M  TU   S==   U	R                  R                  -  ss'   M6     M     / n["        R$                  nSnU[        T 5      :  Ga  U(       Ga  US:  a%  ['        S U 5       5      U:  a  ['        UU 4S jS9nO['        UUU4S jS9nUR)                  U5        UR+                  U5        US	-  nU
UR                  R,                  -  n
[        X5      nU
TU   S   -  n
X-
  mUR                  R                   H@  nTU   S   S:  d   eTU   S==   S	-  ss'   TU   S   S:X  d  M/  UR                  U5        MB     UR                  R                   Hm  n	Xi   S   S:  d   eXi   S==   S	-  ss'   Xi   S   S	:X  d  M,  U	R                  R                   H'  nTU   S==   U	R                  R                  -  ss'   M)     Mo     U[        T 5      :  a
  U(       a  GM  U[        T 5      :  a  [/        S5      eU$ )a  
A bfs-based greedy topological order. LPMF stands for "Least Peak Memory First".

The idea is from this paper:
Buffer memory optimization for video codec application modeled in Simulink
https://www.cs.york.ac.uk/rts/docs/DAC-1964-2006/PAPERS/2006/DAC06/PDFFILES/P0689.PDF

The algorithm maintains the max memory so far.
At every iteration, for each scheduleable node, it computes:
    - how much memory needs to be allocated for the output buffers of this node;
    - how much memory can be freed as a result of executing this node.
This gives us two values for each node:
    (1) mem1: memory during the execution of the node;
    (2) mem2: memory after executing the node, after some input buffers are freed.
The greedy approach select as follows:
    (i) if there are nodes whose mem1 values are below the max memory so far,
        then pick the node with the lowest mem2 value;
    (ii) otherwise, pick the one with the lowest mem1 value.
c                  *    \ rS rSr% S\S'   S\S'   Srg)'topological_sort_lpmf.<locals>.NodeInfoi6  r   indegreememory_to_freer    Nr!   r    r(   r)   NodeInfor   6  s    r(   r   c                       \ rS rSr% S\S'   Srg))topological_sort_lpmf.<locals>.BufferInfoi:  r   	outdegreer    Nr!   r    r(   r)   r   r   :  s    r(   r   r   )r   r   r   r   r   c              3  N   #    U  H  nUR                   R                  v   M     g 7frP   rN   r.   )r   r   s     r)   r   (topological_sort_lpmf.<locals>.<genexpr>S  s#      <I 	&&<   #%r   c              3  L   #    U  H  oR                   R                  v   M     g 7frP   )r   rG   )r   rl   s     r)   r   r   w  s     E3D4MM&&3Dr   c                `   > [        S U R                  R                   5       [        T5      S9$ )Nc              3  N   #    U  H  nUR                   R                  v   M     g 7frP   r   rF   )r   r   s     r)   r   :topological_sort_lpmf.<locals>.<lambda>.<locals>.<genexpr>|  s#      )AI "**00)Ar   )default)minr   r2   r6   )rl   rg   s    r)   r9   'topological_sort_lpmf.<locals>.<lambda>{  s*    )-)A)A  J"r(   keyc                   > U R                   R                  T:  a  U R                   R                  OSU R                   R                  TU    S   -
  U R                   R                  4$ )Nr   r   )r   rG   rF   )rl   
memory_gap	node_infos    r)   r9   r     sQ    *.--*<*<z*IDMM&&qMM&&49I)JJMM''"r(   z4Failed to schedule, while loop ran too long for lpmf)r   ra   r
   r6   r   rI   rd   listr   rN   r2   rR   r   r.   r   rH   ry   r   &size_threshold_for_succ_based_strategyr   remover   rG   RuntimeError)rg   rm   r   r   r   r   r   nodes_to_schedulerl   r   live_memoryoutput_memoryr   r   schedulesize_threshold	num_itersselected_noder   r   r   s   `                  @@r)   topological_sort_lpmfr     s   49 Y  486INRfH 8B|DMM445
	$ T?:&!+!!$'  K&&()D1K1R1R1T,UUS^^667LLNm3q<
 V  3::< K M!"2==GGGM3ALLVVVM	 "
 [0J)J ==--C}[)Q.$ 01S^^5M5MM1 . ##%C}[)Q.$ 01S^^5M5MM1 &  )+HBBNI
c%j
 %6 QE3DEEV!	M  !M 	  /&Q	 	}--2221
y/0@AA-
 '//::IY'
3a777i ,1,#J/14!%%i0	 ; !))66C=-111M+&!+&}[)Q.!$!:!:Ii()9:cnn>V>VV: ";	 7W c%j
 %6%6d 3u:QRROr(   c           	     F  ^
  " S S[         5      n[        5       m
[        R                   " S S5      5       nSU
4S jjn/ nU  HY  n[	        UR
                  R                  5      SS.T
U'   T
U   S   S	:X  d  M6  [        R                  " XB" U" U5      U5      5        M[     / nS	nU[	        U 5      :  a  U(       a  [        R                  " U5      R                  n[	        U5      T
U   S
'   UR                  U5        US-  nUR
                  R                   HS  n	T
U	   S   S	:  d   eT
U	   S==   S-  ss'   T
U	   S   S	:X  d  M/  [        R                  " UU" U" U	5      U	5      5        MU     U[	        U 5      :  a	  U(       a  M  U[	        U 5      :  a  [        S5      eU$ )a  
A BFS topological sort that selects nodes whose dependencies are executed the
earliest. This follows a FIFO idea. Specifically, at every iteration, for each node
that is schedulable, we gather the order in which its predecessor nodes are executed,
and this sorted list of execution orders of predecessor nodes defines the priority.
We select the node whose predecessors nodes are executed the earliest. The FIFO
idea aims to reduce the liveness duration of buffers created.
c                  *    \ rS rSr% S\S'   S\S'   Srg)&topological_sort_bfs.<locals>.NodeInfoi  r   r   r   r    Nr!   r    r(   r)   r   r     s    
r(   r   c                  4    \ rS rSr% S\S'   S\S'   S	S jrSrg)
.topological_sort_bfs.<locals>.NodeWithPriorityi  	list[int]priorityr   rl   c                    U R                   UR                   :X  aA  U R                  R                  R                  UR                  R                  R                  :  $ U R                   UR                   :  $ rP   )r   rl   r   rF   )r8   others     r)   __lt__5topological_sort_bfs.<locals>.NodeWithPriority.__lt__  sP    }}.yy))//%**2E2E2K2KKK==5>>11r(   r    N)r   NodeWithPriorityr?   r   )r"   r#   r$   r%   r&   r   r'   r    r(   r)   r  r     s    	2r(   r  c                   > TU    S   S:X  d   e[        [        U4S jU R                  R                   5       5      5      nU$ )Nr   r   c              3  4   >#    U  H  nTU   S    v   M     g7f)r   Nr    )r   	pred_noder   s     r)   r   ?topological_sort_bfs.<locals>._node_priority.<locals>.<genexpr>  s      ?W)	)$W-?Ws   )sortedr
   r   rI   )rl   exec_ordersr   s     r)   _node_priority,topological_sort_bfs.<locals>._node_priority  sK    z*a/// ?C}}?W?W 

 r(   r   )r   r   r   r   r   r   z3Failed to schedule, while loop ran too long for bfs)rl   r   r?   r   )r   ra   rA   	dataclassr6   r   rI   heapqheappushheappoprl   r   r2   r   )rg   r   r  r	  r   rl   r   r   r   r   r   s             @r)   topological_sort_bfsr    s   9  486I2 2 2 13'*4==+C+C'DrR	$T?:&!+NN!#3N44H$#O  )+HI
c%j
 %6&78==,/M	- )&Q	 '//::IY'
3a777i ,1,#J/14%$^I%>	J	 ; c%j
 %6%6" 3u:PQQOr(   c                ~  ^^^^^ [        5       m[        5       m/ m[        5       mSUUUUU4S jjmU  H  nUR                  5        H  nUTU'   M
     M!     U  HC  nUR                  R                  [        S UR                  R                   5       5      -   TU'   ME     [        U U4S jS9 H  nT" U5        M     T$ )a  
This is a DFS topological sort. The setup is similar to `topological_sort_schedule`
in scheduler.py. The difference is the order nodes are visited in the outer loop.
In `topological_sort_schedule`, nodes are visited in their original order.
In this function, nodes are visited based on their priority -- for each node, we
compute the total memory of all buffers it reads from or writes to, and we visit
the nodes in ascending order of this priority.
c                  > U T;  a{  TR                  U 5        U R                   Vs/ s H$  nUR                  T;   d  M  TUR                     PM&     nn[        UU4S jS9 H  nT" U5        M     TR	                  U 5        g g s  snf )Nc                :   > TU    U R                   R                  4$ rP   r   nsize_with_readss    r)   r9   5topological_sort_dfs.<locals>.visit.<locals>.<lambda>
  s    /!*<ajj>N>N)Or(   r   )rd   r   rM   r  r   )	r  r\   	dep_nodesrl   name_to_noderesultseenr  visits	       r)   r  #topological_sort_dfs.<locals>.visit  s    D=HHQK ///C88|+ 'SXX&/  
 O d MM! s
   BBc              3  L   #    U  H  oR                   R                  v   M     g 7frP   r   )r   pred_bufs     r)   r   'topological_sort_dfs.<locals>.<genexpr>  s      9
:Th)):Tr   c                :   > TU    U R                   R                  4$ rP   r   r  s    r)   r9   &topological_sort_dfs.<locals>.<lambda>  s    _Q-?AQAQ,Rr(   r   )r  r   r?   r@   )r
   ra   get_buffer_namesr   rG   r   rH   r  )rg   rl   rM   r  r  r  r  r  s      @@@@@r)   topological_sort_dfsr#    s     +5,D15L&(F48FO  ))+D!%L ,   $ 2 2S 9
:>--:T:T9
 6
 !
  u"RSd T Mr(   c                   ^^^^^ Su  nmm[         R                  X5      m/ mSUUUUU4S jjmU  H  nTU   U:X  d  M  T" U5        M     g)z
Validate that the graph is acyclic by checking predecessor relationships.

Raises:
    RuntimeError: If a cycle is detected in the graph
)r   r      c                  > TU    T:X  a  g TU    T:X  aO  TR                  U 5        SR                  T V s/ s H  o R                  5       PM     sn 5      n[        SU S35      eTTU '   TR                  U 5        U R                  R
                   H  nX :w  d   eT" U5        M     TR                  5         TTU '   g s  sn f )Nz -> z_Cycle detected in memory planning graphPath containing cycle (i -> j: j is a dependency of i): zG This indicates invalid dependency relationships in the scheduler graph)r   joinrR   r   r   rI   pop)rl   	path_infor  BLACKGRAYcolor	dfs_visitpaths      r)   r-  )validate_graph_acyclic.<locals>.dfs_visit-  s    ;%;$KK$F]]_$FGIKKT+ VYZ  dD11I$$$i  2 	
d! %Gs   B<N)rl   r   r?   r@   )ra   fromkeys)rg   WHITErl   r*  r+  r,  r-  r.  s      @@@@@r)   validate_graph_acyclicr2    sM     !E4MM%'E$&D 2 ;%dO r(   c                r   U  H  nUR                  5        H  nUR                  5       nXQ;  a  [        U SUR                  5        S35      eX   U:w  a6  [        SU SU SUR                  5        SX   R                  5        S3	5      eXR;   d  M|  [        SU S	UR                  5        S
35      e   M     g)a>  
Validate that for each node's output buffer, the name_to_buf mapping is correct.
For each output buffer buf, we should have name_to_buf[buf.get_name()] == buf.
Also validate that no buffer names overlap with freeable input buffer names.

Raises:
    RuntimeError: If buffer name mapping is incorrect or names overlap
z from zN is not found in name_to_buf mapping. This indicates a missing buffer mapping.z&Buffer name mapping is incorrect for 'z'.Expected name_to_buf['z	'] to be zbut got z/This indicates some buffers share the same namez Buffer name conflict detected: 'z' from node z/ is also used as a freeable input buffer name. N)ry   rR   r   	debug_str)rg   r   rm   rl   r   r   s         r)   validate_unique_buffer_namesr5  K  s     ##%C||~H *"jt}}&7 8@ A  $+"<XJ G--5Ji?P{4>>@AEG  5"6xjT]]_L] ^E F + & r(   c                h    [        X5      n[        X5        [        XX5        [        XU5      u  pgXe4$ )a   
Prepare planning info. As nodes are scheduled one at a time, these help
keep track of when a buffer can be freed, and when a node can be scheduled

Returns:
    int: peak memory estimation
    dict[str, FreeableInputBuffer]: name to freeable input buffer
)ro   r   r   r   )rg   r   r   rh   r   rm   estimated_peak_memoryr   s           r)   prepare_planning_infor8  t  sD     "8!L5eI3;
  4=  !<<r(   c           
        [         R                  S[        U 5      5        [        U UUUU5      u  pg[        R
                  (       a  [        U UUUU5         [        U 5        [        XU5        / nUR                  [        XS5      5        [         R                  SU5        U H  n	 U	[        L a
  U	" XX5      n
OU	" U 5      n
[        U
5      [        U 5      :X  d   e[        XU5      u  pUR                  [        XU	R                   5      5        [         R                  SU	R                   U5        M     [%        SSS	U Vs0 s H  oR&                  UR(                  _M     sn0S
9  [+        US S9nUR,                  $ ! [         a)    [         R                  S5        [        5       (       d  e  GNAf = f! ["         a5    [         R                  SU	R                   5        [        5       (       d  e  GMN  f = fs  snf )z
Try a few heuristics based topological sort algorithms, and pick the one whose
resulting topological order has the lowest peak memory estimation.
z&Reordering for peak memory -- %d nodesz!Memory planning validation failedbaselinezBaseline peak memory: %dz%s peak memory: %dzFailed to reorder for %sinductorr   orm)categoryrM   
parametersc                    U R                   $ rP   )r   )xs    r)   r9   )reorder_for_peak_memory.<locals>.<lambda>  s    ammr(   r   )	torch_loginfor6   r8  r   reorder_for_peak_memory_debugexport_graph_for_simulatorr2  r5  r   	exceptionr   r   r   r   r   r"   	Exceptionr	   r   r   r   r   )rg   r   r   rh   r   methodsr7  rm   peak_memory_diff_methodsr   r   r   r   elembest_results                  r)   reorder_for_peak_memoryrL    s   " NN;SZH8M95 ++"&	
u%$U9ST 8:##zB NN-/DE 	..{ uu:U+++1=NK %++ V__E NN/+N * >VW>VdKK!1!11>VW
 .4KLK[  ?@{{ :  	 :FOOL;; 	 Xs+   E< &B	F2? G4</F/.F/2:G10G1c                  ^^  " S S[         5      n " S S[         5      n " S S[         5      n/ n/ n	UR                  5        H_  u  pU
UR                  R                  UR                  R                  UR                  R                  SX;   / / S.nUR	                  U5        Ma     U  VVs0 s H*  oR                  5         H  oR                  5       U_M     M,     nnnUR                  5        H  u  n
nUR                  c  M  UUR                  R                  5          R                  R                   Vs/ s H  nUR                  5       PM     nnU
UR                  R                  UR                  R                  UR                  R                  S
X;   UU V
s/ s H  oU;  d  M
  U
PM     sn
S.nUR	                  U5        M     U  H>  nUR                  5       [        UR                  5       5      S.nU	R	                  U5        M@     U	US.nSS	KnSS	KnSS	KnSSKJn  UR$                  R'                  U" 5       5      S   S-   mUR)                  USS9mUR*                  R-                  SU4S jU4S jS9  g	s  snnf s  snf s  sn
f )z
This is for debugging purposes. It will dump a json file that records graph information.
The graph can then be used in a simulator: https://fburl.com/code/3l3d3qi4
c                  f    \ rS rSr% S\S'   S\S'   S\S'   S\S'   S\S	'   S\S
'   S\S'   S\S'   Srg)-export_graph_for_simulator.<locals>.ORMBufferi  r   rM   r   r-   r.   rG   r   is_input	is_output	list[str]deps
unmet_depsr    Nr!   r    r(   r)   	ORMBufferrO    s+    		r(   rU  c                  *    \ rS rSr% S\S'   S\S'   Srg)+export_graph_for_simulator.<locals>.ORMNodei   r   rM   rR  buffer_namesr    Nr!   r    r(   r)   ORMNoderW     s    	r(   rY  c                  *    \ rS rSr% S\S'   S\S'   Srg),export_graph_for_simulator.<locals>.ORMGraphi  zlist[ORMNode]rg   zlist[ORMBuffer]buffersr    Nr!   r    r(   r)   ORMGraphr[    s      r(   r]  T)rM   r-   r.   rG   rP  rQ  rS  rT  NF)rM   rX  )rg   r\  r   )get_graph_being_compiled_fusedr%  )indentartifactc                    > T SS.$ )Nstring)rM   encodingr    rQ   s   r)   r9   ,export_graph_for_simulator.<locals>.<lambda>O  s     
r(   c                    > T $ rP   r    )g_strs   r)   r9   re  S  s    5r(   )metadata_fn
payload_fn)r   r   rN   r.   r   ry   rR   defining_opr   rH   r-   r   r"  jsonosr;   functorch.compiler^  r.  splitextdumps_loggingtrace_structured)rg   rm   r   rh   r   rU  rY  r]  orm_buffers	orm_nodesr   r   orm_buf_input_bufferrl   r   r   r~   r  rS  orm_buf_scheduler_bufferorm_nodegrk  rl  r;   r^  rg  rM   s                             @@r)   rE  rE    s   I  )  !9 ! $&K!I  :??A#..88"--77((22!2	+
 	/0  B )./(-9I9I9K#9K  /  +002)  ( /%%..0h||$
$ $ 	 
 #..99"--77((22!2)-)-X1M/
  	34+  30 MMO !6!6!89
 	"  A :77467:XEDJJqJ#E	NN##
 ! $ g/
s   ,1I?8J	J
*J
)rg   r   rh   OrderedSet[str]r?   dict[str, FreeableInputBuffer])r   dict[str, SchedulerBuffer]r?   zdict[str, tuple[int, int]])rg   r   r   rz  r?   r@   )
rg   r   r   dict[str, BaseSchedulerNode]r   rz  rm   ry  r?   r@   )rg   r   rm   ry  r   rx  r?   z{tuple[list[BufferInfo], dict[BaseSchedulerNode, int], dict[Union[FreeableInputBuffer, SchedulerBuffer], BaseSchedulerNode]])rg   r   rm   ry  r   rx  r?   ztuple[int, list[int]])rg   r   rm   ry  r   rx  r?   ztuple[int, list[tuple[int, int]], dict[BaseSchedulerNode, SNodeMemory], dict[Union[FreeableInputBuffer, SchedulerBuffer], BaseSchedulerNode]])
rg   r   rm   ry  r   rz  r   rx  r?   r   )rg   r   r?   r   )rg   r   r?   r@   )rg   r   r   rz  rm   ry  r?   r@   )rg   r   r   rz  r   r{  rh   rx  r   rx  r?   z*tuple[int, dict[str, FreeableInputBuffer]])rg   r   r   rz  r   r{  rh   rx  r   rx  rH  z,list[Callable[..., list[BaseSchedulerNode]]]r?   r   )rg   r   rm   ry  r   r{  rh   rx  r   rx  r?   r@   )=
__future__r   r_   rA   r  loggingtypingr   r   r   r   r;   torch._environmentr   torch._utils_internalr	   torch.utils._ordered_setr
    r   r   r   r   utilsr   r   virtualizedr   collections.abcr   dependenciesr   ru   r   r   r   	getLoggerr"   rB  r  r   r+   rD   rK   ro   r   r   r   r   r   r   r   r   r   r  r#  r2  r5  r8  rL  rE  r    r(   r)   <module>r     sx   "     < <  ( 0 /  - 9  (!= ! h'	    
 
 
(    
 
 
.&".&!.& $.&bU+UUp.
".
+.
 
.
b:
":
4:
 ,:
 !?	:

 
:
|   V>"V> >V> #V>	V>r#+"#+ >#+ ##+ 	#+L   
:": >: #:	:zL"L >L ,L #	L
 L^EP'T+\&"&+& !?& 
	&R="=+= 5= "	=
 #= 0=H 	=V"V+V 5V "	V
 #V :V Vrj"j >j 5j "	j
 #j 
jr(   