
    ȅi3                       % S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKrS SKrS SKJrJr  S SKJrJrJrJrJrJrJr  S SKJr  S SKJr  SSKJr  \(       a  S S	KJ r J!r!J"r"  S S
K#J$r$  S SK%r%S SK&r&S SK'r&S SK(J)s  J*r+  S SK,J-r-J.r.  S SK/J0r0J1r1  S SK2J3r3  S SK4J5r5J6r6  S SK7J8r8  S SK9J:r:J;r;J<r<  S SK=J>r>  SSK?J@r@JArAJBrBJCrCJrJDrD  SSKEJFrF  SSKGJHrHJIrIJJrJ  SSKKJLrLJMrM  SSKCJNrNJOrOJPrPJQrQ  SSKRJSrSJTrT  SSKUJVrV  SSKJWrWJXrXJYrYJZrZJ[r[J\r\  SSK]J^r^  SSK_J`r`Jara  SSKbJcrc  SSKdJereJfrf  SSKgJhrh  SSK)JiriJjrjJkrkJlrlJmrmJnrnJoroJprpJqrqJrrrJsrsJtrtJuruJvrvJwrwJxrxJyry  SS KzJ{r{  \R                  " \}5      r~\&R                  GR                  \}S!5      r\&R                  GR                  \}S"5      r\&R                  GR                  \}S#5      r\&R                  GR                  \}S$5      r\S%   rS&\S''   \" S(5      r\" S)5      r " S* S+5      r\GR                   " S, S-5      5       r\GR                   " S. S/\5      5       r " S0 S%5      r\GR                  SYS1 j5       rSZS2 jrS[S3 jrS\S4 jr\GR                  " S5S69 " S7 S85      5       rS]S9 jr " S: S;5      r        S^S< jr " S= S>\5      r " S? S@\5      r " SA SB\5      r    S_SC jr        S`SE jr " SF SG\5      r " SH SI\5      r " SJ SK\5      r " SL SM\5      r Sa       SbSN jjr      ScSO jrSdSP jr\GR                   " SQ SR5      5       r\GRJ                  " 5       rSeSS jrSfST jr    SgSU jr " SV SD5      r " SW SX5      rg)h    )annotationsN)Counterdefaultdict)AnyGenericOptionalTYPE_CHECKING	TypeAliasTypeVarUnion)	ParamSpec
OrderedSet   )ComputedBuffer)CallableIteratorSequence)
ModuleType)countersdynamo_timed)LambdaFuturePyCodeCache)TritonTemplateCallerBase)get_metric_tableis_metric_table_enabled)free_symbols)free_symbol_is_typesymbol_is_typeSymT)
has_triton)commsconfigconfig_commsdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime/estimate_nccl_collective_runtime_nccl_estimator)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)count_flops_fx)assign_origin_nodeget_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout
NoneLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)ReductionHint)
green_textred_text)SimplifyIndexing)&_unstable_customized_partition_wrappercache_on_selfcmpdevice_need_guardget_current_backendget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsGraphPartitionMapIndentedBufferis_collectiveis_cudagraph_unsafe_opis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitsympy_product)Vfusionloop_orderingcompute_dependencies
cudagraphsBaseSchedulerNoder
   PartitionType_T_Pc                  0   \ rS rSrSr\SS j5       r\SS j5       r\      SS j5       r	\SS j5       r
\      SS j5       r\      SS j5       r\SS	 j5       r\      SS
 j5       r\SS j5       r\      SS j5       r\SS j5       rSrg)MixOrderReductione   z
This class contains utility functions to decide if we should fuse reductions
reducing across different dimensions of the same input tensor.
c                p    U R                  5       =(       a     [        S U R                  5        5       5      $ )Nc              3     #    U  Hl  n[        U[        5      (       d  M  UR                  5       (       d  M1  [        UR                  [        5      (       d  MR  UR                  R
                  S Lv   Mn     g 7fN)
isinstanceSchedulerNodeis_reductionnoder   _split_size.0subnodes     S/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/scheduler.py	<genexpr>7MixOrderReduction.is_split_reduction.<locals>.<genexpr>m   sZ      +
+'=1 1 $$& 1 7<<8	 1GLL$$D0+s   A6A6A6A6)rd   all	get_nodesre   s    rj   is_split_reduction$MixOrderReduction.is_split_reductionk   s3      " 
s +
>>++
 (
 	
    c                   U R                  U5      (       Ga  S nS nUR                  5        GH  n[        U[        5      (       a4  UR	                  5       (       a  [        UR
                  [        5      (       d  MO  UR
                  R                  c   e[        R                  R                  R                  [        UR
                  R                  5      5      nUR
                  R                  c   e[        R                  R                  R                  [        UR
                  R                  5      5      nUc  UnUnGM  [        R                  R                  R                  X%5      (       d   U SU 35       e[        R                  R                  R                  X65      (       a  GM   U SU 35       e   Uc   eX#4$ UR                  S   $ )N v.s. r   )rp   rn   rb   rc   rd   re   r   _original_rangesrS   graphsizevarssimplifyrR   _original_reduction_rangesstatically_known_equalsgroup)clsre   xnumelrnumelri   	curxnumel	currnumels          rj   get_numel_rnumel"MixOrderReduction.get_numel_rnumelu   s   !!$''FF>>+w66,,.."7<<@@||44@@@GG,,55!',,"?"?@	 ||>>JJJGG,,55!',,"I"IJ	 >&F&F77++CC  4 	{34  77++CC  4 	{34 1 ,8 %%%##::a= rr   c                    U R                  U5      nU R                  U5      n[        U5      S:w  d  [        U5      S:w  d  X4:X  a  g[        U5      [        [        U5      5      :H  $ )N   F)r   lentuplereversed)r|   node1node2g1g2s        rj   has_mix_reduction_orders*MixOrderReduction.has_mix_reduction_orders   sX     !!%(!!%(r7a<3r7a<28RyE(2,///rr   c                   SnUR                   R                   H.  n[        U[        5      (       d  M  UR                  U:X  d  M,  Un  O   U(       d  gUR
                  nUR                   R                  nU(       dI  [        U[        5      (       d   [        U5       5       eUR                  S   R                   R                  nU(       d   e[        U5      [        UR                  5      -
  (       d  g[        R                  R                  R                  [!        UR"                  5      [!        UR%                  5       5      5      (       a  gg)z0
The access to 'buf' is not a broadcast access.
NFr   T)read_writesreadsrb   r/   nameindex
var_rangesFusedSchedulerNodetypesnodesr   r   rS   rv   rw   rz   rR   sizevalues)r|   bufre   	found_depdepr   r   s          rj   _is_full_access!MixOrderReduction._is_full_access   s   
 	##))C#y))chh#o	 *
 %%00
d$677HDJ<H7Q33>>Jz:&E4F4F)GG
 7733)..)=9J9J9L+M
 
 rr   c                    / nUR                  5       UR                  5       -  nU HD  nU R                  XQ5      (       d  M  U R                  XR5      (       d  M3  UR                  U5        MF     U$ ra   )used_buffer_namesr   append)r|   r   r   outcommon_readsr   s         rj   get_common_read!MixOrderReduction.get_common_read   sb     ..053J3J3LLC""3..33F3Fs3R3R

3   
rr   c                <    [        U R                  X5      5      S:  $ Nr   )r   r   r|   r   r   s      rj   has_common_read!MixOrderReduction.has_common_read   s     3&&u4599rr   c                    U R                  U5      n[        R                  R                  R	                  US   US   -  SS9$ )Nr   r   fallback)r   rS   rv   rw   	size_hint)r|   re   r   s      rj   	get_numelMixOrderReduction.get_numel   s>    !!$'ww))"Q%"Q%-!)DDrr   c                $    U R                  U5      $ ra   )r   r   s      rj   get_fusion_score"MixOrderReduction.get_fusion_score   s    
 }}U##rr   c                  ^ ^ [         R                  R                  (       d  g[        R                  R
                  (       a  gUR                  5       (       a  UR                  5       (       d  gUR                  5       R                  nUS;  d  [        U5      S:w  a  gUR                  5       (       a  UR                  5       (       d  gUR                  UR                  5       -  (       d"  UR                  UR                  5       -  (       a  gT R                  X5      (       d  g[        R                  X5      n[!        U5      S:X  a  gT R#                  U5      n[$        R&                  " US   US   5      n[$        R(                  " US   US   5      nSn[        R                  R*                  R-                  [$        R.                  " Xg-  U5      5      (       d  g[        R                  R*                  R-                  [$        R.                  " XgS-  5      5      (       d  g[        R                  R*                  R-                  [$        R.                  " US5      5      (       d  g[        R                  R*                  R-                  [$        R0                  " US   U5      5      (       a  X4OX!4u  mn	[3        U U4S	 jTR4                  R6                   5       5      (       d  g[9        S
 TR;                  5        5       5      (       a  g[        R                  R*                  R=                  US5      (       d  g[3        S U	R;                  5        5       5      n
U
$ )z@
Check whether we can fuse two reductions with mix loop orders.
F)cudaxputritonr   r   i  P r   i   c              3  \   >#    U  H!  nTR                  UR                  T5      v   M#     g 7fra   )is_contiguous_loadr   )rh   r   r|   contiguous_nodes     rj   rk   -MixOrderReduction.can_fuse.<locals>.<genexpr>9  s,      
8 ""388_==8s   ),c              3     #    U  H]  nUR                  5       (       d  M  UR                  R                  R                  [        R
                  [        R                  4;  v   M_     g 7fra   )rd   re   datareduction_hintr>   INNERDEFAULTrg   s     rj   rk   r   @  sU      
 7##%GLL,,##%%
 7s   A'AA'i @  c              3     #    U  H9  nUR                  5       (       d  M  UR                  R                  5       S ;   v   M;     g7f)>   sumprodN)rd   re   get_reduction_typerg   s     rj   rk   r   S  s@      
 2##%GLL++-
 2s
   A$A)r#   r   mix_order_reductionrS   rv   cpp_wrapperrN   
get_devicer   rF   rd   	ancestorsget_operation_namesr   r]   r   r   r   sympyMaxMinrw   evaluate_exprGeEqrm   r   r   anyrn   statically_known_leq)r|   r   r   device_typer   r   nrowncol
size_thres
other_noder   r   s   `          @rj   can_fuseMixOrderReduction.can_fuse   s   
 }}00 77||~~U\\^^&&(--.";/8;!!##5+=+=+?+?OOe7799OOe7799  ++E99 )88F|!!!%(yyA1&yyA1&
 

 ww--ehht{J.OPP
 ww--ehhtAX.FGG
 ww--ehhtT.BCC ww--ehhr!ud.CDD N 	$   
&2288
 
 
   
 +446
 
 
 
 ww44T9EE  
 &//1
 
 
rr   c                $    U R                  X5      $ ra   )r   r   s      rj   are_mix_order_reductions*MixOrderReduction.are_mix_order_reductions^  s     ||E))rr   c                :   SSK Jn  UR                  5        H  n[        U[        5      (       d   eUR
                  nUR                  UR                     nU Vs/ s H   owR                  U:X  d  M  UR                  PM"     nn[        U5      S:X  a  M}  U Hy  n	UR                  U	   n
UR                  n[        UR                  5       5      n[        R                   R"                  R%                  U
UU5      nUS   S:X  a  Mm  US   S:X  a  Mx      g   M     gs  snf )Nr   )MemoryUsageTyper   FT)torch._inductor.loop_bodyr   rn   rb   rc   _bodymemory_usageLOADbuffer_name
index_namer   indexing_exprsr   listkeysrS   rv   rw   stride_vars)r|   r   parent_noder   re   	loop_bodyentrieseindex_namesr   
index_exprr   var_symbolsr   s                 rj   r   $MixOrderReduction.is_contiguous_loadd  s   =))+DdM2222

I,,_-A-ABG18QAMMS<P<1<<KQ;1$ *
&55jA
&11
 #:??#45gg..:: $B1,B10D  * ,2 + Rs   D1D Nre   rX   returnbool)re   rX   r   ztuple[sympy.Expr, sympy.Expr]r   rX   r   rX   r   r   )r   strre   rX   r   r   )r   rX   r   rX   r   	list[str])re   rX   r   intr   rX   r   rX   r   r   )r   r   r   rX   r   r   )__name__
__module____qualname____firstlineno____doc__staticmethodrp   classmethodr   r   r   r   r   r   r   r   r   r   __static_attributes__r   rr   rj   r]   r]   e   sb   
 
 
 #! #!J 	0%	0.?	0		0 	0  B 	%	.?			 	 :%:.?:	: :
 E E $%$.?$	$ $ s sj *%*.?*	* *
  rr   r]   c                      \ rS rSr% S\S'   S\S'   S\S'   \R                  " \S9rS	\S
'   \R                  " \	S9r
S\S'   SS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSrg) SchedulerBufferi  	Scheduler	schedulerz	ir.Bufferre   Optional[BaseSchedulerNode]defining_op)default_factorylist[NodeUser]usersr<   
mpi_bufferc                D    U R                   nUc   eUR                  5       $ ra   )r  get_name)selfops     rj   defining_op_name SchedulerBuffer.defining_op_name  s#    ~~{{}rr   c                @    [        U R                  R                  5      $ ra   )hashre   r   r  s    rj   __hash__SchedulerBuffer.__hash__  s    DIINN##rr   c                   [        5       nU R                  5       nUR                  U S[        U R                  5      R
                   35        UR                  U SU R                  R                   35        U R                  5       (       a-  UR                  U S[        U R                  5       5       35        U R                  5       (       a-  UR                  U S[        U R                  5       5       35        [        U R                  5      S::  a0  UR                  U SU R                   35        UR                  5       $ UR                  U S35        UR                  S5         U R                   H  nUR                  U S35        M     S S S 5        UR                  S	5        UR                  5       $ ! , (       d  f       N/= f)
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])rK   r  	writeliner   re   r   layoutget_aliasespformatget_mutationsr   r	  indentgetrawvalue)r  resultr   users       rj   	debug_strSchedulerBuffer.debug_str  s   !}}D6DO$<$<#=>?D6DII,<,<+=>?v[9I9I9K1L0MNOv]74;M;M;O3P2QRStzz?avYtzzl;< !!## vZ01q! JJD$$vQZ0 ' " S!!!##	 "!s   *(F;;
G	c                6    U R                   R                  5       $ ra   re   r  r  s    rj   r  SchedulerBuffer.get_name      yy!!##rr   c                0   U R                   c   eU R                   R                  5       (       d  g U R                   R                  5       (       dV  U R                   R                  5       (       d7  [	        U R                   R                  5       [        R                  5      (       a4  [        R                  R                  R                  U R                   5        g [        [        R                  S5      (       a  U R                  5       [        R                  R                  ;   a  [        R                  R                  U R                  5          nXR                   R"                  ;   a$  U R                   R"                  U   R                   nO#U R                   R$                  U   R                   n[        R                  R                  R'                  UU R                   5        g [        R                  R                  R                  U R                   5        g )Nargs)re   should_allocateget_inputs_that_alias_outputget_mutation_namesrb   get_output_specr&   CommBufferLayoutrS   rv   wrapper_codecodegen_allocationhasattrkernelr  inplace_update_buffersr  name_to_donated_buffername_to_bufcodegen_inplace_reuse)r  input_buffer_nameinput_buffers      rj   allocateSchedulerBuffer.allocate  sc   yy$$$yy((** II2244yy++--$))335r7J7JKKGG  33DII> AHHf%%188#B#BB !" ? ? P NN$I$II#~~DD% $   $~~99:KLQQGG  66		
 GG  33DII>rr   c                &   U R                   c   e[        U R                   R                  [        R                  5      (       d  [        U R                   5      (       a  gU R                   H$  n[        UR                   [        5      (       d  M$    g   gNFT)re   rb   r  r&   r:   rO   r	  
OutputNode)r  uses     rj   can_freeSchedulerBuffer.can_free  sm    yy$$$dii&&66:SII;
 ;
 ::C#((J//  rr   c                4   0 nU Hr  n[        UR                  5      U;   a?  UR                  U[        UR                  5         5      U[        UR                  5      '   M[  X2[        UR                  5      '   Mt     [        UR	                  5       5      U l        g ra   )idre   merger   r   r	  )r  r	  r!  r?  s       rj   	set_usersSchedulerBuffer.set_users  sm    &(C#((|v%'*yy3881E'Fr#((|$'*r#((|$	 
 &--/*
rr   c                T    U R                   c   eU R                   R                  5       $ ra   )re   r,  r  s    rj   r  SchedulerBuffer.get_aliases  s%    yy$$$yy5577rr   c                T    U R                   c   eU R                   R                  5       $ ra   )re   r-  r  s    rj   r  SchedulerBuffer.get_mutations  %    yy$$$yy++--rr   c                R    U R                   R                  5       R                  5       $ ra   )re   r.  r   r  s    rj   r   SchedulerBuffer.get_device  s    yy((*5577rr   )r	  Nr   r   r   r   r   Noner   r   )r	  r  r   rQ  r   zSequence[str]r   Optional[torch.device])r   r   r   r   __annotations__dataclassesfieldr   r	  r<   r
  r  r  r#  r  r:  r@  rE  r  r  r   r   r   rr   rj   r  r    sv    
O,,'--dCE>C.9.?.?3/J+ 
$$($?B
+8.8rr   r  c                  $    \ rS rSr% SrS\S'   Srg)SchedulerDonatedBufferi  Nr  r  r   )r   r   r   r   r  rV  r   r   rr   rj   rZ  rZ    s    /3K,3rr   rZ  c                  t   \ rS rSr% S\S'   S\S'   S\S'   S\S'   S\S	'   S
\S'   S\S'   SrS\S'   S\S'   S\S'   SrS\S'   S\S'   S\S'   SrS\S'   STS jrSUS jr	SVS  jr
SVS! jrSVS" jrSWS# jrSVS$ jrSXS% jr      SYS& jrSZS' jrS[S( jrS\S) jrS]S* jr      S^S+ jrSXS, jrS_S- jrS_S. jrSXS/ jrSXS0 jr    S`S1 jrSVS2 jrSVS3 jr\S_S4 j5       r\S_S5 j5       r \S\S6 j5       r!\S\S7 j5       r"SaS8 jr#SbS9 jr$ScS: jr%SdS; jr&S\S< jr'S\S= jr(S\S> jr)S\S? jr*S\S@ jr+S\SA jr,S\SB jr-S\SC jr.SeSD jr/S\SE jr0SXSF jr1 Sf     SgSG jjr2\ShSH j5       r3\ShSI j5       r4\ShSJ j5       r5      SiSK jr6      SjSL jr7\SkSM j5       r8SlSN jr9\SlSO j5       r:SmSP jr;SnSQ jr<\=    SoSR j5       r>SSr?g)prX   i  OrderedSet[str]r   z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]r{   
last_usager   	min_order	max_orderr=   mpi_nodedict[str, str]mutation_renamesNzOptional[ir.Operation]re   list[SchedulerBuffer]outputsdict[str, SchedulerBuffer]outputs_by_nameOptional[float]override_estimated_runtimedependencies.ReadWritesr   OrderedSet[Dep]unmet_dependenciesFr   writtenc                     Xl         S U l        g )Nc                     / $ ra   r   )r*  kwargss     rj   <lambda>,BaseSchedulerNode.__init__.<locals>.<lambda>  s    Brr   )r  debug_device_str)r  r  s     rj   __init__BaseSchedulerNode.__init__
  s    $-& 	rr   c           	     ^   Xl         [        5       U l        [        [           " 5       U l        SU l        UR                  5        Vs/ s H  n[        U R                  UU S9PM     snU l	        U R                   Vs0 s H  o3R                  5       U_M     snU l        0 U l        g s  snf s  snf )NF)r  re   r  )re   r   r   r   r]  rl  get_outputsr  r  rd  r  rf  rb  )r  re   outputr   s       rj   _init_from_node!BaseSchedulerNode._init_from_node  s    	#$
   **,
 - .. 
 -
 @D||L| 3|L !#
  Ms   B%;B*c                V    [        U 5      R                   SU R                  5       < S3$ )Nz(name=)r   r   r  r  s    rj   __repr__BaseSchedulerNode.__repr__(  s'    t*%%&fT]]_,?qAArr   c                P   U R                  5       n[        5       nUR                  U S[        U 5      R                   S[        [        U SS5      5      R                   SU S[        U R                  R                  5       SU S[        U R                  5       SU S	[        U R                  R                  U R                  -
  5       SU S
35        UR                  5          U R                  5        H"  nUR                  UR                  5       5        M$     SSS5        UR                  S5         UR                  U R                  5       5        UR'                  5       R)                  5       $ ! , (       d  f       N]= f! [          a    ["        R%                  SSS9   NOf = f)#Longer form printout for trace logsr  (re   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        r  Ignoring error in debug_str()Texc_info)r  rK   splicer   r   getattrr  r   writesrk  r   r  rv  r#  r  debug_str_extra	Exceptionlogwarningr   rstrip)r  r   r   r   s       rj   r#  BaseSchedulerNode.debug_str+  sv   }}

bd		QtGD&$$?@IIJ Kj))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 		
 ZZ\'')

3==?+ *  	c	HJJt++-.  '')) \  	HKK7$KG	Hs   %7E36F 3
FF%$F%c                    g)N r   r  s    rj   r  !BaseSchedulerNode.debug_str_extraD      rr   c                $    U R                  U 5      $ ra   )rr  r  s    rj   _debug_str_for_device'BaseSchedulerNode._debug_str_for_deviceG  s    $$T**rr   c                   [        U R                  SS 5      nSn[        U[        R                  R
                  R                  5      (       a$  SUR                  UR                  5       /SSS9-   nOe[        U[        R                  R
                  R                  5      (       a2  SUR                  UR                  5       UR                  5       /SSS9-   nU  U 3$ )Nr   r  , F)shorten	multiline)r  re   rb   torch	_inductorr&   	Pointwise
str_helperget_size	Reductionget_reduction_sizer   )r  
maybe_datadata_strs      rj   debug_str_short!BaseSchedulerNode.debug_str_shortJ  s    TYY5
j%//"4"4">">??j33$$&'% 4  H 
EOO$6$6$@$@AAj33..0*2O2O2QR 4  H
 z""rr   c                p    [         R                  SU U R                  U R                  R                  5        g )Nz(%s: unmet_dependencies = %s, writes = %s)r  infork  r   r  r  s    rj   log_detailsBaseSchedulerNode.log_detailsY  s,    6####		
rr   c                    gNFr   )r  self_dep	other_deps      rj   reorder_loops_by_dep_pair+BaseSchedulerNode.reorder_loops_by_dep_paira       rr   c                    S U R                   R                  5        5        Vs0 s H  nX!;   d  M
  X!U   _M     snU l        U R                  U R                   R	                  U R                  5      5        g s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7fra   r   rh   r   s     rj   rk   9BaseSchedulerNode.update_mutated_names.<locals>.<genexpr>i  s     Q-Pc-P   )r   reads_and_writesrb  set_read_writesrename)r  renamesr   s      rj   update_mutated_names&BaseSchedulerNode.update_mutated_namesf  ss     RT-=-=-N-N-PQ!
Q  D$-Q!

 	T--44T5J5JKL!
s
   	A7	A7c                X    U R                  U R                  R                  U5      5        g ra   )r  r   	with_readr  r   s     rj   add_fake_depBaseSchedulerNode.add_fake_depn  s!    T--77<=rr   c                B    [        S U R                  5        5       5      $ )Nc              3  n   #    U  H+  oR                  5       =(       d    UR                  5       v   M-     g 7fra   )r  r  )rh   r   s     rj   rk   =BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>r  s*      
@ROO4!2!2!44@Rs   35)r   rv  r  s    rj   has_aliasing_or_mutation*BaseSchedulerNode.has_aliasing_or_mutationq  s%     
@D@P@P@R
 
 	
rr   c                f    Xl         U R                   R                  U l        U R                  5         g ra   )r   r   rk  
prune_deps)r  rws     rj   r  !BaseSchedulerNode.set_read_writesv  s&    "&"2"2"8"8rr   c                b   ^ U R                  5       n[        U4S jU 5       5      nX1-
  U l        g )Nc              3  F   >#    U  H  nTR                  X5      v   M     g 7fra   )get)rh   kmutation_real_names     rj   rk   3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>  s      !U1"4"8"8">">   !)used_or_aliased_buffer_namesr   r]  )r  future_used_buffersr  used_bufferss     ` rj   set_last_usage BaseSchedulerNode.set_last_usage{  s-     88:!!U!UU&<rr   c                J    U R                    H  nUR                  5         M     g ra   )rd  r:  )r  r   s     rj   mark_runBaseSchedulerNode.mark_run  s    <<CLLN  rr   c                    [        S [        R                  " U R                  R                  U R                  R
                  5       5       5      $ )Nc              3  :   #    U  H  nUR                   v   M     g 7fra   r  r  s     rj   rk   6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>  s      
W HHW   )r   	itertoolschainr   r   r  r  s    rj   r   #BaseSchedulerNode.used_buffer_names  s?     
 t'7'7'='=t?O?O?V?VW
 
 	
rr   c                  ^ [        5       m[        R                  " U R                  R                  U R                  R
                  5       Vs/ s H7  n[        U[        5      (       a  UR                  (       a  M+  UR                  PM9     nn[        U5      S:  a  UR                  5       nTR                  U5        [        R                  R                  R!                  U5      (       aD  UR#                  U4S j[        R                  R                  U   R%                  5        5       5        [        U5      S:  a  M  T$ s  snf )z
Returns buffer names used by this node, including aliases.

Note: is_fake WeakDeps are excluded since they are purely for ordering
and should not affect buffer lifetime.
r   c              3  8   >#    U  H  nUT;  d  M  Uv   M     g 7fra   r   )rh   alias
used_namess     rj   rk   ABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>  s(      "5 J.	 E"5s   
	)r   r  r  r   r   r  rb   r1   is_faker   r   popaddrS   rv   name_to_bufferr  extendr,  )r  r   depsr  s      @rj   r  .BaseSchedulerNode.used_or_aliased_buffer_names  s     '1l
 !t'7'7'='=t?O?O?V?VW
WsG,, CHHW 	 

 $i!m((*CNN3ww%%))#.. !"!7!7"224"5 	 $i!m !
s   *E;Ec                N   ^  [        U 4S jT R                   5       5      T l        g )Nc              3  t   >#    U  H-  nUR                   TR                  R                  ;  d  M)  Uv   M/     g 7fra   )r   r  available_buffer_namesrh   r   r  s     rj   rk   /BaseSchedulerNode.prune_deps.<locals>.<genexpr>  s0      -
.xxt~~DDD C.s   (8	8r   rk  r  s   `rj   r  BaseSchedulerNode.prune_deps  s#    ", -
..-
 #
rr   c                   ^ ^ SU 4S jjm[        U4S jT R                  R                   5       5      nT R                  T R                  R	                  U5      5        g )Nc                   > [        U [        5      (       d  gTR                  R                  U R                     R                  5       nU[        R                  R                  ;   $ r  )	rb   r1   r  r6  r   r  rS   rv   removed_operations)r   op_namer  s     rj   should_prune7BaseSchedulerNode.prune_weak_deps.<locals>.should_prune  sI    c7++nn00:KKMGagg8888rr   c              3  F   >#    U  H  nT" U5      (       d  M  Uv   M     g 7fra   r   rh   r   r  s     rj   rk   4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>  s      
1C\#5FCC1   !	!r   r.   r   r   )r   r   r   r  remove_reads)r  	to_remover  s   ` @rj   prune_weak_deps!BaseSchedulerNode.prune_weak_deps  sN    	9  
++11
 
	 	T--::9EFrr   c                D    [        XU R                  R                  5        g ra   )_prune_redundant_depsr  r6  )r  name_to_fused_nodes     rj   prune_redundant_deps&BaseSchedulerNode.prune_redundant_deps  s     	d8R8RSrr   c                T    U R                   c   eU R                   R                  5       $ ra   )re   get_operation_namer  s    rj   r  BaseSchedulerNode.get_name  rK  rr   c                "    U R                  5       $ ra   r  r  s    rj   get_first_name BaseSchedulerNode.get_first_name  s    }}rr   c                B    [        S U R                  5        5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fra   r  rh   re   s     rj   rk   8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>  s     G6Fd--//6F   )r   rn   r  s    rj   r   %BaseSchedulerNode.get_operation_names  s    Gdnn6FGGGrr   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fra   r  rh   r   s     rj   rk   5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>  s     ALS,,..Lr  )r   rd  r  s    rj   get_buffer_names"BaseSchedulerNode.get_buffer_names  s    ADLLAAArr   c                B    [        S U R                  5        5       5      $ )Nc              3  d   #    U  H&  n[        U[        5      =(       a
    [        US S9v   M(     g7f)T)disallow_fp32_opsNrb   rc   r(   rh   ns     rj   rk   ABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>  s6      
 & q-( G+AFG%s   .0rm   rn   r  s    rj   can_codegen_in_low_precision.BaseSchedulerNode.can_codegen_in_low_precision  s%     
 ^^%
 
 	
rr   c                B    [        S U R                  5        5       5      $ )Nc              3  f   #    U  H'  n[        U[        5      =(       a    [        U5      v   M)     g 7fra   r!  r"  s     rj   rk   @BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>  s,      
% q-(K-H-KK%s   /1r%  r  s    rj   r(   -BaseSchedulerNode.can_codegen_without_upcasts  s#     
^^%
 
 	
rr   c                    U /$ ra   r   r  s    rj   rn   BaseSchedulerNode.get_nodes  s	    vrr   c                    U R                   $ ra   )rd  r  s    rj   rv  BaseSchedulerNode.get_outputs  s    ||rr   c                     U R                   U   $ ra   )rf  )r  buf_names     rj   
get_outputBaseSchedulerNode.get_output  s    ##H--rr   c                T    U R                   c   eU R                   R                  5       $ ra   )re   r   r  s    rj   r   BaseSchedulerNode.get_device  s%    yy$$$yy##%%rr   c                V    U R                  5       nUS L=(       a    UR                  S:H  $ Ncpu)r   r   r  devices     rj   is_cpuBaseSchedulerNode.is_cpu  s'    "T!:fkkU&::rr   c                b    U R                  5       nUS L=(       a    [        UR                  5      $ ra   )r   rN   r   r9  s     rj   rN   BaseSchedulerNode.is_gpu  s'    "T!9fV[[&99rr   c                    gr  r   r  s    rj   rd   BaseSchedulerNode.is_reduction      rr   c                    gr  r   r  s    rj   is_native_matmul"BaseSchedulerNode.is_native_matmul  rA  rr   c                    gr  r   r  s    rj   is_split_scanBaseSchedulerNode.is_split_scan  rA  rr   c                    gr  r   r  s    rj   is_templateBaseSchedulerNode.is_template  rA  rr   c                    gr  r   r  s    rj   	is_externBaseSchedulerNode.is_extern  rA  rr   c                    gr  r   r  s    rj   
is_foreachBaseSchedulerNode.is_foreach  rA  rr   c                    gr  r   r  read_deps     rj   can_inplaceBaseSchedulerNode.can_inplace  rA  rr   c                    gr  r   r  s    rj   has_side_effects"BaseSchedulerNode.has_side_effects  rA  rr   c                X  ^  SSK Jn  [        T [        5      (       a  [        R
                  (       a  [        R                  R                  T R                  5       [        R                  5      (       a  [        [        R                  [        R                  R                  R                   R"                  5      (       a  [%        [        R                  SS5      b  ['        [        R                  S5      (       d  gT R(                  [        R                  R*                  -  T R,                  R.                  -  nSU 4S jjnT R1                  5        GHQ  nUR2                  nUc   eUR5                  5       (       aV  UR7                  5       (       dA  UR9                  5       (       d,  UR;                  5       [        R                  R<                  ;   a  M  T R>                  R@                   GH  nURB                  T R,                  RD                  ;   a$  T R,                  RD                  URB                     nO/T R,                  RF                  RI                  URB                  5      nU(       d  M  [        R                  RJ                  RM                  UT 5      (       d  M  [        URN                  [P        5      (       a  M  URR                  c   eURR                   Vs/ s H%  nUR2                  R;                  5       U;  d  M#  UPM'     n	n[U        U	5      S:X  d  GM3  U	S   RV                  (       d  GMJ  U	S   R2                  T L d  GM_  UR2                  c  GMo  [        UR2                  RY                  5       [Z        R\                  [Z        R^                  [Z        R`                  45      (       a  GM  URN                  (       am  [        URN                  R2                  [Z        Rb                  [Z        Rd                  45      (       a*  [U        UR2                  R7                  5       5      S:  a  GMF  U" UR2                  UR2                  5      (       d  GMk  U" U5      (       d  GM{  [        R                  Rf                  Ri                  UR;                  5       UR;                  5       5        [        [        R                  [        R                  R                  R                   R"                  5      (       an  [        R                  Rj                  Rm                  UR;                  5       5        [        R                  Rj                  Rm                  UR;                  5       5        UR;                  5       [        R                  Rn                  UR;                  5       '     GMO     GMT     gs  snf )	zf
Decide if there should be inplace updates for the node
and record the decision in the active kernel.
r   )can_match_buffer_size	mutationsNr*  c                  >^ U R                   R                  T5      nU R                  5       m[        5       nU R                   H  nUR
                  n[        U[        5      (       d  M&  UR                  5       U R                   R                  ;  d  U R                   R                  U5      ULa  Mn  UU4S jUR                  R                  5        5       -  n[        U5      S:  d  M    g   g)Nc              3  L   >#    U  H  nUR                   T:X  d  M  Uv   M     g 7fra   r  )rh   or1  s     rj   rk   ^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>>  s&      Evv) AEs   $	$r   FT)r  get_fused_noder  r   r	  re   rb   rX   r  r  r   r  r   )buf_to_be_inplaced
fused_noder  r"  	user_noder1  r  s        @rj   single_index_in_fused_nodeKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node&  s    
 ,55DDTJJ)224H %/LD*00 II	!)->?? ,,.-77JJK)33BB9M%&  &22CCE 
 t9q= ' 1* rr   r   )ra  r  r   r   )8codegen.wrapperrZ  rb   rc   r#   inplace_buffersrS   rv   has_featurer   r)   INPLACE_BUFFERSr3  r  r  codegensimd
SIMDKernelr  r2  r   r  r  completed_operationsrv  re   r+  r,  r-  r  removed_buffersr   r   r   r5  r6  r  r0  	can_reuser  NopKernelSchedulerNoder	  r   rT  r.  r&   r:   r9   MutationLayoutSHOULDREMOVEFallbackKernelr8   r*  make_inplacer[  r  r4  )
r  rZ  inconsequential_nodesrd  r   buf_noderead	input_bufxremaining_usess
   `         rj   decide_inplace_update'BaseSchedulerNode.decide_inplace_update  s   
 	; t]++&&##DOO$5~7U7UVVqxx)@)@)E)E)P)PQQ188[$7C &)) NNgg(()nn112 	 	D ##%CxxH''',,..88::..00<<>QWW%<%<<((..99 E EE $ E Edii PI $ : : > >tyy II I,,66y$GG&y'<'<>TUU$??666 "+&!0A66??,4II !0 # & N+q0*1-999*1-22d:%NN6 *%NN::< " " 4 4 " = =! ! &11 * ) 5 5 : :!#!2!2BNN C! ! !$INN$O$O$Q RUV V1)..#((KK6yAA
 2293E3E3GX%HHeoo&=&=&B&B&M&M  HH..2293E3E3GHHH..223<<>B &..0 77G q / &0&s   "V'V'c                   [         R                  (       d  g U(       a  U R                  (       a  g U R                  c   eU R                  R	                  5       n/ nU GH5  nUR
                  S:X  a  M  UR                  S5        UR                  S5        SUR
                   SUR                   3nSUR                  ;   a  USUR                  S    3-   nUR                  U5        SUR                  ;   d  M  UR                  S    nUR                  S	S
S9S   nUR                  SUR                  SS5      R                  SS5      R                  SS5      R                  SS5      -   5        UR                  S5        UR                  S5        GM8     [        U5      S:X  a  g UR                  U5        SU l        g )Nrw  r  z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|r   )maxsplitr   {z{{}z}}r  \z\\z#pragma CMT END ORIGINr   T)r#   comment_originrl  re   get_originsr  r   targetmetarsplitreplacer   
writelines)	r  buffer	only_onceorigins	out_linesr^  op_info_strr  stack_trace_last_lines	            rj   codegen_originating_info*BaseSchedulerNode.codegen_originating_info  s    $$yy$$$))'')	AttxR 23(az:K166!)hqvvh7G6H,II[)&!"!6 7(3(:(:3(:(KB(O%  "+33C>WS$'WT4(Wf	   !9:  $3 6 y>Q 	)$rr   c                "    U R                  SSS9$ )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implr  s    rj   get_read_write_buffers_sizes.BaseSchedulerNode.get_read_write_buffers_sizes  s    55t 6 
 	
rr   c                "    U R                  SSS9$ )NTFr  r  r  s    rj   get_read_buffer_sizes'BaseSchedulerNode.get_read_buffer_sizes  s    55u 6 
 	
rr   c                "    U R                  SSS9$ )NFTr  r  r  s    rj   get_write_buffer_sizes(BaseSchedulerNode.get_write_buffer_sizes  s    55 6 
 	
rr   c                L    [        U R                  XS9R                  5       SS9$ )Nr  r   )start)r   get_read_write_buffer_accessesr   )r  r  r  s      rj   r  3BaseSchedulerNode.get_read_write_buffers_sizes_impl  s1     //+ 0 fh	
 	
rr   c                  ^ ^^^^^ [        T [        5      (       a  0 $ [        T [        5      (       a!  [        T R                  [        5      (       a  0 $ [        T [        5      (       af  [        T R                  [
        R                  5      (       a=  T R                  R                  [        R                  R                  R                  L a  0 $ SS jm[        T [        5      (       a@  T" [        T R                  5       S   5      [        T R                  5       S   5      -  5      mO[        S5      m[         R"                  " [$        5      nU(       a:  T R&                  R(                   H   nX4R*                     R-                  U5        M"     U(       a:  T R&                  R.                   H   nX4R*                     R-                  U5        M"     U(       a&  [1        S T R&                  R(                   5       5      O	[1        5       nU(       a&  [1        S T R&                  R.                   5       5      O	[1        5       nSU 4S jjm[        T [2        5      (       a  [1        UU 4S jU 5       5      nXg-
  nXW-
  n0 nXV-   H  n	[5        U4S	 jX9    5       5      mU	[6        R8                  R:                  ;   a  [6        R8                  R:                  U	   n
O>U	[6        R8                  R<                  ;   a  [6        R8                  R<                  U	   n
OM      SUUU U4S
 jjmT" U
5      nX;  a  XU	'   M  X==   U-  ss'   M     U$ )a  
Counting the number of bytes accessed for a kernel is
surprisingly tricky. In particular, there is a differentiation
between 'theoretical' memory accesses and practical memory
accesses. For example, a layernorm kernel may actually access an
input 3 times, but in theory, it only needs to access its input
once (and may be optimized to do so through say, persistent
reductions)

Another example is that even though a buffer is passed in, we may
not access the entire buffer. This may occur if we are accessing
a slice of the buffer. Another tricky case is for indirect
indexing, where the amount of bytes accessed depends on the
values of the input.

What this function aims to compute is the memory accesses for
worst-case inputs, best-case optimization. What this means is
that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

1. Numel in ranges multiplied by number of deps the buffer has
2. The buffer size

Returns memory accesses per buffer.
c                R    [         R                  R                  R                  U SS9$ )Nr   r   )rS   rv   rw   r   )ss    rj   try_size_hintGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hint  s"    77##--a!-<<rr   r   r       eAc              3  8   #    U  H  oR                   v   M     g 7fra   r  r  s     rj   rk   CBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s     B+ACxx+Ar  c              3  8   #    U  H  oR                   v   M     g 7fra   r  r  s     rj   rk   r         C+BCxx+Br  c                   > TR                   R                  U    R                  n[        S U 5       5      n[	        U[        U5      -
  5      S:  $ )Nc              3  8   #    U  H  oR                   v   M     g 7fra   ro   )rh   r"  s     rj   rk   \BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>#  s     !>))r  r   )r  r6  r	  r   r   )r   r   r	  buf_usesr  s       rj   is_materializedIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized!  sG    NN..s399E!!>!>>Hx*V"44599rr   c              3  \   >#    U  H!  nT" UTR                   5      (       a  M  Uv   M#     g 7fra   r   )rh   r   r  r  s     rj   rk   r  '  s#      )%_S$++-Nvs   ,	,c              3  (   >#    U  H  nTv   M	     g 7fra   r   )rh   r   
node_numels     rj   rk   r  0  s     $R;QCZ;Qs   c                R  > U (       d  g[        U [        R                  5      (       a  U R                  5       $ [        U R                  [
        5      (       a  TR                  R                  U R                  5          R                  nSnU H  n[        UR                  [        5      (       d   e[        UR                  R                  [        5      (       a8  UR                  R                  5        H  nUT" UR                  5      -  nM     M    g   U$ [        U R                  [        R                  5      (       a#  [        U4S jU R!                  5        5       5      $ T	" [#        U R%                  5       5      5      n['        U R)                  5       5      [+        TU5      -  $ )Nr   c              3  n   >#    U  H*  nT" [         R                  R                  U5      5      v   M,     g 7fra   )rS   rv   
get_buffer)rh   mut_nameget_buf_bytess     rj   rk   ZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>R  s/      (@H &agg&8&8&BCC(@   25)rb   r&   TorchBindObjectr  r  r9   r  r6  r  r	  re   rX   r8   rv  r:   r   r-  rR   r  rH   	get_dtypemin)
r   r	  totr"  	sched_buf	buf_elemsbuf_accessed_elemsr  r  r  s
         rj   r  GBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes9  sG    c2#5#566,,..

,=>> !NN66s||~FLLEC %)$))5FGGGG%diinnkBB-1YY-B-B-D	 #}Y^^'D D .E $% !& J

BMM:: (+(>(>(@  
 !.mCLLN.K LI)#--/:S*I>  rr   )r  z
sympy.Exprr   r   )r   r   r   Sequence[BaseSchedulerNode]r   r   )r   z<Optional[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]r   r   )rb   rp  ExternKernelSchedulerNodere   r8   r&   rr  op_overloadr  _prims	rng_primsgraphsafe_run_with_rng_staterc   rR   
get_rangesr   collectionsr   r   r   r   r   r   r  r   r   r   rS   rv   r  graph_inputs)r  r  r  buf_accessesr   r   r  rn  buf_byte_accessesr1  r   	buf_bytesr  r  r  r  r  s   `           @@@@@rj   r  0BaseSchedulerNode.get_read_write_buffer_accesses  s   6 d233Id566:II{<
 <
 It677499b&7&788		%%||%%BBC I	= dM**&doo/23 1! 456J
 SJ"..t4''--XX&--c2 . ''..XX&--c2 /
  B4+;+;+A+ABB 	  C4+;+;+B+BCC 		:
 d.//( )%) O -F+E,.H!$$R<;Q$R!R177111gg,,X6QWW111gg**84!Q!! !F &c*I0.7(+!+y8+c 'f ! rr   c                ^   U R                   c  g U R                   R                  5       nUc  g [        U5      nUc  g [        U[        R
                  5      (       a  UR                   R                  n[        R                  R                  R                  USS9n[        S   S==   U-  ss'   U$ )Nr   r   inductor
flop_count)re   get_origin_noder4   rb   r  SymIntexprrS   rv   rw   r   r   )r  fx_nodeflopsresolved_flopss       rj   estimate_flops BaseSchedulerNode.estimate_flopsd  s    99))++-?w'=eU\\**JJOOE))33EA3F\*n<*rr   c                T    U R                   b  U R                   $ U R                  5       $ ra   )rh  _get_estimated_runtimer  s    rj   get_estimated_runtime'BaseSchedulerNode.get_estimated_runtimew  s)    **6222**,,rr   c                ,   U R                  5       S   R                  5       S   nUR                  R                  5       n[	        [        U5      5      (       d  g[        U R                  5      (       a  [        U R                  [        R                  5      (       d   e [        R                  (       av  [        U 5      n[        5       nUR                  U5      nUb  [        U[        5      (       d   eU$ [!        U 5      nUc  [#        U R                  5      nUR%                  X6S9  U$ [#        U R                  5      $ [/        U R                  5      (       a  g[1        U 5      nUb  U$ UR                  R3                  5       n	 [5        5       n
[7        U	5      S-  nU
S::  a  [9        SU
 35      eUS::  a  [9        SU 35      e U R=                  5       nUS:X  d  Uc  U R?                  5       U
-  nUS-  nU$ SnU R?                  5       nUc  SOUnX-  U-  S	-  nX-  n[A        UU5      nUS-  nU$ ! [&         a  n[(        R+                  U5         SnAgSnAf[,         a  n[(        R+                  U5         SnAgSnAff = f! [:         a     gf = f)
z3
Returns estimated op runtime in milliseconds (ms)
r   Nvaluel    J)z-gpu_memory_bandwidth cannot be <= 0, but got z"gpu_flops cannot be <= 0, but got g    .Ag      ?r  )!rn   rv  re   r.  rN   r6   rL   rb   r&   IRNoder$   ,runtime_estimations_use_nccl_lib_estimations)get_estimate_runtime_cache_key_from_snodeget_estimate_runtime_cachelookupfloatr-   r,   	set_value
ValueErrorr  r  	TypeErrorrQ    maybe_estimate_runtime_benchmarkmaybe_get_dtyperI   rG   AssertionErrorr  r  r  max)r  r   r  	cache_keycache	cache_valmsr   retdtypegpu_memory_bandwidth	gpu_flops	flops_estnsfactorcounted_bytescompute_timetransfer_times                     rj   r  (BaseSchedulerNode._get_estimated_runtime}  s   
 nnq!--/2))+of-.. ##dii3333LL I$ OI68E %Y 7I ,))U;;;;((HNBz=diiHOOIO8I7		BB TYY
 .t4?J((*	#4#6 )%069I $q($CDXCYZ  A~$'I)%UVV 
 '')	>Y.2247KKBcBI 99;*2*Y6#=%< }-#X	o    :  		sD   AH3 63H3 *H3 A J 3
J=IJ$I>>J
JJc                    g ra   r   r  s    rj   get_template_node#BaseSchedulerNode.get_template_node      rr   c                0    U R                  5       nUc   eU$ ra   r  )r  templates     rj   get_template_node_or_throw,BaseSchedulerNode.get_template_node_or_throw  s!    ))+###rr   c                `    [        S [        U 5       5       5      nU SU nX   nXS-   S nX#U4$ )zA
For the list of nodes, get the prologue, template, and epilogue
c              3  X   #    U  H   u  pUR                  5       (       d  M  Uv   M"     g 7fra   rI  )rh   ir#  s      rj   rk   CBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>  s     P,<DAaa,<s   *	*Nr   )next	enumerate)nodestemplate_indexprologuetemplate_nodeepilogues        rj   get_prologue_template_epilogue0BaseSchedulerNode.get_prologue_template_epilogue  sH     PIe,<PP.)-!+-.00rr   )r   rr  r]  rb  re   rd  rf  r   r  rk  rl  )r  r  r   rQ  )re   ir.Operationr   rQ  rN  )r   r   rP  r  r/   r  r/   r   r   r  ra  r   rQ  )r   r.   r   rQ  rR  )r  ri  r   rQ  r  r\  r  ra  r   rQ  r   r\  r  dict[str, BaseSchedulerNode]r   rQ  r   r  )r   zSequence[SchedulerBuffer])r1  r   r   r  rT  rS  zdependencies.Depr   r   T)r  rK   r  r   r   rQ  rO  )r  r   r  r   r   r   )r  r   r  r   r   zdict[str, int]r   z
int | Noner   r  r   zOptional[ir.TemplateBuffer])r   zir.TemplateBuffer)r  list[BaseSchedulerNode]r   zJtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]])@r   r   r   r   rV  re   rh  rl  rs  rx  r}  r#  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r	  r  r  rC   r   r  r&  r(   rn   rv  r2  r   r;  rN   rd   rC  rF  rI  rL  rO  rT  rW  rz  r  r  r  r  r  r  r  r  r  r  r  r   r  r   r   rr   rj   rX   rX     s   BB NN''$$#'D
 '""//266((''GT
#0B*2+#
!.7	
M>


=#2=HV=	=
6
GT">T	T
. H H B B 
 
 
 
.&;:@F 9=-$-15-	-^ 
 

 
 

 
 


!
37
	
J!!J!37J!	J!X  $- U Un
 1&1	S1 1rr   c                 R    [         R                  R                  R                  5       $ ra   )r  r  	codecache
LocalCacher   rr   rj   r  r    s    ??$$//11rr   c                  ^ [        U R                  SS5      nU R                  R                  nU R                  R                  / UQU R                  R                  QU R                  R
                  5      nU R                  R
                  n[        R                  " X#45      u  pESS jm[        U4[        U4S jU 5       5      -   5      nU$ )Npython_kernel_namer  c                    [        U [        R                  5      =(       a    [        U [        R                  5      (       + $ ra   )rb   r&   r  GeneratorStaterx  s    rj   _is_tensor_ir@get_estimate_runtime_cache_key_from_snode.<locals>._is_tensor_ir  s(    !RYY'P
1b>O>O0P,PPrr   c              3  t   >#    U  H-  nT" U5      (       a  [        UR                  5       5      OS v   M/     g 7fra   )r   r  )rh   ar1  s     rj   rk   <get_estimate_runtime_cache_key_from_snode.<locals>.<genexpr>   s+     U9a}Q'7'7ajjl#TA9s   58rR  )
r  re   inputsfill_non_provided_argsconstant_argsro  pytreetree_flattenr   r   )snoder-  r*  ro  	flat_argsflat_args_pytree_specr  r1  s          @rj   r  r    s     -A2F::D::,,*$*))*

D ZZF'-':':D>'J$IQ 	
U9U
U	VI rr   c                   [        U [        5      (       d  g [        R                  R                  R
                  [        R                  R                  R                  [        R                  R                  R                  S.n[        U R                  SS5      nX!;  a  g [        U R                  [        R                  5      (       d  g X   $ )N)zextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmr-  r  )rb   r  r  opsatenmmbmmaddmmr  re   r&   ExternKernel)r;  mms_fnsr-  s      rj   _get_mm_like_fnrF    s    e677"YY^^..#iinn00 %		 4 4G
 !-A2F(ejj"//22&&rr   c           	     d  ^ ^ S nS n[         R                  (       a  [        T 5      nUc  g UnU U4S jnOg [        T 5      n[	        5       nUR                  U5      nUb  [        U[        5      (       d   eU$ SSKJ	m  U" 5       u  pxSSK
Jn	  U	R                  UUUSSSS9n
UR                  XJS	9  U
$ )
Nc                    > T" T 5      $ ra   r   )r;  snode_args_kwargss   rj   rp  2maybe_estimate_runtime_benchmark.<locals>.<lambda>  s    !25!9rr   r   )rI  r   )benchmarker   
   )memory_warmup_itersbenchmark_itersmax_benchmark_durationr  )r#   !runtime_estimations_mms_benchmarkrF  r  r  r  rb   r  utilsrI  $torch._inductor.runtime.benchmarkingrK  	benchmarkr  )r;  bench_fnargs_kwargs_fnmm_fnr  r  r  r*  ro  rK  r  rI  s   `          @rj   r  r    s    HN//&=99%@I&(EY'I)U++++(!#LD@			! 
 
B 
OOIO(Irr   T)slotsc                  \    \ rS rSr% S\S'   S\S'   S\S'   S\S'   SS jrSS	 jrSS
 jrSrg)	WhyNoFusei;  r   name1name2reasonztuple[Any, ...]r*  c                X    UR                  5       U l        UR                  5       U l        g ra   )r  r[  r\  r  r   r   s      rj   rs  WhyNoFuse.__init__B  s    ^^%
^^%
rr   c                F    Xl         X l        [        R                  U 5        g ra   )r]  r*  
fusion_logdebug)r  r]  r*  s      rj   __call__WhyNoFuse.__call__F  s    	rr   c                p    SU R                    SU R                   S3U R                  U R                  -  -   $ )Nzcannot fuse z with r  )r[  r\  r]  r*  r  s    rj   __str__WhyNoFuse.__str__K  s6    djj\

|2>KK$))#
 	
rr   )r*  r[  r\  r]  Nr   rX   r   rX   r   rQ  )r]  r   r*  r   r   rQ  rN  )	r   r   r   r   rV  rs  rd  rg  r   r   rr   rj   rZ  rZ  ;  s&    JJK
&

rr   rZ  c                    [        U [        [        45      (       a  [        U [        S9n [
        R                  " U SS9nSU;   a  S[        R                  " US5       3$ U$ )Nkey   )r  r      )	rb   r   setsortedr   pprintr  textwrapr  )objr!  s     rj   r  r  Q  sU    #
C())Sc"^^C*Fv~HOOFG4566Mrr   c                  @    \ rS rSrSS jrS	S jrS
S jrSS jr\rSr	g)r>  i[  c                &    [        U/5      U l        g ra   r  r  s     rj   rs  OutputNode.__init__\  s    ",cU"3rr   c                    gr  r   r  s    rj   rd   OutputNode.is_reduction_  rA  rr   c                    g)Nr   r   r  s    rj   r,  'OutputNode.get_inputs_that_alias_outputb  r  rr   c                    g)NOUTPUTr   r  s    rj   r  OutputNode.get_namee  s    rr   )rk  N)r   r0   r   rQ  rR  rS  rN  )
r   r   r   r   rs  rd   r,  r  r}  r   r   rr   rj   r>  r>  [  s    4 Hrr   r>  c                  ^ ^^^^ [         R                  " 5       mT R                   HU  n[        U[        5      (       a  M  TUR
                     R                  5       nTTU   R                  5       ==   S-  ss'   MW     SUUUU 4S jjm[        U4S jT R                   5       5      nU(       a?  T R                  U-
  T l        T R                  T R                  R                  U5      5        gg)aU  
Prunes weakdeps intended for mutation ordering
on an upstream fused node if after fusion there is another dependency
on the fused upstream node, making the weakdep redundant

In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
be incrementally removed, enabling other fusions, ensuring they are fused in order.
r   c                  > [        U [        5      (       ap  TU R                     R                  5       nTTU   R	                  5          S:  =(       a     TR
                  R                  U TU   T5      nTU   T:H  nU=(       d    U$ g)Nr   F)rb   r1   r   r  r  r  fusable_weak_dep)r   r  is_redundantis_self_depr6  name_to_dep_countr  re   s       rj   r  +_prune_redundant_deps.<locals>.should_prune  s    c7##!#((+<<>G,"7+446 nn55'0$  -W5=K.;.rr   c              3  F   >#    U  H  nT" U5      (       d  M  Uv   M     g 7fra   r   r  s     rj   rk   (_prune_redundant_deps.<locals>.<genexpr>  s      .,s2C.r   Nr  )r  r   rk  rb   r1   r   r  r  r   r  r   r  )re   r  r6  r   r  deps_to_pruner  r  s   ```   @@rj   r  r  k  s     '2&9&9&;&&#w''!#((+<<>G09BBDEJE '
    .. M "&"9"9M"IT--::=IJ rr   c                  J   ^  \ rS rSrSU 4S jjrSS jrS	S jrS	S jrSrU =r	$ )
r  i  c                   > [         TU ]  U5        U R                  U5        U R                  UR	                  5       5        g ra   superrs  rx  r  get_read_writesr  r  re   	__class__s      rj   rs  "ExternKernelSchedulerNode.__init__  5    #T"T1134rr   c                V    U R                  5        S[        U R                  SS 5       3$ )Nz.node.kernel = r-  )r  r  re   r  s    rj   r  )ExternKernelSchedulerNode.debug_str_extra  s*    --/"/'$))EY[_2`1abbrr   c                    gNTr   r  s    rj   rL  #ExternKernelSchedulerNode.is_extern  r  rr   c                    U R                   c   e[        U R                   S5      =(       a    U R                   R                  5       $ )NrW  )re   r2  rW  r  s    rj   rW  *ExternKernelSchedulerNode.has_side_effects  s6    yy$$$tyy"45V$)):T:T:VVrr   r   r  r  re   r  r   rQ  rN  rR  )
r   r   r   r   rs  r  rL  rW  r   __classcell__r  s   @rj   r  r    s    5
cW Wrr   r  c                  ,   ^  \ rS rSrSU 4S jjrSrU =r$ )rp  i  c                   > [         TU ]  U5        U R                  U5        U R                  UR	                  5       5        g ra   r  r  s      rj   rs  NopKernelSchedulerNode.__init__  r  rr   r   r  )r   r   r   r   rs  r   r  r  s   @rj   rp  rp    s    5 5rr   rp  c                    ^  \ rS rSr% SrS\S'   S\S'         S#U 4S jjr  S$     S%S jjr  S$     S&S	 jjr      S'S
 jr	S(S jr
S)S jrS*S jrS)S jr      S+S jrS)S jr      S,S jrS-S jrS.S jrS/S jrS/S jrS/S jrS/S jrS0S jrS1S jr    S2S jrS3S jr S4   S5S jjr\S6S j5       r\S6S j5       rS7S jr\S8S  j5       r \S/U 4S! jj5       r!S"r"U =r#$ )9rc   i  zi
A SchedulerNode is a node for scheduling that encapsulates either
a ComputedBuffer or a TemplateBuffer.
z tuple[Sequence[sympy.Expr], ...]_sizesr;   r   c                f   > [         TU ]  U5        U R                  U5        U R                  5         g ra   )r  rs  rx  _compute_attrsr  s      rj   rs  SchedulerNode.__init__  s,    
 	#T"rr   c                   [        U R                  [        R                  [        R                  45      (       d   eU R                  R                  UUS9u  U l        nX0l        U R                  R                  5       nU R                  R                  U5      R                  nXE" U R                  5      4U l        [        R                  (       + =(       d    [        UR                   5      (       + n[        U R                  [        R                  5      (       a)  U R#                  U R                  R%                  US95        g U R#                  [&        R$                  " U R                  /U R                  Q7SU065        g )Nextra_indexing_constraintsrecompute_sizes_body_func)	normalizer  )rb   re   r&   r   TemplateBuffersimplify_and_reorderr  r   get_device_or_errorr  get_backendgroup_fnr{   r#   loop_ordering_after_fusionrN   r   r  extract_read_writesr%   )r  r  r  bodyr:  r  should_normalizes          rj   r  SchedulerNode._compute_attrs  s;   
 $))b&7&79J9J%KLLLL II::'A&? ; 
T 
..0>>--f5>>ht{{34
  &@@@ 
KKI
 E
 dii!2!233  		--8H-I   00JJ!%8Hrr   c                $    U R                  UUS9  g )Nr  )r  )r  r  r  s      rj   recompute_size_and_body%SchedulerNode.recompute_size_and_body  s    
 	'A&? 	 	
rr   c                   [        S U R                  R                   5       5      nU R                  [        R
                  " U R                  /U R                  Q7SU06R                  U5      R                  U R                  5      5        U R                  R                  U 5        U(       a!  SSKJn  UR                  R!                  5         g g )Nc              3  `   #    U  H$  n[        U[        [        45      (       d  M   Uv   M&     g 7fra   )rb   r1   r0   r  s     rj   rk   5SchedulerNode.refresh_dependencies.<locals>.<genexpr>  s$      0
1CZgwEW5XCC1s   .	.r  r   SIMDScheduling)r   r   r   r  r%   r  r   r  r  r  rb  pointwise_read_writesclear_cachecodegen.simdr  candidate_tilingscache_clear)r  r  need_clear_tiling_cache	fake_depsr  s        rj   refresh_dependencies"SchedulerNode.refresh_dependencies  s    
 &0 0
++110
 &
	 	,,

![[4= Yy!VD))*	
 	""..t4"4 ,,88: #rr   c                    U R                   R                  U5      U l         U R                   R                  U l        U R	                  SSS9  g )NFTr  r  )r   reorder_iter_loopssizesr  r  )r  	new_orders     rj   apply_new_loop_order"SchedulerNode.apply_new_loop_order  sA    ZZ22

 jj&&!!E4!Prr   c                   U R                   R                  5       n[        U R                   R                  5      U-
  n[	        [        U5      5      n[	        [        X"U-   5      5      nU R                  XC-   5        [        U R                  S   5      S:X  d   eU R                  S   U R                  S   S   U R                  S   S   44U l        g )Nr   r   r   )r   get_original_num_rdimsr   	iter_varsr   ranger  r{   )r  	num_rdims
num_pwdimspwdimsrdimss        rj   swap_pw_red_dimension#SchedulerNode.swap_pw_red_dimension  s    JJ557	--.:
uZ()eJY(>?@!!%.14::a=!Q&&&ZZ]TZZ]1%5tzz!}Q7G$HH
rr   c                D    U R                   R                  5       U l         U $ ra   )r   extract_pw_from_reductionr  s    rj   r  'SchedulerNode.extract_pw_from_reduction  s    ZZ99;
rr   c                   [         R                  U 5      (       d  g [        U R                  [        R
                  5      (       d   eU R                  R                  5          U R                  5         S S S 5        g ! , (       d  f       g = fra   )r]   rp   rb   re   r&   r   with_original_inner_fnr  r  s    rj   cancel_reduction_split$SchedulerNode.cancel_reduction_split!  s[     33D99$))R%6%67777YY--/! 0//s   !A;;
B	c                   [        U R                  [        R                  [        R                  45      (       d   eU R
                  R                  X5      U l        U R
                  R                  U l        U R                  R                  5       nU R                  R                  U5      R                  nX4" U R                  5      4U l        U R                  SSS9  g )NTr  )rb   re   r&   r   r  r   #expand_dimension_for_pointwise_noder  r  r  r  r  r  r{   r  )r  	dimension	new_ranger:  r  s        rj   r  1SchedulerNode.expand_dimension_for_pointwise_node(  s     $))b&7&79J9J%KLLLLZZCC

 jj&&..0>>--f5>>ht{{34
 	!!D$!Orr   c                    U R                   R                  5       U l         U R                   R                  U l        U R	                  SSS9  g )NTFr  )r   merge_loopsr  r  r  r  s    rj   r  SchedulerNode.merge_loops9  s<    ZZ++-
jj&& 	!!D%!Prr   c                   S nU R                   S   n[        U5      UR                  s=:X  a  UR                  :X  a  O  OUR                  U5      nU(       aP  [        =R
                  S-  sl        [        R                  SU R                  5       U5        U R                  U5        g[        R                  SU R                  5       5        g)Nr   r   z"Reorder loops for %s with order %sTzEDon't reordering %s because we can not decide the suitable loop orderF)
r  r   num_varsdecide_loop_order_to_matchr'   num_loop_reorderingloop_ordering_logrc  r  r  )r  r  r  r  
self_sizess        rj   r  'SchedulerNode.reorder_loops_by_dep_pairE  s     	[[^
z?h//E93E3EE ;;IFI''1,'##4dmmoy %%i0##W rr   c                N   U R                  5       nU SU R                  S    3U SU R                  S    3U SU R                   3/nU R                  R	                  5        H  n[        U[        5      (       a  M  UR                  n[        R                  R                  U5      n[        U[        R                  5      (       a  Mf  UR                  U S[        UR                  5       35        M     [        U R                   ["        5      (       aS  UR                  SU S35        UR                  [$        R&                  " U R                   R)                  5       S	5      5        U R*                  c   eUR-                  U R/                  5       5        S
R1                  U5      $ )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:rn  r  )r  r{   r  r   r  rb   r1   r   rS   rv   r  r&   r  r   r  r  r   r;   rr  r  r#  re   r  r  join)r  r   linesr   r1  r   s         rj   r  SchedulerNode.debug_str_extra\  sM   }}f$TZZ]O4f'

17fIdkk]+

 ##446Cc7++88gg((2!#r'9'9::LLH:Z

8K7L!MN 7 djj(++LL6${34LL)=)=)?HIyy$$$T//12yyrr   c                    U R                   $ ra   )r  r  s    rj   r  SchedulerNode.get_rangesr      {{rr   c                d   [        U R                  [        R                  [        R                  45      (       d   S[        U R                  5      < 35       e[        U R                  R                  5       5      =(       a0    U R                  S L =(       d    U R                  R                  (       + $ Ntype(self.node)=)
rb   re   r&   r   r  r   r   r   r   has_partial_accumulater  s    rj   rd   SchedulerNode.is_reductionu  s    $))b&7&79J9J%KLL 	
tDII !	
L DII0023 
JJ$Gdjj&G&G"G	
rr   c                    [        U R                  [        R                  5      (       d   S[	        U R                  5      < 35       eU R                  R                  5       S:H  $ )Nr  dot)rb   re   r&   r   r   r   r  s    rj   rC  SchedulerNode.is_native_matmul  sM    $))R%6%677N<LDO;M9NN7yy++-66rr   c                b   [        U R                  [        R                  [        R                  45      (       d   S[        U R                  5      < 35       e[        U R                  [        R                  5      =(       a.    [        U R                  R                  [        R                  5      $ r  )rb   re   r&   r   r  r   r   	SplitScanr  s    rj   rF  SchedulerNode.is_split_scan  s|    $))b&7&79J9J%KLL 	
tDII !	
L $))R%6%67 
JIINNBLL=
 	
rr   c                J    [        U R                  [        R                  5      $ ra   rb   re   r&   r  r  s    rj   rI  SchedulerNode.is_template  s    $))R%6%677rr   c                p    [        U R                  [        R                  5      (       a  U R                  $ S $ ra   r  r  s    rj   r  SchedulerNode.get_template_node  s'    &tyy"2C2CDDtyyN$Nrr   c                f    U R                  5         U R                  5         U R                  U5        g ra   )rz  r  rj  )r  
index_varss     rj   runSchedulerNode.run  s#    ""$Z rr   c                (   U R                   n[        [        [        U5      5      [        [        [        U5      5      :X  d   e[	        [        [        R                  R                  U5      [        R                  R                  U5      5      5      nU$ ra   )	r  r   mapr   dictzipr  r  from_iterable)r  r  r  r   s       rj   ranges_from_index_vars$SchedulerNode.ranges_from_index_vars  sp     3sE?#s3sJ+?'@@@@--j9--e4

 rr   c                   U R                  U5      n [        R                  " [        [        R                  " 5       U5      5         [        R
                  R                  U 5         U R                  " U6   SSS5        SSS5        g! , (       d  f       N= f! , (       d  f       g= f! [         a"    [        R                  SU R                  5        e f = f)a  
Generate code for this node using the provided index variables.

This method sets up the appropriate context for code generation, including
simplifying indexing expressions based on the variable ranges, and then
calls the node's body function with the index variables.

Args:
    index_vars: A sequence of sequences of sympy expressions representing
                the index variables for each dimension of the computation.
NzError in codegen for %s)r
  rS   set_ops_handlerrA   get_ops_handlerr3  set_current_noder   r  r  fatalre   )r  r  r   s      rj   rj  SchedulerNode.codegen  s     00<
	!!"213D3D3F
"ST))$/

J' 0 UT// UT  	II/;	sA   3B)  B&B6B>B) 
B	B
B&"B) &B) ),Cc                    U(       a  U R                   O[        U R                   5      u  p#[        R                  " U R                  U[
        R                  R                  /[        U5      -  /S9$ )zL
Get the memory dependencies in either the pointwise or the reduction axes.
)hidden_args)	r  r   r%   r  r   r   SZeror   )r  	pointwise
keep_sizesignore_sizess       rj   "pointwise_or_reduction_read_writes0SchedulerNode.pointwise_or_reduction_read_writes  sR     3<4;;$++AV 
//JJ
%'',,#lBS1S0T
 	
rr   c                     U R                  SS9$ )z8
Get the memory dependencies in the non-reduction axes.
Tr  r  r  s    rj   r  #SchedulerNode.pointwise_read_writes  s    
 666FFrr   c                     U R                  SS9$ )z4
Get the memory dependencies in the reduction axes.
Fr  r  r  s    rj   reduction_read_writes#SchedulerNode.reduction_read_writes  s    
 666GGrr   c                (   U R                  5       (       a  g[        S U R                  5        5       5      (       a  g[        U R                  R
                  5      S:X  a  [        U[        R                  5      (       a  [        [        U R                  R
                  5      5      n[        U[        R                  5      (       d   S[        U5      < 35       eUR                  UR                  :H  =(       a    UR                  UR                  :H  $ g)NFc              3  @   #    U  H  oR                  5       v   M     g 7fra   )r  r  s     rj   rk   ,SchedulerNode.can_inplace.<locals>.<genexpr>  s     ?,>S  ,>r  r   ztype(write_dep)=)rI  r   rv  r   r   r  rb   r%   r/   r  iterr   r   r   )r  rS  	write_deps      rj   rT  SchedulerNode.can_inplace  s    ?D,<,<,>???t&&'1,l,,2
 2
 T$"2"2"9"9:;Ii)?)?@@WEUT)_DVBWW@>>Y__4X)..9XXrr   c                8   [        5       n[        U R                  [        5      (       a  U R                  R	                  5        H  nUR
                  S:X  d  M  UR                  S:X  d  M'  SUR                  ;   a  UR                  S   S:X  d0  [        UR                  5      S:X  d  Me  UR                  S   S:X  d  Mz  UR                  SUR                  ;   a  UR                  S   O)[        UR                  5      S:  a  UR                  S	   OS
5        M     U$ )Ncall_methodstoremode
atomic_addrL  rm  r   r   r   r  )r   rb   r   r;   rn   r  r  ro  r   r*  r  )r  buffers_store_as_atomic_addre   s      rj   _get_atomic_add_buffers%SchedulerNode._get_atomic_add_buffers  s    7A|#djj(++

,,.GG},w.4;;.4;;v3F,3V		Na/DIIaLL4P 033!T[[0 F+.1$))n.Adiilr / +*rr   c                |   > U R                   b!  U R                   R                  S5      (       a  g[        TU ]  5       $ )Ndevice_assert_asyncT)r   has_opr  rW  r  r  s    rj   rW  SchedulerNode.has_side_effects  s5     ::!djj&7&78M&N&Nw'))rr   )r   r  r{   )r  r  re   z+Union[ir.ComputedBuffer, ir.TemplateBuffer]r   rQ  NN)r  *Optional[tuple[dict[Any, Any], list[Any]]]r  zOptional[Callable[_P, _T]]r   rQ  )r  r6  r  zOptional[Callable[..., Any]]r   rQ  )r  r   r  r   r   rQ  )r  Sequence[int]r   rQ  rP  r   rX   )r  r   r  r   r   rQ  r  rN  )r   Sequence[Sequence[sympy.Expr]]rR  r'  )r  Sequence[sympy.Expr]r   rQ  )r  r9  r   zdict[sympy.Expr, sympy.Expr])r  r9  r   rQ  r$  )r  r   r   ri  )r   ri  r#  r  )$r   r   r   r   r   rV  rs  r  r  r  r  r  r  r  r  r  r  r  r  rd   rC  rF  rI  r  r  r
  rj  r  rC   r  r   rT  r.  rW  r   r  r  s   @rj   rc   rc     s   
 -,O : 
	 RV@D$N $> 
	F RVBF
$N
 $@
 
	
;;8<;	;<QI"PP),P	P"
Q!.7	. ,
7
8O!
8	%0 !%	
	
	 	
 G G H H + +& * *rr   rc   c           	     z  ^  T R                   nT R                  [        R                  R	                  U Vs/ s H  o"R
                  PM     sn5      5        [        U 4S j[        R                  " U Vs/ s H  o"R                  PM     sn6  5       5      T R
                  R                  -
  T l        g s  snf s  snf )Nc              3  h   >#    U  H'  nUR                   TR                  5       ;  d  M#  Uv   M)     g 7fra   r   r  )rh   r   group_snodes     rj   rk   2refresh_group_node_dependencies.<locals>.<genexpr>  s/      
Pxx{;;== CP   "2	2)
r   r  r%   
ReadWrites
merge_listr   r   unionrk  r  )r>  r   rx  s   `  rj   refresh_group_node_dependenciesrD    s     F**6+J6aMM6+JK
 	 
!'')O1*>*>)OP
 	

 
!
!
(
(	) " ,K *Ps   B34B8r  c                   [        U [        [        45      (       d   eX l        Xl        S U l        [        R                  " U Vs/ s H  o3R                  c  M  UR                  PM     sn6 U l        [        U 5        [        S U R                   5       5      U l        [        S U R                   5       5      U l        U R                  5        Vs0 s H  oDR                  5       U_M     snU l        g s  snf s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7fra   r^  rh   rx  s     rj   rk   "init_group_node.<locals>.<genexpr>        H5G5Gr  c              3  8   #    U  H  oR                   v   M     g 7fra   )r_  rH  s     rj   rk   rI  !  rJ  r  )rb   r   GroupedSchedulerNoder   r  re   r   rC  r   rD  r  r^  r  r_  rv  r  rf  )r>  r  r   rx  r   s        rj   init_group_noderM    s    
 k$68L#MNNNN%K&,,%	Av!+!++v	AK $K0H[5G5GHHKH[5G5GHHK'2'>'>'@#'@'@#K 
B#s   C4C4C9c                    ^  \ rS rSr% SrS\S'   \      S#S j5       rS$S jrS%S jr	\
S&S j5       r      S'S	 jrS(U 4S
 jjr\
S)S j5       rS)S jr\
S*S j5       rS+S jrS)S jrS)S jr      S,U 4S jjr\
S*S j5       r\
S*S j5       rS-S jrS)S jr\
S.S j5       r\
S.S j5       r\
S.S j5       r\
S.S j5       r\
S/S j5       rS0S jr\
S.S j5       rS1S jr S2S jr!S3S jr"S)S  jr#\
S.U 4S! jj5       r$S"r%U =r&$ )4r   i'  z
This is a "fake" scheduler node that represents a group of scheduler nodes
that are meant to be fused together. The way it does this is by maintaining
its unmet dependencies as the union of its constituent nodes.
r(  r   c           	        UR                   UR                   L d   e[        U[        [        45      (       d   eUR	                  5       (       Ga  [        U[
        5      (       Ga  [        UR                  [        5      (       d   e[        UR                  R                  5      S:X  d   e[        [        [        UR                  R                  5      5      [        5      (       d   e[        [        UR                  R                  5      5      R                  nUR                  5        Vs/ s H  oDR	                  5       (       d  M  UPM     nn[        U5      S:X  d   eUS   n[        UR                  R                  5      S:X  d   e[        [        UR                  R                  5      5      n[        U[         5      (       d   e[#        [!        X7R$                  UR&                  UR(                  UR*                  5      /5      UR                  l
        O[        U[        [        45      (       d   e[-        [.        R0                  " UR                  5       UR                  5       5      5      nU " UR                   U5      $ s  snf )Nr   r   )r  rb   rc   r   rI  r  re   r8   r   r   r  r  r%  r0   r   rn   r/   r   r   	var_namesr   r+  r   r  r  )	r|   r   r   r   re   template_nodesr  writer  s	            rj   fuseFusedSchedulerNode.fuse0  s    %//111%-1C!DEEEE:e5N#O#O ejj+6666u((//0A555d4(9(9(@(@#ABGLLLLU..5567<<D/4/@W/@tDTDTDVd/@NW~&!+++*1-M}00778A===m77>>?@EeY////'1kk5??EJJ

(E$ em5G%HIIIIY__U__%68IJK5??E**! Xs   ,JJc                    U R                    HA  n[        U[        5      (       d   eUR                  5       (       d   eUR	                  5         MC     U $ ra   )r   rb   rc   rd   r  r  ri   s     rj   r  ,FusedSchedulerNode.extract_pw_from_reductionR  sK    {{Gg}5555''))))--/ # rr   c                x    U R                    H*  n[        U[        5      (       d   eUR                  5         M,     g ra   )r   rb   rc   r  rV  s     rj   r  (FusedSchedulerNode.swap_pw_red_dimensionY  s/    {{Gg}5555))+ #rr   c                    [        [        S S U R                  5        5       5      5      n[        U5      S:X  a  g [	        U5      nU$ )Nc              3     #    U  HA  nUR                  5       (       d  UR                  5       (       d  M/  UR                  5       v   MC     g 7fra   rI  rL  r  r  s     rj   rk   4FusedSchedulerNode.estimate_flops.<locals>.<genexpr>d  =       0''))T^^-= *D'')) 0
   .AAr   r   filterrn   r   r   r  fpsr  s      rj   r  !FusedSchedulerNode.estimate_flops^  K      $ 0	
 s8q=#h
rr   c                   U R                  5       (       a  gSnU R                   Hh  n[        U[        5      (       d   eUb<  [	        U5      [	        UR
                  S   5      :w  a  [        R                  S5          gUR
                  S   nMj     SnUc   e[        U5      UR                  s=:X  a  UR                  :X  a  O  OUR                  U5      nU(       d%  [        R                  SU R                  5       5        g[        =R                  S-  sl        [        R                  SU R                  5       U5        U R                   H+  n[        U[        5      (       d   eUR                  U5        M-     [        U 5        g)	z0
Return true if a loop reordering is performed.
FNr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %sT)rI  r   rb   rc   r   r  r  rc  r   r  r  r  r'   r  r  rD  )r  r  r  r  r;  r  s         rj   r  ,FusedSchedulerNode.reorder_loops_by_dep_pairp  sK    
[[Ee]3333%%
*;uU\\RS_?U*U!''G aJ ! 	%%%z?h//E93E3EE ;;IFI##a ##q(#;T]]_i	
 [[Ee]3333&&y1 ! 	(-rr   c                ~   > [         TU ]  U5        [        XU5        / U l        [	        US S9R
                  U l        g )Nc                4    [        U R                  5       5      $ ra   )r   rd   r0  s    rj   rp  -FusedSchedulerNode.__init__.<locals>.<lambda>  s    s1>>3C/Drr   rk  )r  rs  rM  r	  r  r{   )r  r  r   r  s      rj   rs  FusedSchedulerNode.__init__  s6    #0%'
%DEKK
rr   c                ~    SR                  U R                   Vs/ s H  oR                  5       PM     sn5      $ s  snf N_r  r   r  r  rx  s     rj   r  FusedSchedulerNode.get_name  +    xxt{{;{!{;<<;   :c                <    U R                   S   R                  5       $ r   r   r  r  s    rj   r  !FusedSchedulerNode.get_first_name      {{1~&&((rr   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf ra   r   rC  r   r  rp  s     rj   r  #FusedSchedulerNode.get_buffer_names  0    !L1"4"4"6!LMM!L   <c                n    / nU R                    H"  nUR                  UR                  5       5        M$     U$ ra   r   r  rv  r  r!  re   s      rj   rv  FusedSchedulerNode.get_outputs  /    (*KKDMM$**,-  rr   c           
        [        U R                  5       VVs/ s H+  u  pU R                  5        SU SUR                  5        3PM-     nnnU R                  S   R                  nUb  UR                  U R                  5       5        [        R                  " SR                  U5      R                  5       S5      $ s  snnf )Nz.snodes[z] =
r   r  rn  )r  r   r  r#  re   r  r  rr  r  r  r  )r  r  re   r  s       rj   r  "FusedSchedulerNode.debug_str_extra  s     %T[[1
1 }}xs%0@/AB1 	 
 {{1~""LL3356tyy/668&AA
s   2B=c                l    U R                    Vs/ s H  oR                  5       PM     nnU  SU 3$ s  snf )Nz
, snodes: )r   r  )r  re   
snodes_strs      rj   r  "FusedSchedulerNode.debug_str_short  s8    9=E**,
Ez*.. Fs   1c                   > [         TU ]  X5        [        5       n[        U R                  5       H/  nUR                  X5        UR                  UR                  5        M1     g ra   )r  r  r   r   r   updater]  )r  r  r  re   r  s       rj   r  !FusedSchedulerNode.set_last_usage  sQ    
 	2G 0:|T[[)D 3H&&t7 *rr   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf ra   )r   rC  r   r   rp  s     rj   r   $FusedSchedulerNode.used_buffer_names  s0    !MA"5"5"7!MNN!Mr|  c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf ra   )r   rC  r   r  rp  s     rj   r  /FusedSchedulerNode.used_or_aliased_buffer_names  s5    8<D1,,.D
 	
Dr|  c                    U R                   $ ra   r  r  s    rj   rn   FusedSchedulerNode.get_nodes  r  rr   c                T    [        U 5      R                   SU R                  5        S3$ )Nz(nodes=r{  r|  r  s    rj   r}  FusedSchedulerNode.__repr__  s'    t*%%&gdmmo->a@@rr   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fra   )rd   rH  s     rj   rk   2FusedSchedulerNode.is_reduction.<locals>.<genexpr>  s     9[>>##[r  r   r   r  s    rj   rd   FusedSchedulerNode.is_reduction  s    9T[[999rr   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fra   )rC  rH  s     rj   rk   6FusedSchedulerNode.is_native_matmul.<locals>.<genexpr>  s     =A%%''r  r  r  s    rj   rC  #FusedSchedulerNode.is_native_matmul  s    ====rr   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fra   )rF  rH  s     rj   rk   3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>  s     :k??$$kr  r  r  s    rj   rF   FusedSchedulerNode.is_split_scan  s    :dkk:::rr   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fra   r  rH  s     rj   rk   1FusedSchedulerNode.is_template.<locals>.<genexpr>  s     8Kq==??Kr  r  r  s    rj   rI  FusedSchedulerNode.is_template  s    8DKK888rr   c                x    U R                    H*  nUR                  5       (       d  M  UR                  5       s  $    g ra   )r   rI  r  r  re   s     rj   r  $FusedSchedulerNode.get_template_node  s3    KKD!!--//   rr   c                     U R                   S   $ r   )r{   r  s    rj   r   FusedSchedulerNode.get_device  s    zz!}rr   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fra   )r  rH  s     rj   rk   >FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s     EA--//r  r  r  s    rj   r  +FusedSchedulerNode.has_aliasing_or_mutation  s    EEEErr   c                    [         era   NotImplementedError)r  r  s     rj   r  'FusedSchedulerNode.update_mutated_names      !!rr   c                    [         era   r  )r  r   s     rj   r  FusedSchedulerNode.add_fake_dep   r  rr   c                    [         era   r  rR  s     rj   rT  FusedSchedulerNode.can_inplace  r  rr   c                X   U R                  5       nSR                  S U R                   5       5      n[        5       nUR	                  U S[        U 5      R                   SU SU S[        U R                  R                  5       SU S[        U R                  5       SU S	[        U R                  R                  U R                  -
  5       SU S
35        UR                  5          U R                  5        H"  nUR	                  UR                  5       5        M$     SSS5        UR                  S5         UR	                  U R!                  5       5        UR)                  5       R+                  5       $ ! , (       d  f       N]= f! ["         a    [$        R'                  SSS9   NOf = f)r  r  c              3  L   #    U  H  n[        U5      R                  v   M     g 7fra   )r   r   r"  s     rj   rk   /FusedSchedulerNode.debug_str.<locals>.<genexpr>	  s     F+QQ 0 0+s   "$r  r  r  r  r  r  r  z.outputs = [
            Nr  r  Tr  )r  r  r   rK   r  r   r   r  r   r  rk  r   r  rv  r#  r  r  r  r  r  r   r  )r  r   node_typestrr   r   s        rj   r#  FusedSchedulerNode.debug_str  sx   }}xxF$++FF

bd		Q|n -j))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 	
 ZZ\'')

3==?+ *  	c	HJJt++-.  '')) \  	HKK7$KG	Hs   )7E7:F 7
FF)(F)c                r   > U R                   b  [        S U R                    5       5      $ [        TU ]  5       $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fra   )rW  r  s     rj   rk   6FusedSchedulerNode.has_side_effects.<locals>.<genexpr>#  s     G;4,,..;r  )r   r   r  rW  r3  s    rj   rW  #FusedSchedulerNode.has_side_effects   s0    ;;"G4;;GGGw'))rr   )r{   r	  r   rX   r   rX   r   r   r8  rP  r%  r  )r  r  r   r(  r   rQ  rN  r  r   rc  r  r"  rR  r'  )r   torch.devicer  )r   r.   r   rQ  r#  )'r   r   r   r   r   rV  r   rS  r  r  rC   r  r  rs  r  r  r  rv  r  r  r  r   r  rn   r}  rd   rC  rF  rI  r  r   r  r  r  rT  r#  rW  r   r  r  s   @rj   r   r   '  s    $#+%+.?+	+ +B,
  "(!(.7(	(TL = =) N N	B/8#28HV8	8 O O 
 

A : : > > ; ; 9 9   F F
"""*4 * *rr   r   c                  V   ^  \ rS rSrSU 4S jjr      SS jrS	S jrS	S jrSrU =r	$ )
FusedMixOrderReductionsi'  c                   > Xl         X l        [        TU ]  UR                  [        UR                  5       5      [        UR                  5       5      -   5        [        R                  U R                   5      U l	        g ra   )
r   r   r  rs  r  r   rn   r]   r   numel)r  r   r   r  s      rj   rs   FusedMixOrderReductions.__init__(  sX    

OOT%//"34tEOO<M7NN	
 '00<
rr   c           	        [        U[        5      (       a   e[        U[        5      (       a   eU R                  R                  XSS9(       d  gSS jn    SS jnU(       a/  U" X45      U" U5      -  (       d  U" U5      U" X45      -  (       a  gUR	                  5       (       + =(       d@    [
        R                  " [        U R                  R                  XSS95      U R                  :  $ )z
node1 is from the current mix order reduction; node2 is another node we want to fuse in.

other_nodes are passed in to check if fusion will introduce producer/consumer relationship
between the inner and outer reduction. If yes, we don't fuse.
Fallow_mix_order_reductionc                B    [        5       nUR                  " S U  5       6 $ )Nc              3  8   #    U  H  oR                   v   M     g 7fra   )r   r"  s     rj   rk   TFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestors.<locals>.<genexpr>G  s     :Eq{{Er  r   rC  r  r   s     rj   _get_ancestorsAFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestorsE  s    ,C99:E:;;rr   c                B    [        5       nUR                  " S U  5       6 $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7fra   )r   r"  s     rj   rk   ZFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_names.<locals>.<genexpr>M  s     F14466r  r  r  s     rj   _get_operation_namesGFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_namesI  s      ,C99FFGGrr   )count_bytes)r  tuple[BaseSchedulerNode, ...]r   r\  )
rb   r  r  r   rd   typingcastr   score_fusion_memoryr  )r  r   r   other_nodesr  r  s         rj   sub_node_can_fuse)FusedMixOrderReductions.sub_node_can_fuse0  s     e%<====e%<====
 ~~&&uu&U	<	H0	H	H ~.1Ek1RR{+.BE>.RR ""$$ {{T^^77RW7X zz	
rr   c                   [        U[        5      (       dU  U R                  U R                  XR                  45      =(       d'    U R                  U R                  XR                  45      $ U R                  U R                  UR                  U R                  UR                  45      =(       a/    U R                  U R                  UR                  [        5       5      $ ra   )rb   r  r  r   r   r   r  others     rj   can_fuse_with%FusedMixOrderReductions.can_fuse_with]  s    %!899))

EJJ= J''

EJJ=IJ ))

EKK$**ekk)B K((U[[%'JKrr   c                b   U R                   R                  5       nU R                  R                  U5      n[	        U[
        5      (       aW  UR                  U R                   UR                   5      nUR                  U R                  UR                  5      n[        XE5      $ U R                  U R                   XR                  45      (       a1  UR                  U R                   U5      n[        X`R                  5      $ UR                  U R                  U5      n[        U R                   U5      $ ra   )	r   r   r  r  rb   r  rS  r   r  )r  r  r:  backendfused_node1fused_node2rb  s          rj   	fuse_with!FusedMixOrderReductions.fuse_withi  s    &&(..,,V4e455!,,tzz5;;?K!,,tzz5;;?K*;DD%%djj%**GG$\\$**e<
.z::FF$\\$**e<
.tzz:FFrr   )r   r   r  ri  )r   rX   r   rX   r  r  )r  rX   )
r   r   r   r   rs  r  r  r  r   r  r  s   @rj   r  r  '  s<    =+
 +
 !+
 3	+
Z
KG Grr   r  c                  z  ^  \ rS rSr% Sr    SS jr    SS jr\SS j5       r\      SS j5       r	   S             SU 4S jjjr
\    SS j5       r\    SS	 j5       r\rS
\S'   \    SS j5       r\    SS j5       rSS jrSS jrS S jrS!S jrS"S jrS#S jr    S$S jrSrU =r$ )%ForeachKernelSchedulerNodeiz  z
This is a schedular node that consists of a set of scheduler nodes that
has no data dependencies among them and can be executed in parallel.
c                    UR                  5        H@  nUR                  5       U R                  ;   d  M#  U R                  UR                  5          s  $    g ra   )rv  r  read_to_node)r  producerr   s      rj   get_consumer_subnode_for3ForeachKernelSchedulerNode.get_consumer_subnode_for  sG     '')C||~!2!22((88 * rr   c                   [         [           " 5       nUR                  R                   H  nUR                  U R
                  R                  ;  a  M)  U R
                  R                  UR                     R                  5       nX@R                  ;   d  Mk  UR                  U R                  U   5        M     [        U5      S:X  a  [        [        U5      5      $ g Nr   )r   rX   r   r   r   r  r6  r  name_to_noder  r   r  r%  )r  consumer	producersrd	node_names        rj   get_producer_subnode_for3ForeachKernelSchedulerNode.get_producer_subnode_for  s     013	&&,,Bwwdnn88822277;LLNI---d//	:; - y>QY((rr   c                  ^ [        TU5      nTR                  5       (       a  UR                  5       (       a  [        R                  " [        T5      m[        R                  " [        U5      n[        TR                  5      [        UR                  5      :H  nU(       d  U" S5        U=(       a3    [        U4S j[        TR                  UR                  5       5       5      $ UR                  5       (       ar  TR                  5       (       a	  U" S5        g[        R                  " [        U5      nUR                  T5      nUb  UR                  R                  TU5      $ U" S5        gTR                  5       (       aq  UR                  5       (       a	  U" S5        g[        R                  " [        T5      mTR                  U5      nUb  TR                  R                  Xb5      $ U" S5        g[        S5      e)	Nzforeach do not have same lengthc              3  ^   >#    U  H"  u  pTR                   R                  X5      v   M$     g 7fra   )r  r   )rh   lrr  s      rj   rk   6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>  s.      )ADA ""++A11A   *-zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)rZ  rO  r  r  r  r   r   rm   r  rd   r  r  r   r  r  )r|   r  r  whyforeach_matchconsumer_subnodeproducer_subnodes    `     rj   r   #ForeachKernelSchedulerNode.can_fuse  s   (+  X%8%8%:%:{{#=xHH{{#=xHH0C4HHM 56  S )A) &    ""$$&&n {{#=xHH'@@J+))228=MNNGH  ""$$&&n {{#=xHH'@@J+))223CNNGHf
 	
rr   c           	     `   UR                  5       (       d  UR                  5       (       d   eUR                  5       (       a4  [        R                  " [        U5      nUR                  nUR
                  nO3[        R                  " [        U5      nUR                  nUR
                  nS nS nUR                  5       (       a  UR                  5       (       a  [        R                  " [        U5      n[        R                  " [        U5      n[        UR                  UR                  5       VVs/ s H  u  px[        R                  Xx5      PM     n	nnGO?UR                  5       (       a  [        R                  " [        U5      nUR                  U5      n
/ n	UnS nUR                   HB  nXL a*  [        R                  X5      nUnU	R                  U5        M1  U	R                  U5        MD     OUR                  5       (       a  [        R                  " [        U5      nUR                  U5      n/ n	UnS nUR                   HB  nXL a*  [        R                  X5      nUnU	R                  U5        M1  U	R                  U5        MD     O[        S5      eU " UR                  U	UUUUS9$ s  snnf )NzTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)rO  r  r  r  r  r  r  r   r   rS  r  r   r  r  r  )r|   r  r  r  r  r  r  r  r  fused_nodesr  re   new_noder   s                 rj   rS  ForeachKernelSchedulerNode.fuse  s\    ""$$(;(;(=(===  {{#=xHH(0(J(J%&66O{{#=xHH(0(J(J%&66O  X%8%8%:%:{{#=xHH{{#=xHH  AADA #''-A  K   ""{{#=xHH'@@JK"KK +166tFH"*K&&x0&&t, (   ""{{#=xHH'@@JK"KK +166xFH"*K&&x0&&t, ( !f  &?##+
 	
Ks   0!J*c                  >^  0 T l         0 T l        Ub  Ucv  [        TT ]  X5        U H_  nUR                  R
                   H  nUT R                   UR                  '   M     UR                  5        H  n	UT R                  U	'   M     Ma     GOUT l        UT l	        S T l
        / T l        T R                  [        R                  R                  UR                  UR                  /5      5        [!        U 4S j[         R"                  " UR$                  UR$                  5       5       5      T R                  R&                  -
  T l        [)        UR*                  UR*                  /5      T l        [-        UR.                  UR.                  /5      T l        UR1                  5       (       a  [3        U[4        5      (       d   eXEpO[3        U[4        5      (       d   eXTpU
R6                  T l        T R6                  R9                  UR6                  5        U
R                  T l        UR                  5        H  n	UT R                  U	'   M     T R                   VVVs0 s H(  oR:                  R=                  5         H  u  pX_M	     M*     snnnT l        UT l        US   RA                  5       nU(       d   eU[B        RD                  " S5      444T l#        [         [H        RJ                  RL                     " 5       T l'        UT l(        g s  snnnf )Nc              3  h   >#    U  H'  nUR                   TR                  5       ;  d  M#  Uv   M)     g 7fra   r=  r  s     rj   rk   6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>.	  s5        xxt'<'<'>>	 C r@  r   combo_kernel))r  r  r  rs  r   r   r   r   r  r   re   r	  r  r%   rA  rB  r   rC  rk  r  r  r^  r  r_  rO  rb   r  r   r  rf  itemsr  r   r   Exprr{   r  fxNoder  r  )r  r  r   r  r  r  r  re   rv  r   foreach_noder   r;  r  vr:  r  s   `               rj   rs  #ForeachKernelSchedulerNode.__init__	  s    +"5GY/ ,,22D37D%%dii0 3 !446D.2D%%d+ 7	  'DN DKDI)+DJ  ''22 ,,k.E.EF  )//#668V8V   ""))* # !+"7"79N9N!OPDN +"7"79N9N!OPDN%%''!+/IJJJJ+6j!+/IJJJJ+6j)33DNNN!!*"6"67 , 9 9D"668*4!!$' 9 #'++@"-:O:O:U:U:W$!:W+@D  *C&%%'v

> :<>?
!%((--02.@s   /Lc           
        U Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       aW  [        R                  S[	        U5      U Vs/ s H+  oDR
                  c  M  UR
                  R                  5       PM-     sn5        U Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       a  [        R                  S[	        U5      5        U Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       a  [        R                  S[	        U5      5        U Vs/ s H,  n[        U[        [        [        [        45      (       a  M*  UPM.     nnU Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       a  [        R                  S[	        U5      5        U Vs/ s H  n[        U[        5      (       a  M  UPM     nnU Vs/ s H  o"R                  5       (       d  M  UPM     n	nU	(       a   [        R                  S[	        U	5      U	5        U Vs/ s H  o"U	;  d  M
  UPM     nnU$ s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf )Nz/ComboKernels: %d external nodes are filtered %sz+ComboKernels: %d grouped nodes are filteredz;ComboKernels: %d FusedMixOrderReductions nodes are filteredz+ComboKernels: %d foreach nodes are filteredz0ComboKernels: %d template nodes are filtered: %s)rb   r  r  rc  r   re   r  rL  r  rp  r  rI  )
r|   r  rx  externre   grouped	mix_orderfiltered_nodesforeach_nodesrQ  s
             rj   combinable_nodes+ForeachKernelSchedulerNode.combinable_nodesT	  s    #OUj4M&N!UOIIAF5;UVTyy(&&(VU
 $Kez!5I'J1eKII=G !&P1A7N)OQ	PIIMI 
*-(+	  	 
 &
%!A7Q)RA~ 	 
 IICSEWX%
%!Z;U-VA~ 	 
 &4G^}}!^GIIBN#
 &4O^7N!^Oc P
 VK Q



 H Psj   III#II)II8I*)II#I I2III$7I$*	I)7I)c                   U R                  5       n/ nSn[        U VVVs/ s H>  nU  H4  n[        U[        5      (       d  M  UR	                  5         H  nUPM     M6     M@     snnn5      nU H  n[        [        5      n	U Hi  nUR                  5       n
U
(       a"  U
R                  S:X  d  U
R                  S:X  a  M<  UR                  5       U-  (       a  MV  X   R                  U5        Mk     U	R                  5        H=  nUR                  [        S[        U5      U5       Vs/ s H	  nXX-    PM     sn5        M?     M     U$ s  snnnf s  snf )zC
Returns a list of lists of nodes that are to be grouped together.
   mpsr8  r   )_topological_sort_nodesr   rb   r  r  r   r   r   r   r   r   r   r  r  r   )r  sorted_nodesgrouped_nodesmax_num_nodesr{   re   r1  excluded_buffer_namesr  device_groupsr:  device_nodesr  s                rj   &_default_group_nodes_for_combo_kernelsAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels	  sZ    !88:1; *)E!Dd$;<  !% 5 5 7H	  !8	 ! )2
 "E D!  *v{{e3v{{e7K ))+.CC%,,T2  !. 4 4 6$$ "'q#l*;]!K!KA %):;!K !7! ". ?4s   E"E'E4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelsc                    U [         l        g ra   r  r+  )custom_group_algorithms    rj   %set_group_algorithm_for_combo_kernels@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernels	  s    
 # 	#Drr   c                ,    [         R                  U 5      $ ra   r-  r  s    rj   group_nodes_for_combo_kernels8ForeachKernelSchedulerNode.group_nodes_for_combo_kernels	  s     *KKIVVrr   c                    [         era   r  r  s    rj   r  #ForeachKernelSchedulerNode.mark_run	  r  rr   c                    [         era   r  r  s    rj   rj  "ForeachKernelSchedulerNode.codegen	  r  rr   c                    gr  r   r  s    rj   rO  %ForeachKernelSchedulerNode.is_foreach	  r  rr   c                ,    [        U R                  5      $ )z]Returns a list of nodes which comprise the combo kernel.
These nodes may be vertically fused.)r   r   r  s    rj   get_subkernel_nodes.ForeachKernelSchedulerNode.get_subkernel_nodes	  s     DKK  rr   c                t    [        [        R                  R                  S U R                   5       5      5      $ )ziReturns all nodes contained in this kernel, unpacking fused nodes
into their constituent scheduler nodes.c              3  @   #    U  H  oR                  5       v   M     g 7fra   )rn   rH  s     rj   rk   7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>	  s     1UA++--r  )r   r  r  r	  r   r  s    rj   rn   $ForeachKernelSchedulerNode.get_nodes	  s(     IOO111U1UUVVrr   c                <    U R                   S   R                  5       $ r   )r   r  r  s    rj   r  )ForeachKernelSchedulerNode.get_first_name	  s    {{1~,,..rr   c                    [        XU R                  R                  5        U R                   H  nUR	                  U5        M     g ra   )r  r  r6  r   r	  )r  r  re   s      rj   r	  /ForeachKernelSchedulerNode.prune_redundant_deps	  s5     	d8R8RSKKD%%&89  rr   )r   r  r{   r_  r^  r  re   r  rf  r  r  r   rk  r  r	  )r  rX   r   r  )r  rX   r   r  r  rX   r  rX   r   r   )r  rX   r  rX   r   r  )NNF)r  r  r   r(  r  r   r  r  r  r  r  r   r   rQ  r  r(  r   r(  )r  r  r   list[list[BaseSchedulerNode]])r.  r*  r   rQ  rP  rR  r   r(  r"  rN  r   )r   r   r   r   r   r  r  r   r   rS  rs  r  r   r(  r+  rV  r/  r3  r  rj  rO  r<  rn   r  r	  r   r  r  s   @rj   r  r  z  s   
)	$)	$& ,
 ,
\ >
(>
4E>
	#>
 >
J 4837 %F/F/ (F/ $(	F/
 1F/ 1F/ F/ 
F/ F/P 4+4	 4 4l **	&* *\ 	/ & ( / 
 T
	
 
 WW	&W W
""!
W
/:">:	: :rr   r  c                     ^  \ rS rSr% SrS\S'   \SS j5       r S       SU 4S jjjrSS jr	SS jr
\SS	 j5       rSS
 jr\SS j5       rSS jr\SS j5       rSS jrSS jr\SS j5       rSrU =r$ )rL  i	  a'  
This is a "fake" scheduler node that represents a group of scheduler nodes
that are meant to be *grouped* together (it does not allow another node to be scheduled
in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
Fusion will still happen among the nodes within each GroupedSchedulerNode.
At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
r(  r   c                   ^ US   R                   m[        U4S jU 5       5      (       d   eU " TU5      nU H   nUTR                  UR                  5       '   M"     UTR                  UR                  5       '   U$ )Nr   c              3  >   >#    U  H  oR                   TL v   M     g 7fra   r2  )rh   re   r  s     rj   rk   .GroupedSchedulerNode.create.<locals>.<genexpr>	  s     B64>>Y.6s   )r  rm   r  r  )r|   r   grouped_snoder;  r  s       @rj   createGroupedSchedulerNode.create	  su    1I''	B6BBBBBIv.E=JI(()9: AN	$$]%;%;%=>rr   c                H   > [         TU ]  U5        [        XU5        X0l        g ra   )r  rs  rM  temp_grouping)r  r  r   rR  r  s       rj   rs  GroupedSchedulerNode.__init__	  s$     	#0 +rr   c                B   U R                   (       a  U R                  $ U R                   H)  nXR                  R                  UR	                  5       '   M+     U R                  R                  U R	                  5       	 U R                  R                  U R                  5      $ )zw
Do fusion among nodes within this GroupedSchedulerNode,
and then unpack this GroupedSchedulerNode into regular nodes.
)rR  r   r  r  r  
fuse_nodes)r  r;  s     rj   unpackGroupedSchedulerNode.unpack
  so    
 ;;[[EBGNN--enn.>? !NN--dmmo>~~((55rr   c                    U R                  U R                  R                  U5      5        U R                  R	                  U5        g ra   )r  r   r  rk  r  )r  fake_deps     rj   r  !GroupedSchedulerNode.add_fake_dep
  s5    T--77AB##H-rr   c                ~    SR                  U R                   Vs/ s H  oR                  5       PM     sn5      $ s  snf rm  ro  rp  s     rj   r  GroupedSchedulerNode.get_name
  rr  rs  c                <    U R                   S   R                  5       $ r   ru  r  s    rj   r  #GroupedSchedulerNode.get_first_name#
  rw  rr   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf ra   ry  rp  s     rj   r  %GroupedSchedulerNode.get_buffer_names&
  r{  r|  c                n    / nU R                    H"  nUR                  UR                  5       5        M$     U$ ra   r~  r  s      rj   rv   GroupedSchedulerNode.get_outputs*
  r  rr   c                    [        [        S S U R                  5        5       5      5      n[        U5      S:X  a  g [	        U5      nU$ )Nc              3     #    U  HA  nUR                  5       (       d  UR                  5       (       d  M/  UR                  5       v   MC     g 7fra   r\  r  s     rj   rk   6GroupedSchedulerNode.estimate_flops.<locals>.<genexpr>6
  r^  r_  r   r`  rb  s      rj   r  #GroupedSchedulerNode.estimate_flops0
  re  rr   c                    U R                   $ ra   r  r  s    rj   rn   GroupedSchedulerNode.get_nodesB
  r  rr   c                b    U R                   (       a  U R                   S   R                  5       $ S $ r   )r   r   r  s    rj   r   GroupedSchedulerNode.get_deviceE
  s$    .2kkt{{1~((*CtCrr   c                    gr  r   )r|   r  r  s      rj   r   GroupedSchedulerNode.can_fuseH
  r  rr   )rR  )r   r(  r   rL  )F)r  r  r   r(  rR  r   r   rQ  rI  )rY  r.   r   rQ  rN  r  r  r%  r"  rT  rF  )r   r   r   r   r   rV  r   rO  rs  rV  r  rC   r  r  r  rv  r  rn   r   r   r   r  r  s   @rj   rL  rL  	  s     $#  $	++ (+ 	+
 
+ +6. = =) N N  "D  rr   rL  c           
     0  ^ ^ [         R                  SUU 4S jj5       n[        [        [	        [        T S   5      5      5      5      n[        U5      S:  a  U Vs/ s H  nT U   PM
     snm [        R                  (       a  UR                  US9  U$ s  snf )zu
A heuristic to decide loop iteration orders.  This has not been well
tuned and may be something we should autotune.
c                z  > TU    S:X  d	  TU   S:X  a  [        TU    S:H  TU   S:H  5      $ T Vs/ s H  n[        X    5      PM     nnT Vs/ s H  n[        X!   5      PM     nn[        S [        X45       5       5      n[        S [        X45       5       5      nXV:  a  gXe:  a  g[        X5      $ s  snf s  snf )Nr   c              3  F   #    U  H  u  pUS :H  =(       d    X:  v   M     g7fr   Nr   rh   sl_asl_bs      rj   rk   5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>e
  $      
7VDAI$$7V   !c              3  F   #    U  H  u  pUS :H  =(       d    X!:  v   M     g7frp  r   rq  s      rj   rk   rt  h
  ru  rv  r   )rD   absr   r  )	r4  bslstride_len_astride_len_ba_firstb_firstr  stride_lengthss	          rj   	index_cmp"pick_loop_order.<locals>.index_cmpX
  s    8q=E!HMuQx1}eAh!m44 .<<^rBE
^<-;<^rBE
^<  
7:<7V
 
  
7:<7V
 
  1y# =<s   B3B8r   rk  )r4  r   ry  r   r   r   )		functools
cmp_to_keyr   r   r  r   r#   pick_loop_orderssort)r  r  priority_idxr  orderpis   ``    rj   pick_loop_orderr  N
  s      4 %N1$5 6789E
<17CD|.,|D

y
!L Es   Bc                   UR                  5       nU R                  5       n[        U[        5      (       a  [        U[        5      (       d   eUR                  5       nU R                  5       n[        U[        5      (       a  [        U[        5      (       d   e[        R
                  R                  U	 X1l        [        R
                  R                  U	 XQl	        [        R
                  R                  R                  U 5      n[        R
                  R                  R                  U5        U[        R
                  R                  U'   U[        R
                  R                  U'   [        R
                  R                  R                  U 5      n[        R
                  R                  R                  U5        U[        R
                  R                  U'   U[        R
                  R                  U'   g ra   )r  rb   r   r  rS   rv   r  r   
name_to_opoperation_namebuffersr   remove
operations)	orig_noder	  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          rj   _replace_operation_bufferr  |
  s]    !))+&&(MmS))j9JC.P.PPP224//1LlC((Z8H#-N-NNN	01!M	+,*77??  +DGGOO8$$AGGOOD,4AGG=)77##I.DGGh''AGGt'/AGG|$rr   c                j    UR                  5       nU R                  5       nX4-
  nXT-  nUSU-   -  nXr-  $ r  )r  r  )r   r   epilogue_runtimetotal_read_bytestemplate_write_bytesextra_bytesextra_bytes_ratioextra_memory_ratios           rj    _estimate_fused_epilogue_runtimer  
  sO     224 779"9K#: +a2C.CD00rr   c                  d    \ rS rSr% S\S'   SrS\S'   SrS\S'   SS jrSS	 jrSS
 jr	SS jr
Srg)NodeUseri
  $Union[BaseSchedulerNode, OutputNode]re   Fr   rT  is_weakc                v    [        U R                  R                  5       U R                  U R                  45      $ ra   )r  re   r  rT  r  r  s    rj   r  NodeUser.__hash__
  s+    TYY'')4+;+;T\\JKKrr   c                    [        U[        5      =(       aa    U R                  5       UR                  5       :H  =(       a9    U R                  UR                  :H  =(       a    U R                  UR                  :H  $ ra   )rb   r  r  rT  r  r  s     rj   __eq__NodeUser.__eq__
  s[    uh' .5>>#33.  E$5$55. -		
rr   c                6    U R                   R                  5       $ ra   r&  r  s    rj   r  NodeUser.get_name
  r(  rr   c                    U R                   UR                   L d   e[        U R                   U R                  =(       a    UR                  U R                  =(       a    UR                  5      $ ra   )re   r  rT  r  r  s     rj   rD  NodeUser.merge
  sP    yyEJJ&&&II2!2!2LL*U]]
 	
rr   r   NrO  )r  objectr   r   rN  )r  r  r   r  )r   r   r   r   rV  rT  r  r  r  r  rD  r   r   rr   rj   r  r  
  s3    
..K GTL
$
rr   r  c                 "    [         R                  $ ra   )r#   rQ  r   rr   rj   *used_non_deterministic_runtime_estimationsr  
  s    333rr   c                   [        5       nU R                  5       n[        U[        R                  5      (       a  UR                  [        UR                  5      [        UR                  5      -  [        UR                  5      -  5        [        U[        R                  5      (       a$  UR                  [        UR                  5      5        U$ Ub
   SU 35       eU$ )z=Get free symbols from a node's layout (size, stride, offset).z*Expect layout to be None but found layout=)r   maybe_get_layoutrb   r&   Layoutr  r   r   strideoffsetrq  get_layout_symintsr  )re   free_symbol_usesr  s      rj   r  r  
  s    1;""$F&"))$$%6==)*6==)*	

 fb;;<<##$6v}}$EF  ~T!KF8TT~rr   c                .   [        U [        5      (       a(  [        5       R                  " S U R                   5       6 $ U R
                  c   eU R
                  R                  5       nUR                  " S U R
                  R                  5        5       6   U$ )z{
Gets symbols used in a scheduler node, including free symbols from
the node's operations and layout symints from outputs.
c              3  8   #    U  H  n[        U5      v   M     g 7fra   get_scheduler_node_symbol_uses)rh   r;  s     rj   rk   1get_scheduler_node_symbol_uses.<locals>.<genexpr>
  s     M,U33r  c              3  8   #    U  H  n[        U5      v   M     g 7fra   )r  )rh   ir_nodes     rj   rk   r  
  s     	M5L'
W
%
%5Lr  )	rb   r   r   rC  r   re   get_free_symbol_usesr  rv  )re   r  s     rj   r  r  
  s     $*++|!!MM
 	
 99   yy557	MTYY5J5J5L	M rr   c                  >  ^  \ rS rSrSrSSS jrSSU 4S jjrSTS jr\SUS j5       r	\	R                  SVS j5       r	SWS jrSXS	 jrSYS
 jrSWS jrSWS jrSWS jrSWS jr    SZS jrS[S jrS\S jrSWS jrSWS jrSZS jrSWS jr    S]S jr S^       S_S jjr      S`S jrSWS jr          SaS jrSbS jr      ScS jrSdS jr       SeS jr!S^SfS jjr"SgS  jr#      ShS! jr$      SiS" jr%      SiS# jr&        SjS$ jr'      SiS% jr(        SkS& jr)      SlS' jr*      SlS( jr+SmS) jr,        SnS* jr-      SoS+ jr.  Sp         SqS, jjr/      SiS- jr0        SrS. jr1SsS/ jr2StSuS0 jjr3   Sv           SwS1 jjr4    SxS2 jr5    SyS3 jr6SWS4 jr7SWS5 jr8SWS6 jr9SzS7 jr:S{S8 jr;S|S9 jr<S}S: jr=      S~S; jr>SS< jr?  SS= jr@    SS> jrA      SS? jrB      SS@ jrC    SSA jrD    SZSB jrE    SZSC jrF    SZSD jrG  SSE jrH      SSF jrIS}SG jrJSWSH jrK      SSI jrL      SSJ jrM      SSK jrNSWSL jrOSgSM jrP    SSN jrQSSO jrRSSP jrSSWSQ jrTSRrUU =rV$ )r  i
  z
A Scheduler is a graph of BaseSchedulerNodes. It is responsible for
optimizations such as fusion, reorder, and graph partition.
c                p    [        S5         U R                  U5        S S S 5        g ! , (       d  f       g = f)NzScheduler.__init__)r   _initr  r  s     rj   rs  Scheduler.__init__
  s#    ./JJu 0//s   '
5c           
       >^  [         TT ]  5         T [        R                  l        0 T l        [        [        5      T l        [        R                  " 5       T l        [        5       T l        [        / [        R                  R                  R                  5       Q[        R                  R                   R                  5       Q[        R                  R"                  R                  5       Q5      T l        U Vs/ s H  nT R'                  U5      PM     snT l        S T l        S T l        T R/                  5         T R$                  R1                  [        R                  R                   R                  5       5        T R(                   H  nUR3                  5         M     S T l        T R7                  5       T l        T R(                   Vs0 s H  o"R;                  5       U_M     snT l        T R(                   VVs0 s H*  o3R?                  5         H  oDR;                  5       U_M     M,     snnT l         T R<                  RC                  5       T l"        0 T l#        0 T l$        [J        RL                  " T R(                  T R@                  T RD                  5      T l        T RO                  5         T RQ                  T R(                  5      T l        T RS                  5         T R(                   Vs0 s H  o"R;                  5       U_M     snT l"        T RU                  5         [V        =RX                  [[        T R(                  5      -  sl,        SSK.J/nJ0n  U" T R(                  5        [[        T R(                  5      T l1        T Re                  5         T RQ                  T R(                  5      T l        [        [f        [h        [h        4      " 5       T l5        [l        Rn                  b%  [l        Rn                  " T R(                  5      T l        [l        Rp                  (       a'  SSK9J:n  URw                  T 5        T RU                  5         T Ry                  T R(                  5      T l        [l        Rz                  b%  [l        Rz                  " T R(                  5      T l        T R}                  5         T R                  5         [l        R                  (       d  [l        R                  (       aG  [l        R                  (       a2  [        R                  R                  R                  R                  5         [l        R                  (       a#  [        SSSS9   T R                  S S9  S S S 5        [l        R                  (       a  SS	KLJKn  U" T R(                  T R@                  T RD                  [        [        R                  R                  R                  5       5      [        [        R                  R                  5       5      5      T l        [l        R                  (       Gd&  [l        R                  (       Ga  [l        R                  (       d#  SS
KLJPn	  U	" T R(                  T R@                  5        [        5       (       a  [        R                  (       a|  [l        R                  (       d  [        R                  (       aR  Sn
T R(                   H!  n[        UR                  5      (       d  M  Sn
  O   U
(       a  SSK%JXn  U" T R(                  5        SSKYJZn  U" SS U 4S jS9  [J        R                  " T R(                  5      T l        T R                  5         [l        R                  (       a~  [l        R                  R                  (       a_  [l        R                  R                  (       a@  T R                  T R(                  5      T l        T R                  T R(                  5      T l        T R                  5         [        R                  Rl                  R                  R                  (       a  T R                  5         U" T R(                  5        [        R                  R                  T R(                  5        T R                  5         [        5       T lj        0 T lk        [        S5      R                  U 4S j5        [        5       T ln        g s  snf s  snf s  snnf s  snf ! , (       d  f       GN= f)Nr   )log_ir_post_fusionlog_ir_pre_fusionr   )distributed_autotunez#Scheduler.create_combo_kernel_nodesTlog_pt2_compile_eventlog_waitcounter)num_ck_nodes)reorder_for_peak_memory)1assign_memory_planning_info_for_scheduler_buffersF)6align_runtime_estimations_across_all_distributed_ranks)trace_structuredartifactc                     SSS.$ )N#scheduler_nodes_before_comm_overlapstring)r   encodingr   r   rr   rj   rp  !Scheduler._init.<locals>.<lambda>  s    A (%rr   c            
        > SR                  [        TR                  5       V Vs/ s H0  u  pSU  S3UR                  5       -   SUR	                  5        3-   PM2     snn 5      $ s  snn f )Nz

zsnode[r  z buffer_names:)r  r  r  r#  r  )r  r#  r  s     rj   rp  r    so    6;;
 %.djj$9	 %:DA !1++-(*1+=+=+?*@AB %:	$s   7A$
)metadata_fn
payload_fngraph_statsc                 ^   > T R                   T R                  [        T R                  5      S.$ )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr   r  r  s   rj   rp  r    s%     33+/+>+>*-djj/rr   )or  rs  rS   rv   r  backendsr  _post_grad_graph_counterr  r  count_graph_partition_counterr   rm  r  r   	constantstorchbind_constantsr  create_scheduler_noder  previous_nodecurrent_nodeupdate_zero_dim_cpu_tensorr  r  default_device_contextget_donated_buffersr5  r  r  rv  r6  copyr  r  rb  r"   decide_global_ordering_of_commsrV   topological_sort_scheduledead_node_eliminationcompute_ancestorsr'   ir_nodes_pre_fusionr   torch._inductor.debugr  r  r  create_foreach_nodesr   r   logged_slow_fusionr#   _pre_fusion_custom_passdistributed_max_autotune_gemmr  r  schedulerU  _post_fusion_custom_passr  finalize_multi_template_buffersmax_autotune_gemmmax_autotunepipeline_max_autotune_gemmr  r  select_algorithmPrecompileThreadPoolshutdown_instancecombo_kernelsr   create_combo_kernel_nodesr  memoryget_output_namesdeterministic reorder_for_compute_comm_overlapr  r  r$   6runtime_estimations_align_across_all_distributed_ranksrQ  r  rL   re   r  torch._loggingr  $reorder_compute_and_comm_for_overlapprocess_grouped_nodesgraph_partitionr   rW   %reorder_for_reducing_graph_partitions&maybe_reorder_for_minimizing_partition,reorder_for_partition_with_simple_dependencycompute_last_usagetest_configstrack_memory_lifecycleinsert_memory_check_nodesrc  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_rowremoved_ops)r  r  r#  re   r   r  r  r  r  r  has_collectivesr  r  r  s   `            rj   r  Scheduler._init
  s4    <>"&'?"@(1(9%5?\!&0%%**,""'') ,,113'
# >CCUd003UC
:>9='')##**177+<+<+A+A+CDJJDOO  ?C# $$& 	# &*ZZ;
%/JJL!OZ;

 -1JJ8
,6DBRBRBT3LLNCBTNJ8
 AE@Q@Q@V@V@X 35 13 ::JJ##

 	!!#33DJJ?
""$<@JJ"GJq::<?J"G  	##s4::6#O$**%!$**o!!#33DJJ?
",U38_"="?))577

CDJ//. ))$/""$__TZZ0
**688DDJ,,.$$(;(;//OO,,AASSU5&* $
 ..D.A ))70

  ''177//44671773356DJ ###(O(O(O11UAJJ 0 0
 ;<< WW<<#PP #( JJD$TYY//*. ' # K4::V7 CCDJJODJ""$ ""((CCDDTZZPDJJJ4::VDJ!??!!..EE**,4::&	djj) 6@\! :<'//	
 -7LC D;
8
B #HB s$   6b<c61c&cc
c c                   0 n[         R                  R                   Hg  n[        [         R                  R                  U   [        R
                  5      (       d  M?  [        U [         R                  R                  U   S S9X'   Mi     U$ )N)r  )rS   rv   graph_inputs_originalrb   r&   DonatedBufferrZ  )r  name_to_donated_bufr   s      rj   r  Scheduler.get_donated_buffers  sl     GG11D!''77=r?O?OPP,BGG11$7 $-#) 2 #"rr   c                6    [         R                  R                  $ ra   rS   rv   current_devicer  s    rj   r   Scheduler.current_device  s    ww%%%rr   c                .    U[         R                  l        g ra   r  r9  s     rj   r   r!    s    !'rr   c                |    [         R                  R                  SS5      S:X  a  SSKJn  U" U R
                  SS9  gg)z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr  rc  r&  r  )r  r&  s     rj   r  Scheduler.debug_draw_graph  s1    ::>>:DASH+6 Irr   c                    [         R                  [        R                  5      (       a:  [         R	                  SU5        U R
                   H  nUR                  5         M     g g )Nz%s:)r  isEnabledForloggingINFOr  r  r  )r  labelre   s      rj   debug_print_nodesScheduler.debug_print_nodes  sD    GLL))HHUE"

  " # *rr   c                P   UR                  5       c   S5       eUR                  5       (       a  [        X5      $ [        U[        R
                  [        R                  45      (       a  [        X5      $ [        U[        R                  5      (       a  [        X5      $ [        U5      e)Nz2All nodes passed to scheduling must have an origin)r  is_no_oprp  rb   r&   r   r  rc   rD  r  r  r  s     rj   r  Scheduler.create_scheduler_node  s    !- 	
@	
- ==??)$55r00"2C2CDEE ,,boo..,T88%d++rr   c                   [        5       n/ nU R                  R                  5       n[        R                  R
                  R                  5        H  nU Vs/ s H0  nXS;   d  M
  [        U R                  U   [        5      (       a  M.  UPM2     nnU(       d  MI  UR                  U5        U Vs/ s H  oPR                  U   PM     nn[        R                  S:  n[        U USUS9nUR                  U5        U H  nXR                  U'   M     M     U R                   V	s/ s H  oR!                  5       U;  d  M  U	PM     sn	[#        U5      -   U l        g s  snf s  snf s  sn	f )Nr   Fr  r  )r   r  r   rS   rv   listsr   rb   r  rp  r  r#   combo_kernels_autotuner  r   r  r  r   )
r  removed_node_namesfe_nodeskept_node_namesnamesr   r   r  fe_nodere   s
             rj   r  Scheduler.create_foreach_nodes  sK   .8l11668WW]]))+E "!D*  #4#4#4T#:<RS !   %%e,:?@%$''-%F@$;;a?O0*/ /	G OOG$07''- 1 ,8 "ZZ
'T==?BT+TDZ
N
5 A
s$   	E# EE-E E ;E c                  ^ ^%^&^'  " U%4S jS[         [           5      m%[        R                  " T%5      m&T R                   H  nUR                  5        H  nUR                  5       n[        UR                  R                  [        R                  5      (       a  [        UR                  5       5      S:  a  Me  UR                  5        HW  nUT&;   a6  UT&;   a0  T&U   nT&U   nXV-   nT& H  nT&U   UL d
  T&U   UL d  M  UT&U'   M     M?  UT&;   a
  T&U   T&U'   MO  T&U   T&U'   MY     M     M     SU'U 4S jjm'  S         SU&U'4S jjjn	0 n
[        R                  R                   R#                  5        H  n[        U[$        R&                  5      (       a  UR(                   H  nSX'   M	     M;  [        U[        R*                  5      (       d  M\  UR-                  5        Vs/ s H&  n[        U[$        R&                  5      (       d  M$  UPM(     nnU H  nUR(                   H  nSX'   M	     M     M     SnT R                   Hz  nUR                  c   e[/        UR                  R1                  5       S S	9nU H?  n[        U[$        R2                  5      (       d   eS
nX;  d  M-  UR                  5       X'   MA     M|     T R                   GH9  n[4        R7                  SUR                  5        U(       a  UR                  c   e[/        UR                  R9                  S
S9S S	9nU Hi  nX;   d   U SU
 35       eX   =nc  M  T R:                  U   R                  5        H+  nUR=                  [?        UR                  5       5      5        M-     Mk     [        UR@                  RB                  5      S:X  aQ  [E        [G        UR@                  RB                  5      5      =n(       a"  [        U[H        5      (       a  URJ                  nOSnUR                  5        GH  n[        URM                  5       5      S::  d   eURM                  5        H  nT'" U5      nU	" UU5        UR=                  [?        UUS95        T&U   RN                   H  nUR                  5       UR                  5       :X  a  M'  [        UR                  [P        5      (       d   eUR                  RS                  5        H:  nT'" U5      nUR=                  [U        UUR                  5       S95        U	" UUS
S9  M<     M     M     GM     [        R                  RV                  UR                  5           H3  nU	" UUS
S9  UR=                  [U        UUR                  5       S
S95        M5     [        R                  RX                  UR                  5           H%  nU	" UUSS9  UR=                  [?        U5      5        M'     UR@                  RZ                   H<  n[        U[T        5      (       a  M  U	" UR\                  XR_                  U5      5        M>     URa                  T Rb                  5        UR                  5        H  nURM                  5        Hz  nUR                  5       T Rb                  T'" U5      '   UR                  5       T Rb                  U'   T Rd                  Rg                  UU5      T Rd                  UR                  5       '   M|     M     GM<     [        R                  Ri                  5        H4  n[4        R7                  SU5        U	" U[k        [?        U5      5      5        M6     U(       a  [        R                  Rl                   H  nUR9                  S
S9 H  nX;   d   U SU
Ro                  5        35       eX   =n(       d  M/  T R:                  U   RS                  5        H5  n[4        R7                  SUU5        U	" U[k        [?        U5      5      5        M7     M     M     T Rb                   H  nU[        R                  R                   ;   aF  U	" U[k        [?        U5      5      5        [        R                  Rp                  Rs                  U5        Mg  U[        R                  Rt                  ;   d  M  U	" U[k        [?        U5      5      5        M     [w        [        R                  R                   Ro                  5       5       VVs0 s H	  u  nnUU_M     nnn[        R                  Rp                   Vs/ s H  nUU   PM
     sn[        R                  l<        T R                   HF  nUR                  5        H/  nUR{                  T&UR                  5          RN                  5        M1     MH     T R|                   H.  nT R|                  U   R{                  T&U   RN                  5        M0     [        5       n U R                  S5        T&RO                  5        Ha  u  nn!U R                  5          U!RN                   V"s/ s H  n"U"R                  5       PM     n#n"U R                  SU SU# S35        SSS5        Mc     U R                  S5        U R                  5       R                  5       n$[        R7                  S5        [        R7                  SU$5        gs  snf s  snnf s  snf s  sn"f ! , (       d  f       M  = f)zQ
Create dependency edges between nodes, handling aliasing and
mutation properly.
c                  P   > \ rS rSrSr  S     S	S jjrS
S jrSU 4S jjrSrg)1Scheduler.compute_dependencies.<locals>.DedupListi$  a  
This data structure behaves like a list except it makes sure the
elements remain unique.
Normally one could use a OrderedSet/dict for this purpose however
the list in question gets elements appended as it is being
iterated over which means that we need to keep the list
semantics.
Nc                T    U=(       d    / U l         U=(       d
    [        5       U l        g ra   )r  r   
membership)r  r  rC  s      rj   rs  :Scheduler.compute_dependencies.<locals>.DedupList.__init__.  s    
 #[b
","<
rr   c                    XR                   ;   a  g U R                  R                  U5        U R                   R                  U5        g ra   )rC  r  r   r  )r  	node_users     rj   r   8Scheduler.compute_dependencies.<locals>.DedupList.append6  s3    /

!!),##I.rr   c                   > [         R                  " U R                  UR                  5      nU R                  UR                   Vs/ s H  o3U R                  ;  d  M  UPM     sn-   nT" XB5      $ s  snf ra   )r   rC  rC  r  )r  r  new_membershiprx  	new_items	DedupLists        rj   __add__9Scheduler.compute_dependencies.<locals>.DedupList.__add__<  sc    !+!1!1$//5CSCS!T JJ${{**!t.FA{* 	 !;;*s   A0A0)r  rC  r5  )r  zOptional[list[_T]]rC  zOptional[OrderedSet[_T]]r   rQ  )rF  rZ   r   rQ  )r  DedupList[_T]r   rN  )	r   r   r   r   r   rs  r   rL  r   )rK  s   rj   rK  rA  $  s@     -17;=)= 5= 	=/< <rr   rK  r   c                R   > U TR                   ;   a  T" TR                   U    5      $ U $ ra   )rb  )r#  r  r  s    rj   r  .Scheduler.compute_dependencies.<locals>.renamei  s,    D)))d33A677Hrr   Fc                N   > TT" U 5         R                  [        XU5      5        g ra   )r   r  )used_by_namerc  rT  r  name_to_usersr  s       rj   add_user0Scheduler.compute_dependencies.<locals>.add_usern  s'     &./669rr   Nc                    U R                   $ ra   r  r0  s    rj   rp  0Scheduler.compute_dependencies.<locals>.<lambda>  s    AFFrr   rk  Tzscheduling %s)unbacked_onlyc                    U R                   $ ra   r  r0  s    rj   rp  rW    s    !&&rr   z not in )r+  )mutating_buf)r  )r  zscheduling output %sz+scheduling output %s for unbacked symint %sr  'z': r  r  zBUFFER USER LIST
z===== AFTER SCHEDULING =====
%s)r#  r   r   r   )FF)
rR  r   rc  r  rT  r   r  r   r   rQ  )Er   rZ   r  r   r  rv  r  rb   re   r  r&   r:   r   r  rS   rv   r  r   r   r  r   	TensorBoxr  rp  get_unbacked_symbol_defsSymbolr  rc  r  r  r  r0   r   r  r  r%  r/   r+  r  r  rX   r  r1   additional_buffer_depsadditional_star_depsr   r   rT  r  rb  r  r  r  r>  graph_outputsr   mutated_inputsr  r  r  mutated_input_idxsrE  r5  rK   r  r  r   r  compute_dependencies_log)(r  re   buf1	buf1_name	buf2_namelist1list2combinedrl  rT  unbacked_symbol_to_origin_nodevalfsr  sym_sizehas_non_input_unbacked_defsunbacked_symbol_defsunbacked_symbol_usesr  r   r   	node_modealt_namer"  
other_nameadd_deprv  r1  r   r   r   	inp_nameslogbufr  r  r	  r   rK  rS  r  s(   `                                    @@@rj   rV   Scheduler.compute_dependencies  sY	   	< 	<@ @K?V?V@
 JJD((* MMO	 tyy//??D,,./!3!%!1!1!3I M1i=6P -i 8 -i 8#(=#0C -c 2e ;#0#5#>5=c 2 $1 #m33@3Ki03@3Ki0 "4 + <	 	 !&!				 <		 			
 		 		 		 MO&
 77''..0C#uzz****B9=26 +C.. (+||~S~!Auzz9RA~S!Ann=A6: - " 1 ',#JJD99((( $*		224:J$  *!!U\\2222 /3+:8<25 *   JJDIIotyy1*yy,,,'-II222F(($
 .A> #X&D%EF> <>>K#'#4#4Q#7#C#C#EC --gclln.EF $F . D$$++,1 d&6&6&=&=!>??S?sI..HH	 	 '')3,,./1444 # 1 1 3H%h/HXt,%%ghY&GH -h 7 = ===?dmmo=$)$))5FGGGG*.))*D*D*FJ)/
);J -- '
 P %ZtD +G !> !4 ** 7799$--/J$5 !!''4==?D"QR	 K 7777H$6!!''"23 I
 ((..!$00TYY.>.>t.DE / %%d&;&;< '') # 1 1 3H>AllnD))&*:;69llnD))(3//33HhG ++CLLN; !4 *I Z 002HII,h7Xz'(*;<= 3
 'ww,,111EA> #X&D&I&I&K%LM> ;==q=(,(9(9!(<(M(M(OHII M ( !
 %Xz'(:K/LM )P F - ))Dqww+++z'$-89&&**40***z'$-89 * ,5QWW5I5I5N5N5P+Q
+QKE4D%K+Q 	 
 )*(>(>&
(>IdO(>&
"
 JJD'')mCLLN;AAB *  //D''-77d8K8Q8QR 0  !c'--/JC/4{{;{!{;#c%23 ! 0 	c  "))+ &&';< &&'I3Os T@
&
" < !s6   "#k	kk6k%0k/?k*k/*k//
k>	c           
     2  ^ ^ SSK JnJnJnJn  [        [        R                  R                  R                  5       5      nU" T R                  U5      n[        R                  R                  R                  (       d  U" T R                  T R                  5        [        [        R                  R!                  5       5      nU" T R                  UU5      u  n  n	[#        [%        T R                  5      5       V	s/ s H  n	/ / 4PM	     sn	mU H  n
U
R&                  S:X  a  U
R(                  S:X  a  M%  U
R*                  R-                  5       nTU
R.                     S   R1                  U5        TU
R2                     S   R1                  U5        M     SSKJn  U" 5               SU U4S jjn/ n[9        T R                  5       HE  u  nnUR1                  U5        UR1                  U" X[%        T R                  5      S-
  :H  S95        MG     UT l
        g s  sn	f )Nr   )r  compute_memory_timelineFreeableInputBufferget_freeable_input_bufr   )register_check_mem_opc                N  > TU    S   nTU    S   nX#U/n[         R                  " [        [        R                  " S5      S9[        R
                  R                  R                  R                  / US S9nSTR                  U    R                  5        3Ul        [        TU5      $ )Nr   r   r8  )r:  c                $    U US   US   US   S.4$ )Nr   r   r   )alivedeadis_final_stepr   )tensor_argsr8  s     rj   rp  WScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node.<locals>.<lambda>X  s(    !.q!1 -a 0)6q)9Crr   )r  r3  r  nontensor_argsunflatten_args
mem_check_)r&   MemoryCheckKernelr:   r  r:  r?  _inductor_debugcheck_memory_stepdefaultr  r  r  r  )step_idxr  expected_newly_aliveexpected_newly_deadr  re   r  step_allocs_deallocss         rj   construct_mem_check_nodeEScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_nodeK  s     $8#A!#D "6x"@"C2WN''!e)<=yy00BBJJ- D %/tzz(/C/L/L/N.O"PD,T488rr   )r  )r  r   r  r   r   r  )r  r  rz  r{  r|  r   rS   rv   r  r   r  r  r  r#   r  r6  r  r  r   
size_alloc	size_freer  r  
start_stepr   end_step#torch._inductor.runtime.debug_utilsr}  r  )r  r  rz  r{  r|  r  name_to_freeable_input_bufra  buf_info_listrn  buf_infor1  r}  r  	new_nodesr  re   r  s   `                @rj   r  #Scheduler.insert_memory_check_nodes!  s   	
 	
 )31773G3G3L3L3N(O"4::|< 	# %%===

D,, *4AGG4L4L4N)O5JJ&
q! $C

O4C
4RH4C
 &H""a'H,>,>!,C//1H !4!45a8??I !2!23A6==hG & 	N	9	9*.	9&	9 	92 	 ,GAtT"(DJJRS@S;SU - 
eC
s   8Hc                  ^	 [         R                  (       d  g/ n[        U R                  5       GH  nSS jm	SnUR	                  5        H  n[        U	4S jUR                   5       5      nU(       a]  [        R                  SUR                  5       5        [        R                  R                  R                  UR                  5       5        M  SnM     UR                  5       (       + =(       a    U(       + nU(       d  UR                  U5        M  [        R                  SUR                  5       5        [        R                  R                   R                  UR                  5       5        UR"                  R$                   H  nUR&                  U R(                  ;   d  M  U R(                  UR&                     R                  nU Vs/ s H2  oR*                  R                  5       UR                  5       :w  d  M0  UPM4     snU R(                  UR&                     l        M     GM     [-        [        U5      5      U l        U R                   H  nUR/                  5         M     gs  snf )	z 
Remove any nodes without users
Nc                ~    U R                   =(       d+    U R                  5       [        R                  R                  ;   $ ra   )r  r  rS   rv   r  )r"  s    rj   can_eliminate_user;Scheduler.dead_node_elimination.<locals>.can_eliminate_user{  s&    ||Tt}}!'':T:T'TTrr   Fc              3  4   >#    U  H  nT" U5      v   M     g 7fra   r   )rh   ur  s     rj   rk   2Scheduler.dead_node_elimination.<locals>.<genexpr>  s     #M9a$6q$9$99   zremoved dead buffer: %sTzremoved dead operation: %s)r"  r  r   r   )r#   use_dcer   r  rv  rm   r	  r  rc  r  rS   rv   rn  r  rW  r   r  r   r   r   r6  re   r   r  )
r  updated_nodesre   active_buffersr   can_eliminaterv  r	  r  r  s
            @rj   r  Scheduler.dead_node_eliminationn  s    ~~
 TZZ(DU #N'') ##M399#M M II7HGG++//?%)N * !% 5 5 77N<NM $$T* 		6H**..t}}? ,,22DyyD$4$44 $ 0 0 ; A A',=',!0AT]]_0TAu=((39 3- )8 (=12
 JJD  " =s   
/I(=I(c                   ^^^^ [         [           " 5       m[        5       m/ mSUUUU4S jjmU H  nUR                  5        H  nUTU'   M
     M!     U H  nT" U5        M     T$ )z/
Ensure nodes is in topologically sorted order
c                   > U T;  af  TR                  U 5        [        U R                  S S9 H*  nUR                  T;  a  M  T" TUR                     5        M,     TR	                  U 5        g g )Nc                    U R                   $ ra   r  )ds    rj   rp  DScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>  s    affrr   rk  )r  rp  rk  r   r   )r#  r   r  r!  seenvisits     rj   r  2Scheduler.topological_sort_schedule.<locals>.visit  sa    }!!"6"6<LMCxx|3 ,sxx01	 N
 a  rr   )r#  rX   r   rQ  )r   rX   r  r  )r  r  re   r   r  r!  r  r  s       @@@@rj   r  #Scheduler.topological_sort_schedule  sj     +,.59V*,	! 	! D--/%)T" 0  D$K rr   c                N  ^  [        5       n[        U[        [        [        [
        [        45      (       a/  UR                   H  nUR                  UR                  5        M      O[        S[        U5       S35      eU 4S jU 5       n[        [        U 4S jU 5       5      5      $ )Nz+get_unmet_dep_nodes is not implemented for .c              3  ^   >#    U  H"  nTR                   U   R                  5       v   M$     g 7fra   )r6  r  r  s     rj   rk   1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>  s(     XZc))#.??AAZr  c              3  B   >#    U  H  nTR                   U   v   M     g 7fra   r  )rh   r#  r  s     rj   rk   r    s     Q=at66q9=s   )r   rb   rc   r  rp  r   rL  rk  r  r   RuntimeErrorr   r   )r  r;  
unmet_depsr   unmet_dep_opss   `    rj   _get_unmet_dep_nodesScheduler._get_unmet_dep_nodes  s    &0l
)&"$	
 	
 //sxx( 0 =d5k]!L  YZXJQ=QQRRrr   c                   / n[         R                  U R                  S5      n0 nU R                   HQ  nU R                  U5      n[	        U5      X$'   U H*  nUR                  U/ 5      nUR                  U5        XsU'   M,     MS     UR                  5        VV	s/ s H  u  pU	S:X  d  M  UPM     n
nn	U
(       a  UR                  U
5        U
 H9  nUR                  U/ 5       H  nX+==   S-  ss'   M     UR                  U5        M;     UR                  5        VV	s/ s H  u  pU	S:X  d  M  UPM     n
nn	U
(       a  M  U(       a   S5       eU$ s  sn	nf s  sn	nf )zE
Sort nodes by their topological order, return a list of node lists.
r   r   zTopological sort failed!)	r  fromkeysr  r  r   r  r   r  r  )r  r  r  childrenre   r  r   cr#  r  zero_deg_nodesr"  s               rj   r!  !Scheduler._topological_sort_nodes  s,    djj!,#%JJD,,T2Dd)EKLLb) !   ).@a!@LL(#$LLB/DK1$K 0		! $ -2KKMDMDAQ!VaMND n 444y A Es   E)EE,Ec                j   0 nU R                    Hw  n[        5       nUR                   HB  nU R                  UR                     R                  5       nUR                  U5        X1U   -  nMD     X1UR                  5       '   X2l        My     [        U R                   5       H  u  pbXbl
        Xbl        M     g)z
Populate each node.ancestors
N)r  r   rk  r6  r   r  r  r  r   r  r^  r_  )r  name_to_ancestorsre   r   r   dep_node_namer  s          rj   r  Scheduler.compute_ancestors  s    
 9;JJD)3I.. $ 0 0 : K K Mm,}==	 / 2;dmmo.&N  %TZZ0KE"N"N 1rr   c                   [         R                  (       d  g U R                   H  n[        U[        [
        45      (       a)  UR                  5       (       d  [         R                  S:w  a  MI  UR                  5        H?  n[        U[        5      (       a  UR                  5       (       a  M/  UR                  5         MA     M     g )Nhalide)r#   r  r  rb   rc   r   rN   cpu_backendrn   rI  r  )r  re   r;  s      rj   r  Scheduler.merge_loops  s    00JJD d]4F$GHHKKMMf&8&8H&D)!%775;L;L;N;N!!# * rr   c                   [        SSSS9   [        S5       H  n[        U5      n[        R	                  SUS-   U5        U R                  USS9n[        U5      n[        R	                  S	US-   UU5        XC:X  d  US:X  d  Mk  [        R	                  S
US-   5          O   [        R                  (       d  [        R                  (       a  U R                  USS9nUsSSS5        $ ! , (       d  f       g= f)z2
Combine eligible nodes into FusedSchedulerNodes.
zScheduler.fused_nodesTr  rM  z/===== attempting fusion (%d/10): %d nodes =====r   F)is_reorder_roundz=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)	r   r  r   rb  rc  fuse_nodes_oncer#   r  loop_index_inversion_in_fusion)r  r  r  old_lennew_lens        rj   rU  Scheduler.fuse_nodes  s     #4QU
 2Ye*  EE
 ,,UU,Ke*  TE	 %A$$Eq1u ' , 1188,,UT,J;
 
 
s   A3C%AC%%
C3c                    / nU R                    H:  nUR                  [        U[        5      (       a  UR	                  5       OU/5        M<     Xl         g)z1
Unpack GroupedSchedulerNode into regular nodes.
N)r  r  rb   rL  rV  )r  r  re   s      rj   r  Scheduler.process_grouped_nodes6  sF     .0	JJD!+D2F!G!GdV  
rr   c                    [        U5      S:  d   eUS   R                  5       nX l        U R                  U5      n[	        SSSS9   UR                  U5      sSSS5        $ ! , (       d  f       g= f)k
Benchmark fused list of nodes and return the execution time
in milliseconds on randomly generated inputs.
r   benchmark_fused_nodesTcompile_time_autotune_time_us)r  dynamo_compile_column_usN)r   r   r   r  r   r  )r  r  r:  r  s       rj   r  Scheduler.benchmark_fused_nodesA  sm     5zA~~q$$&$""6*#"&%D

 007
 
 
s   A""
A0c                    [        U5      S:  d   eUS   R                  5       nX@l        U R                  U5      n[	        S5         UR                  XUS9sSSS5        $ ! , (       d  f       g= f)r  r   r  hint_overrideN)r   r   r   r  r   generate_kernel_code_from_nodes)r  r  benchmark_kernelr  r:  r  s         rj   r  )Scheduler.generate_kernel_code_from_nodesS  si     5zA~~q$$&$""6*12::} ;  322s   A!!
A/c                    X l         U R                  U5      n[        S5         UR                  U5      sSSS5        $ ! , (       d  f       g= f)r  r  N)r   r  r   benchmark_codegened_module)r  moduler:  r  s       rj   r  $Scheduler.benchmark_codegened_modulef  s=     %""6*1255f= 322s	   >
Ac           
        [        U R                  5       GH  u  p[        U[        5      (       d  M  [        UR                  [
        R                  5      (       d  MH  UR                  n[        R                  R                  (       d  UR                  5       u  pEO [        S UR                  5        5       5      n[        U[        R                  R
                  R                  5      (       a  [        R                   (       a  0 nXFS'   [        R                    Hm  nUR                  US9nUR#                  5        V	V
s0 s H  u  p[        U	[        5      (       d  M  X_M      nn	n
[%        UR#                  5       S S9S   nXU'   Mo     UR                  R'                  U5        OUR                  R)                  U5        GM  [
        R*                  R-                  UR.                  5         UR1                  5       nSSS5        WR2                  n[        U[
        R4                  5      (       d   eUR2                  n[        U[
        R6                  5      (       d   eUR8                  (       a  [;        XR8                  5        UR<                  Ul        U R?                  XX5        GM     gs  sn
n	f ! , (       d  f       N= f)aP  
Finalize a backing choice for MultiTemplateBuffers which did not already have a
choice finalized through fusion. In the case of an extern choice, this will result
in replacing the SchedulerNode.

If a MultiTemplateBuffer did not have any fusion opportunities, finalizing a choice
will force completion of compilation and benchmarking.
c              3     #    U  H<  n[        U[        R                  R                  R                  5      (       d  M8  Uv   M>     g 7fra   )rb   r  r  r  ExternKernelCaller)rh   timings     rj   rk   <Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>  s7      *E) & % @ @ S S  #F*Es
   7A	ANr  c                    U S   $ r  r   r0  s    rj   rp  ;Scheduler.finalize_multi_template_buffers.<locals>.<lambda>  s	    qQRtrr   rk  r   ) r  r  rb   rc   re   r&   MultiTemplateBufferr#   r  %force_extern_kernel_in_multi_templateget_min_choicer  choice_timingsr  r  r   multi_kernel_hintsr  r  finalize_as_triton_callersfinalize_as_triton_callerr  current_originsr  output_noder   
StorageBoxOperationBufferorigin_noder5   r  _replace_node)r  r  re   
multi_nodemin_node_unfusedrn  callershinttimingsr  r  triton_timingschoiceout_tensorboxout_storage
out_buffers                   rj   r  )Scheduler.finalize_multi_template_buffersr  s-    !,GA$..:		2114 4 "YY
**PP*4*C*C*E'$a'+*4*C*C*E	($ $OO&&??  00QS(8$*$=$=D&0&?&?d&?&SG -4MMO.,;DA#-a1I#J !%,; + .
 &))=)=)?^%TUV%WF,2DM %> 		<<WE		;;<LMYY..z/A/AB$4$@$@$BM C+00!+r}}====(--
!*b.@.@AAAA))&}6L6LM$.$5$5
!"":1Ck -:. CBs   ?J0
J0
.J66
K	c                  ^ [        X!5        U R                  U5      nXPR                  U'   XPR                  UR	                  5       '   XPR
                  UR	                  5       '   0 m[        R                  " UR                  R                  UR                  5       HA  nU R                  R                  UR                  S 5      =n(       d  M2  UR                  TU'   MC     SU4S jjnU" UR                  5      Ul
        U" UR                  R                  5      UR                  l	        [        UR                  5       UR                  5       5       H2  u  pXR                   U
R	                  5       '   U
R"                  U	l        M4     UR$                  Ul        UR&                  Ul        UR(                  Ul        UR*                  Ul        g )Nc                .   > [        U4S jU  5       5      $ )Nc              3  D   >#    U  H  oR                  T5      v   M     g 7fra   )r  )rh   r   rb  s     rj   rk   ?Scheduler._replace_node.<locals>.rename_deps.<locals>.<genexpr>  s     Kdsjj)9::ds    r   )r  rb  s    rj   rename_deps,Scheduler._replace_node.<locals>.rename_deps  s    KdKKKrr   )r  rj  r   rj  )r  r  r  r  r  r  r  r  r   r   rk  r  r  r   r  rv  r6  r	  r^  r_  r   r]  )r  r  r  r  re   new_scheduler_noder   	real_namer  new_outold_outrb  s              @rj   r  Scheduler._replace_node  s{    	"*9!77
C*

1-?$--/*3E0 ??4#3#3#9#94;R;RSC 3377$GGyG.1hh + T	L 1<111
- 0;**000
&&, !$**,d.>.>.@!
G 4;W--/0#MMGM	!
 (,~~$'+~~$'+~~$(,%rr   c                &    [        S U 5       5      $ )Nc              3    #    U  H  n[        UR                  S 5      =(       a_    UR                  SL=(       aJ    [        UR                  R                  S5      =(       a#    UR                  R                  R                  S:H  v   M     g7f)r   Nscatter_moder,  )r2  re   r   r  r"  s     rj   rk   ,Scheduler._any_atomic_add.<locals>.<genexpr>  sp      

 	 AFFF# 9d"9^49 ((L89 s   B	B)r   r  	node_lists     rj   _any_atomic_addScheduler._any_atomic_add  s     

 
 
 	
rr   c                4  ^ ^^^^^^^^^^^^ ^!^"^#^$^%^&^'^( [        S TT4 5       5      n[        R                  (       d  U(       d  gTR                  5       (       a-  [	        TR                  5       [        R                  5      (       a*  TR                  5       (       d  TR                  5       (       a  gTR                  5       nUS   R                  5       mT(       d   eTR                  S:X  a  [        R                  S:w  a  gTR                  5       n[        [        R                  " XE5      5      nT R!                  U5      (       a  gSSKJm  ['        TT5      m(US   R                  5       mTc   eSUU4S jjm"[(        R*                  R,                  R/                  5       m S     SUU 4S	 jjjnU(       Ga  [        S
 TT4 5       5      (       Gau  TR                  5       SLmT(       a  TR                  5       OTR                  5       m'[	        T'[        R0                  5      (       d   e0 m!/ m [        R2                   GH@  nT'R5                  U5      m[7        TR9                  5       S S9 Hw  u  p[	        U	[(        R*                  R:                  R<                  5      (       d  M:  T'R?                  U	5         T RA                  U	/U" XiRB                  S9Q75        SSS5        My     [E        S5      nSn0 nT  HW  u  pn Ub  URG                  5         T'R?                  U	5         T RW                  UT5      u  nnUX'   UU:  a  UnU	nSSS5        MY     UT'RX                  U'   [	        U[Z        5      (       d   eUT!U'   GMC     T'R5                  5       mT'R]                  5       u  m#m$[E        S5      m%[        R^                  mT(       a-  T(       a  T Ra                  U5      OT Ra                  U5      u  m%nO%T(       d  gTRc                  5       m%[e        TTT%5      m&/ m Sn[7        TR9                  5       [f        Rh                  " S5      S9 H  u  n	n[	        U	[(        R*                  R                  RZ                  5      (       d  M;  T(       d-  [k        U	S5      (       a  U	Rl                  T'Rl                  :w  a  Mo  T(       a  UT$T%-   :  a    OTUS-  nU[        Rn                  :  a    O9T'R?                  U	5         T RA                  U	/U" U5      Q75        SSS5        M     [q        T 5      S:X  a  gSUUUUU U!U"U#U$U%U&U'U 4S jjnU$ U" U5      mU" U5      mU" U5      mSUUUUUU"U U(4S jjnU$ ! , (       d  f       GM  = f! [H         a]  n[J        RM                  [N        RP                  5      (       a)  [J        RS                  ST(       d  SOS[U        U5      5         SnAGM  SnAff = f! , (       d  f       GM  = f! , (       d  f       GM  = f)o
If config.benchmark_fusion is False, always return True.
Otherwise, return True if fusion can brings speedup.
c              3     #    U  HD  nUR                  5       =(       a(    [        UR                  5       [        R                  5      v   MF     g 7fra   )rI  rb   r  r&   r  r"  s     rj   rk   .Scheduler.speedup_by_fusion.<locals>.<genexpr>  sD       
 $ MMO J1..0"2H2HIJ#s   AATr   r8  r   CompilationErrorNc           
     z  > [         R                  [        R                  5      (       a  XU-   :  aE  [         R	                  STR                  5       TR                  5       [        X-   U -  S 5      5        g [         R	                  STR                  5       TR                  5       [        XU-   -  S 5      5        g g )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)rb  r,  r-  DEBUGrc  r  r?   r@   )ms_fusedms1ms2r   r   s      rj   
log_fusion/Scheduler.speedup_by_fusion.<locals>.log_fusion  s    &&w}}55Ci'$$S..0..0"syH&<S%AC	 $$W..0..0 Hc	$:3#?A	 6rr   c                   > TR                  U SUS9n[        R                  " U5      nTR                  5       (       d  S nXC4$ TR	                  SUS9n[        U[        5      (       d   eXC4$ )NT)r  r  triton_)kernel_namesource_code)r  r   loaduse_process_poolr   rb   r   )r  r  src_codemodfutasync_compiler  s        rj   compile_kernel3Scheduler.speedup_by_fusion.<locals>.compile_kernel)  s     ;;M < H ""8,C 1133
 : $**yh*W!#|4444:rr   c              3  D   #    U  H  oR                  5       S Lv   M     g 7fra   r	  r"  s     rj   rk   r  8  s      %
7E!!-~s    c                    U S   $ r  r   r0  s    rj   rp  -Scheduler.speedup_by_fusion.<locals>.<lambda>I  s	    aPQdrr   rk  r  infException in compiling %s: %sr  r  Fr   allowed_prologue_inpsc            	       > [        S5      n S n0 nT H  u  p4n Ub  UR                  5       nO'T(       d  UR                  nUR                  5         OS n T(       a=  TR                  U5         TR                  UT5      u  pXU'   X:  a  Un UnS S S 5        M  TU:H  =(       d    TT-   TU   T-   :  n
U(       d  M  [        UR                  5      S:X  d  M  UR                  S   R                  S::  d  M  U
(       d  M  Un  O   T(       a
  T" U TT5        T(       a	  U TT-   :  aP  UbM  [         R"                  (       a  UTS '   TR%                  T5        OTR'                  U5        UTR(                  S '   gg	! [         a]  n[
        R                  [        R                  5      (       a)  [
        R                  ST(       d  SOS[        U5      5         S nAGM  S nAff = f! , (       d  f       GM  = f)
Nr3  r4  r  r  r   r   r  TF)r  r!  r%  
precompiler  rb  r,  r-  r  rc  r   swap_as_triton_callerr  r   	launchersn_spillsr#   r  r  r  _choice_timings)min_ms_fusedms_fused_choicenew_timingsr  future	mod_fusedresr   r  pathfusible_choicebench_epiloguer  r:  epilogue_fusionfuture_choices hint_override_best_fusion_choicer"  
min_choicer   r!  	ms2_fusedr  r  s              rj   benchmark_when_ready9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready  s   $U|"& 1?-FI!!-"(--/C!/"+"3"3CNN,"&C &'==fE-1-L-L ) &.NH
 3;/'6/728 FE '&0 N"Sy>&+AI+MM '  C #CMM 2a 7 #a 0 9 9Q > ..4O!a 2@d "|S#6 ',#)*D%100AP8>"==<
 #<<_M 8CJ..t4 u % !%227==AA&,, ?2A
z #A
 !! FEs/   E7#E7E7-"G!7
GAGG!
G1	c                 (  >^^^^^^ SSK Jn    TS   TS   TS   4 H  nUc  M  UR                  5         M     TR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gTR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gTR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gT" TTT5        [        S5      (       a[  TTT-   :  aR  TT4TR                  ;  a@  TR                  R                  TT45        [        S5      R                  UUUUUU4S	 j5        TTT-   :  $ ! U  a     gT	 a  nS
[        U5      ;   a   S nAge S nAff = f)Nr   )NoTritonConfigsErrorr   z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc            	     $   > TT TTTTTT T-   -  S.$ )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratior   )r   r!  r  path1path2
path_fuseds   rj   rp  KScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>1  s&    053605365?8@3;sSy3I%rr   Loop-carried variableT))torch._inductor.runtime.triton_heuristicsrM  r!  r  mathisinfr   r  r  r   r  r   )rM  r,  r   r   r!  r  rW  rX  rY  r  r:  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2r"  r  r  s      @@@@@@rj   rJ  rK    s   A *!,)!,/2 
 ?JJL  "&!@!@)!,"JC
 zz#CD$!%!@!@)!,"JC
 zz#DE$+/+J+J/2,(Hj
 zz(++CD$xc2 0>>$c	1"EN$2I2II//33UENC(7?? 
 $cCi//+ ! ' .#a&8#s<   E* AE* 5;E* 1;E* -A<E* *F2F7FFF)r  r  r   r  r!  r  r   rQ  ra   )r  r  r  Optional[int]r   z)tuple[Optional[LambdaFuture], ModuleType]rR  )9r   r#   benchmark_fusionrI  rb   r  r&   TritonTemplateBufferrO  rn   r   r   r  r   r  r  r  triton.compiler.errorsr  rZ  r  r  r-  AsyncCompiler  r  r  rp  r  r  TritonTemplateCallerr8  r   r  r  r!  r  rb  r,  r-  r  rc  r   r  r;  r   r  benchmark_epilogue_fusionr  r  r  operator
itemgetterr2  r5   max_epilogue_benchmarked_choicesr   ))r  r   r   is_multi_templatenode_list_1node_list_2node_list_fusedr.  r  r  rn  r<  r=  r>  r?  r@  r   r  rB  rX  triton_choicesunfused_timerJ  r  r-  rD  r  r:  rE  r_  r`  ra  rF  rG  r"  rH  r   r!  rI  r  r  s)   ```                    @@@@@@@@@@@@@@@@@@rj   speedup_by_fusionScheduler.speedup_by_fusion  s       
 U^ 
 

 &&/@ u668":Q:QRR!!!! oo'Q**,v ;;%F$6$6($Boo'y{HI
 00;u% #..0!!!	 	" 55BBD PT	.	?L	6	 	  %
8=u~%
 "
 "
 $557tCO # ''),,. 
 j"*@*@AAAA  - TVN!'!:!:!+!:!:=!I!'(<(<(>N!SIF% @ @ U U  !#99&A&-- &!/$3CWCW"" BA "T  %U|FJ 1?-FI
!!-"MMO $99&A)-)H)H%v*$ /7+#l2+3L.4O BA 2@( =H
**=9!/3KLLLLBQ0?U ";Z (668N(779OJ,C#==N ' ..{;33K@ U ' 224<UE3O	 TVNN(.$$&H,?,?,B)$ "&%//*<*<*U*UVV ((?@@44
8X8XX!lcCi&?!#!F$K$KK55f="))6*TN?4S*TU >=3)8 >"a'K! K! K!Z (' !/{ ; .{ ;&4_&E#F FP ('q BA" % !%227==AA&,, ?2A
z #A
 !! BAx >=s=    $U7V
,$W4 X7
V

W1AW,,W14
X
X	c                <    U R                   UR                  5          $ )z0Look up the node in Scheduler name_to_fused_node)r  r  r  s     rj   r`  Scheduler.get_fused_nodeH  s    &&t':':'<==rr   c                .  ^ ^^^ T R                  U5        [        U5      m[        R                  [        R
                  5      (       aB  [        R                  S5        T H'  n[        R                  SUR                  5       5        M)     0 m      SUU 4S jjm      SUUU 4S jjnT R                  X5       H  u  pVU" XV5        T R                  U5      nT R                  U5      nT R                  XVU5      (       d  MH  T R                  XV5      (       a  M`  T R                  XV5      n[        U5      (       a  XuU4TU'   XuU4TU'   M  U(       d  M  T" XV5        M     [        5       nTR                  5        Hx  u  pnX;   a  M  UR                  U	5        T R                  U
5      U
L d   eT R                  U5      UL d   eU	" 5       (       d  MX  T R                  X5      (       a  Mp  T" X5        Mz     [!        TS S9nT R#                  U5      nU$ )	z
Combine eligible nodes into FusedSchedulerNodes.

This relies on two key functions to control the logic:
    - self.can_fuse(): checks if a fusion is legal
    - self.score_fusion(): assigns priority to a given fusion
zfuse_nodes_once, candidates:z  %sc                  > [         R                  SU R                  5       UR                  5       5        U R                  5       nUR                  5       U:X  d   eTR	                  U5      R                  X5      nTR                  U 5        TR                  U5        TR                  U5        TR                  R                  UR                  5        Vs0 s H  oDR                  5       U_M     sn5        U$ s  snf )Nzfusing %s with %s)rb  rc  r  r   r  rS  r  r  r  r  rn   )r   r   r:  node3r#  r  r  s        rj   fuse_two_nodes1Scheduler.fuse_nodes_once.<locals>.fuse_two_nodesf  s     0%..2BENNDTU%%'F##%///$$V,11%?Eu%u%OOE"##**.3oo.?@.?u$.?@ L As   C8c                B  > TR                  U 5      T;   d  TR                  U5      T;   a  TR                  TR                  U 5      TR                  TR                  U5      5      5      nUc   eUu  p4nTR                  US 5        TR                  US 5        TR                  U5      UL d   eTR                  U5      UL d   eU" 5       (       a  TR                  X5      (       a  M  T" XE5        TR                  U 5      T;   a  M  TR                  U5      T;   a  M  g g ra   )r`  r  r  will_fusion_create_cycle)	r   r   pending_fusion
is_speedup	node_key1	node_key2ry  pending_fusionsr  s	         rj   resolve_pending_fusions:Scheduler.fuse_nodes_once.<locals>.resolve_pending_fusionsv  s    ##E*o=&&u-@!0!4!4''.#''(;(;E(BC" &1113A0
y##It4##It4**95BBB**95BBB!||t'D'DU'R'Ry4' ##E*o=&&u-@rr   c                    U R                   $ ra   rG  r0  s    rj   rp  +Scheduler.fuse_nodes_once.<locals>.<lambda>  s    !++rr   rk  )r   rX   r   rX   r   rX   ri  )r	  r   rb  r,  r-  r  rc  r  get_possible_fusionsr`  r   r|  rr  callabler   r  rp  r  )r  r  r  re   r  r   r   speedupseen_pair_speedup_fnis_speedup_fnr  r  ry  r  r  s   `           @@@rj   r  Scheduler.fuse_nodes_onceL  s     	!!%( '""7==11;<#  )=)=)?@ $  	
	$	->		 	 	5$	5->	5	5 	52 !55eNLE $E1''.E''.E}}. 33EAA00>G$$.5e-DOE*.5e-DOE*u,) O, @J|3B3I3I3K/Mi4 $$]3&&y1Y>>>&&y1Y>>>t'D'D( ( y4 4L {(=>..u5rr   c                   [        U R                  5      nSn[        U R                  5      n[        R	                  SU5        [        [        R                  U 5      5       GH(  u  pV[        R                  U5      n[        U5      S:  a  M,  Ub  X1:  a    OU R                  U5      (       d  [        R	                  SU5        Md  US-  n[        R                  S:  n[        US   R                  USUS9n[        R                  S	[        U5      U5        U H  n	UR                  U	5        M     UR                  U5        U R                   R#                  UR%                  5        V
s0 s H  oR'                  5       U_M     sn
5        GM+     [)        US
 S9U l        U R+                  U R                  5      U l        [        R                  SUU[        U R                  5      5        U R-                  U R                  5        gs  sn
f )z
Groups parallel nodes
r   z2ComboKernels: Generating with num_ck_nodes = %s...r   Nz)ComboKernels: Not speeding up %d-th groupr   Tr6  z0ComboKernels: Combining %d nodes for %d-th groupc                    U R                   $ ra   rG  r0  s    rj   rp  5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>  s    q{{rr   rk  zDGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodes)r   r  r   r  rc  r  r  r3  r  speedup_by_combo_kernelr#   r8  r  r  r  r  r  r  rn   r  rp  r  r	  )r  r  r  r  num_nodes_orignumr  r  r>  re   r#  s              rj   r   #Scheduler.create_combo_kernel_nodes  s    !,TZZ		FU'&DDTJ
NC 3CCINI9~!'E,@//	::		EsKQJE$;;a?O4!&&*. /	K HHBI
 """4( "OOK(##**4?4I4I4KL4Kq{*4KL7
< K-BC
33DJJ?
R

O		
 	!!$**- Ms   (H
c                L    U H  nUR                  U R                  5        M      g ra   )r	  r  )r  r  re   s      rj   r	  Scheduler.prune_redundant_deps  s     D%%d&=&=> rr   c                  ^ ^^
^ / m
[         [        [        [        4      " 5       mSUU
UU 4S jjn[        R                  " [
        5      nU HE  nT R                  U5      (       a  M  UR                  5        H  nXF   R                  U5        M     MG     UR                  5        H  nU" U5        M     [        R                  (       ak  [        R                  " [
        5      nU H,  n[        USS5      n	U	(       d  M  X   R                  U5        M.     UR                  5        H  nU" U5        M     T R                  T
5      m
T
R                  T R                  SS9  [         R#                  S[%        T
5      5        T
$ )zN
Helper to find all legal fusion opportunities, sorted by self.score_fusion()
c                  > [        U 5       H  u  pU US-   US-   [        R                  -     H  nX#4nUT;   a  M  TR                  U5        TR	                  X#T5      (       a  TR                  U5        MI  UR                  5       (       d  UR                  5       (       d  Mu  TR	                  X2T5      (       d  M  TR                  X245        M     M     g r  )r  r#   )max_fusion_buffer_group_pairwise_attemptsr  r   r   rI  rO  )	r  node1_indexr   r   rl  r  possible_fusionsr  r  s	        rj   check_all_pairs7Scheduler.get_possible_fusions.<locals>.check_all_pairs  s    &/&6""!Ok'FF'GE
 !.Cd{ HHSM}}U3CDD(//4++--1A1A1C1C&6J J )//?! '7rr   r{   NT)rl  reversezfound %d possible fusionsr  r(  r   rQ  )r   r   rX   r  r   r   unfusable_noder   r   r   r#   aggressive_fusionr  *get_possible_fusions_with_highest_priorityr  score_fusion_keyrb  rc  r   )r  r  r  r  buffer_names_groupingre   r   node_groupinggroup_groupingr{   r  r  s   ` `       @@rj   r  Scheduler.get_possible_fusions  sV    % 13D DEFH	@ 	@( !, 7 7 =D""4((--/%*11$7 0 
 399;MM* < ##(44T:Ngt45")006  "0!6!6!8. "9  JJ
 	$"7"7F4c:J6KLrr   c                  ^ ^^^^ [         [           " 5       mSUUUU U4S jjmUR                  5       R                  R	                  5       UR                  5       R                  R	                  5       -  mUR
                  R                  R	                  5       UR
                  R                  R	                  5       -  T-
  m[        UU 4S jT 5       5      nU(       a  [        X5      " S5        U$ )zf
Finds whether there's a path from node1 to node2 (or vice-versa)
caused indirectly by other fusions.
c                ,  > [        U [        5      (       a~  U T;  ax  TR                  U 5        U R                  5       R	                  T5      (       a  g[        TU R                  -  5      =(       d#    [        UU4S jU R                  T-
   5       5      $ g)NFc              3  N   >#    U  H  nT" TR                   U   5      v   M     g 7fra   r  rh   r#  
found_pathr  s     rj   rk   IScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>?  s,      H!DA #4#:#:1#=>>!D   "%)rb   r   r  r   issubsetr   r   r   )re   combined_ancestorscombined_namesr  r  visiteds    rj   r  6Scheduler.will_fusion_create_cycle.<locals>.found_path.  s    $ 233G8KD!++-667IJJ !   ?@ C H!%2D!DH E  rr   c              3  N   >#    U  H  nT" TR                   U   5      v   M     g 7fra   r  r  s     rj   rk   5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>M  s&     WDVqJt66q9::DVr  zwill create cycler   )r   r   r   _dictr   r   r   rZ  )r  r   r   cycler  r  r  r  s   `   @@@@rj   r|  "Scheduler.will_fusion_create_cycle$  s     /02	 	2 %%'--224'')//4467 	
 OO!!&&(5??+@+@+E+E+GG WDVWWe#$78rr   c                  ^ ^ SSK Jm      SU 4S jjnU" U5      nU" U5      n[        U4S jU 5       5      n[        U4S jU 5       5      nUR                  U5      nSn	U H  n
 U	[	        U
S   5      -  n	M     T R                  X5      n[        R                  R                  R                  U	S	U-  5      (       a  g
g! [
         a       gf = f)a  
Return true if fusing the two nodes can potentially increasing peak memory.

The implementation is more like a heuristic since we don't really know if we are at peak
or not when trying to fuse these two nodes. The order of nodes may change later which makes the
peak memory estimation hard.

Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
1. find all buffers read by each node with a single user. These buffers are supposed to
   be reused if we don't fuses these 2 nodes
2. find the intersection of these buffers for the two node and sum the total buffer size.
   If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
   Note that the extra memory allocation is not necessarily causing peak memory increase.
   This is just a heuristic.

We return true only if the saving for fusion can not trade off the extra memory allocation.
r   )buffer_reuse_keyc                P  > / nU R                   R                   H  nTR                  R                  UR                  5      nU(       d  M1  [        UR                  5      S:X  d  ML  UR                  R                  5       (       d  Mm  UR                  UR                  5        M     U$ r  )
r   r   r6  r  r   r   r	  re   has_tensor_outputr   )re   rw  r  r   r  s       rj   _find_single_user_inputsKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputsi  sw     F&&,,&&**277333syy>Q.3883M3M3O3OMM#((+ - Mrr   c              3  4   >#    U  H  nT" U5      v   M     g 7fra   r   rh   r   r  s     rj   rk   <Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>w       #S]c$4S$9$9]r  c              3  4   >#    U  H  nT" U5      v   M     g 7fra   r   r  s     rj   rk   r  x  r  r  r   r   F    T)re   rX   r   zlist[ir.Buffer])rf  r  r   intersectionr   r  r  rS   rv   rw   statically_known_gt)r  r   r   r  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadrl  	bw_savingr  s   `           @rj   can_fusion_increase_peak_memory)Scheduler.can_fusion_increase_peak_memoryR  s    * 	6	#		 1707##S]#SS##S]#SS*77G$C3s1v;. % ,,U:	 77//iPP  s   (C
CCc                   [        UR                  5        Vs/ s H  oDR                  5       PM     snUR                  5        Vs/ s H  oDR                  5       PM     sn-   5      n[        S UR                  R                   5       5      n[        S UR                  R
                   5       5      nXv-  n[        5       n	UR                  R                   HA  n
U R                  U
R                  U5      (       d  M&  U	R                  U
R                  5        MC     [        S UR                  R
                   5       5      [        S UR                  R
                   5       5      -  n[        S UR                  R                   5       5      [        S UR                  R                   5       5      -  nX-
  nX-
  nX-  n[        U5      U:  $ s  snf s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7fra   r  r  s     rj   rk   EScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>  s     &T;SCxx;Sr  c              3  8   #    U  H  oR                   v   M     g 7fra   r  r  s     rj   rk   r    s     %R:Q3hh:Qr  c              3  8   #    U  H  oR                   v   M     g 7fra   r  r  s     rj   rk   r    s      $
 7HH 7r  c              3  8   #    U  H  oR                   v   M     g 7fra   r  r  s     rj   rk   r    r  r  c              3  8   #    U  H  oR                   v   M     g 7fra   r  r  s     rj   rk   r    s      %
 8HH 8r  c              3  8   #    U  H  oR                   v   M     g 7fra   r  r  s     rj   rk   r    s     D+CCxx+Cr  )
r   rn   r  r   r  r   $can_buffer_be_removed_through_fusionr   r  r   )r  r   r   	thresholdre   fused_node_namesnode1_write_namesnode2_read_namesreads_removed_through_fusionwrites_removed_through_fusionr&  all_read_namesall_write_namesunique_readsunique_writesunique_io_bufferss                   rj   (fusion_prevent_too_many_reads_and_writes2Scheduler.fusion_prevent_too_many_reads_and_writes  s    &).):;):]]_):;+0??+<=+<4}}+<=>
 '&T5;L;L;S;S&TT%%R%:K:K:Q:Q%RR'7'K$ :D%**11I88 0  .11)..A	 2 $ $
 % 1 1 7 7$
 
C5+<+<+B+BCCD
 % %
 % 1 1 8 8%
 
D5+<+<+C+CDDE
 &D (G )8$%	11M <=s   GG
c                    [        [        UR                  UR                  -
  5      [        UR                  UR                  -
  5      5      nUS:  $ )a  
This function prevents fusion for nodes that can increase memory
footprint. This problem is more common in horizontal fusion, where nodes
that are far apart in the original order get fused, lengthening the live
intervals of tensors. This is very evident in models with activation
checkpointing, where the recomputed nodes from different checkpointed
regions get fused and significantly increase the memory footprint.

The current attempt is a quick, possibly hacky, heuristic to prevent the
fusion of nodes that are far away in the original order.

A better but difficult to implement heuristic would be to use live
intervals of the buffers, find region of peak pressure in the original
program and prevent fusion that crosses that peak region. We might need
special care or good approximation in this implementation, as fusion of
node changes live intervals, and re-computing live intervals and peak
memory after each fusion can introduce large compilation overhead.
@   )r  rx  r^  r_  )r  r   r   proximity_scores       rj   are_long_distant_nodes Scheduler.are_long_distant_nodes  sE    * %//12%//12
 ##rr   c                (   0 nUR                   R                  5        Vs0 s H  oUR                  U_M     nnUR                   R                  5        Vs0 s H  oUR                  U_M     nnU GH  n[        R                  R                  U5      n	Xh   n
Xx   n[        U
[        5      (       a  [        U[        5      (       d  S[        U
5       S[        U5       3XH'   Ms  U
R                  5       UR                  5       :w  a(  SU
R                  5        SUR                  5        3XH'   M  [        U
R                  5      [        UR                  5      :w  a  SXH'   M  U
R                  5       nUR                  5       nX:w  a  SU SU 3XH'   GM!  U
R                  5       UR                  5       :X  a  SU
 SU 3XH'   GMP  Sn[        U	[        R                  5      (       d  SU	R                    3nS	U
 SU S
U 3XH'   GM     [#        U5      $ s  snf s  snf )ze
Try to decide reasons why fusion fail due to no shared memory even though
there are common buffers.
znot MemoryDep: rt   zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r  zLayout: zUnknown reason: z. )r   r  r   rS   rv   r  rb   r/   r   r   rR   r   
get_offsetnormalize_with_stride_orderr&   r  r  r   )r  r   r   common_buf_namesreasonsr   node1_name2depnode2_name2depr1  r   lhs_deprhs_deplhs_offrhs_off
layout_strs                  rj   decide_fusion_fail_reason#Scheduler.decide_fusion_fail_reason  s    383D3D3U3U3WX3WC((C-3WX383D3D3U3U3WX3WC((C-3WX(H''$$X.C$.G$.Ggy11GY9W9W%d7m_F4=/J !   "g&7&7&99'(9(9(;'<F7CTCTCVBWX !  W\\*mGLL.II$/!((*G((*G! '9	y$Q! 3356689 '=WIVG9$U! Jc2#5#566'

|4
"7)6'"ZLI U )\ 7|c YXs   H
Hc                .	   [         R                  (       d  g[        S X4 5       5      (       a  gUR                  R	                  5       nUR                  R	                  5       nX4-  nU(       d  g[        S UR                   5       5      nXc-
  (       a  g[        U5      S:  a  g[        UR                  R                  5      S:  d#  [        UR                  R                  5      S:  a  g[        [        UR                  R                  5      5      n[        [        UR                  R                  5      5      n[        U[        5      (       a  [        U[        5      (       d  gUR                  R                   V	s0 s H  oR                  U	_M     n
n	UR                  U
;  a  gXR                     n[        U[        5      (       d  gUR                  5       nUR                   UR                   :w  a  UR"                  UR"                  :w  a  gUR"                  UR"                  :w  d  [        UR$                  5      S:w  a  g[        UR&                  R(                  5      S:w  a  gUR&                  R*                  (       a  gSUR&                  R(                  ;   a  SUR&                  R(                  ;   d   e[        S UR&                  R-                  5        5       5      n[        U5      S:w  a  g[        [        U5      5      nXR&                  R(                  S   :X  a  SnSnO"XR&                  R(                  S   :X  d   eSnSnS	S
KJn  UR&                  R2                  S	   n[        U5      S:w  a  g/ n[4        R6                  R9                  U5       H;  nUR;                  [<        R>                  R@                  RC                  U5      5        M=     [E        U5      nU" UUS	   5      nUc  gUR&                  R(                  U   UR&                  R(                  U'   UUR&                  R(                  U'   URG                  SS5        U RI                  X5      n[        U[J        5      (       d   e[L        RO                  SU5        U$ s  sn	f )a  
Attempts to enable fusion between two nodes by inverting indexing patterns.

This optimization targets cases where node1 has a contiguous write and
node2 has a contiguous write but discontiguous read. By inverting the
indexing in node2's read and write operations, we can make them compatible
with node1 for potential fusion.

Args:
    node1: First scheduler node (source)
    node2: Second scheduler node (target for inversion)

Returns:
    int: Fusion score if successful, 0 if optimization not applicable
r   c              3  @   #    U  H  oR                  5       v   M     g 7fra   r;  r"  s     rj   rk   AScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>*  s     2>axxzz>r  c              3  8   #    U  H  oR                   v   M     g 7fra   r  r  s     rj   rk   r  6  s      .
 8HH 8r  r   r   index0index1c              3  $   #    U  H  ov   M     g 7fra   r   )rh   r  s     rj   rk   r  q  s     %T7Std7Ss   r   )generate_inverse_formulaTFz!Shared memory after inversion: %d)(r#   r  r   r   buffer_namesr   rk  r   r   r  r  r%  rb   r/   r   r  r   r   rP  r   r   	subblocksget_read_exprs$torch._inductor.invert_expr_analysisr   varsr   Add	make_argsr   rS   rv   rw   combine_modular_indexing_pairsr   r  r  r   rb  r  )r  r   r   node1_buffer_namesnode2_buffer_namescommon_buffer_namesnode2_unmet_dependencies
node2_readnode2_writer   node1_writesnode1_writenode2_read_exprs	read_exprread_expr_indexwrite_expr_indexr   r  simplified_termstermsimplified_read_exprinverse_formulascores                          rj   $shared_data_after_inverting_indexing.Scheduler.shared_data_after_inverting_indexing  s   & 442E>222 #..;;="..;;=0E" $. .
 % 8 8.
 $
  $8'(1, u  &&'!+s53D3D3K3K/Lq/P$u006678
4 1 1 8 89:*i00
9
 9
 161B1B1I1IJ1I##1IJ??,."??3+y11 "++- !2!22  K$4$44??k...#j6J6J2Kq2P u{{))*a/ ;;   222EKK666	
7
 &%Tu{{7Q7Q7S%TT A%./0	 228<<&O' : :8 DDDD&O'Q[[%%a(
z?aII''	2D##  ??E 3  ##3423GTUW "
 7<kk6P6P7
""?3 8G""#34 	""4/((6%%%%%;UCm Ks    Rc                   [         R                  (       a  [        S X4 5       5      (       a  gUR                  5       (       d  UR                  5       (       a  gUR                  R                  5       nUR                  R                  5       nX4-  nU(       d  gUR                  R                  5        Vs0 s H  ofR                  U_M     nnUR                  R                  5        Vs0 s H  ofR                  U_M     nn/ n	U Hw  n
Xz   nX   nUR                  5       UR                  5       :X  d  M/  U	R                  [        R                  R                  R                  UR                  5       SS9UU45        My     [        U	5      S:X  a  g[!        U	["        R$                  " S5      S9u  pn['        U[(        5      (       a  ['        U[(        5      (       d  gUR*                  UR*                  :w  a4  UR-                  5       UR-                  5       :X  a  U R/                  U5      $ gSnUR1                  5       (       d  UR3                  X5      nOZUR1                  5       (       d  UR3                  X5      nO3[4        R7                  SUR9                  5       UR9                  5       5        U(       a*  [:        R<                  " [>        U RA                  X5      5      $ S$ s  snf s  snf )as  
Right now just greedily reorder the loop of node1 to be compatible with node2,
but ideally we should have some heuristics to reorder the loop for node2
to be compatible with node1 if that's more efficient.

Return the amount of shared data re-computed in this method.
If no such recomputation happens, return -1 (not return 0 since 0 is a valid
amount of shared data).

c              3  @   #    U  H  oR                  5       v   M     g 7fra   r  r"  s     rj   rk   >Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>  s      8
 .1HHJJr  r   r   r   rk  Fz?Don't reorder loops since both nodes are reductions: %s v.s. %s)!r#   r  r   rI  r   r  r  r   r  r   rS   rv   rw   r   r   r   r  ri  rj  rb   r/   r  r  dep_size_hintrd   r  r  rc  r  r  r  r   r  )r  r   r   r	  r
  r  r   r  r  
candidatesr   r  r  _numel	reordereds                  rj   !shared_data_after_reordering_loop+Scheduler.shared_data_after_reordering_loop  s     00C 8
!&8
 5
 5
 
 %"3"3"5"5"..;;="..;;=0E"383D3D3U3U3WX3WC((C-3WX383D3D3U3U3WX3WC((C-3WX 
.K$1G$1G3356689 !!((2273D3D3FQR2S / z?a $'zx7J7J17M#N '9--Z5S5Sw///
   "g&7&7&99))'22	!!##77II##%%77II##Q    KKT55eCD	
 	
g YXs   6K!*K&c                    [        U[        [        45      =(       a6    UR                  5       (       + =(       a    [	        UR
                  5      (       + $ )z.
Is this node unfusable under any conditions.
)rb   r  rp  rI  rP   re   r  s     rj   r  Scheduler.unfusable_node  sD    
 t79OPQ C$$&&C7		BB	
rr   c                   UR                  5       [        R                  R                  ::  a  gUR	                  5       nUR                  5       nSnXEU-  :  a	  U" S5        g[        S UR                  5        5       5      nU[        R                  R                  R                  R                  4:X  a	  U" S5        gS	S jnU" UR                  5       R                  5      (       a  UR                  5       (       d	  U" S5        gg)
zD
Heuristics to avoid benchmarking predictably slow prologue fusions
T皙?z@prologue fusion will not increase amount of bytes read in kernelFc              3     #    U  HT  nUR                   c  M  UR                   R                  5         H#  nUR                  S:X  d  M  UR                  v   M%     MV     g 7f)Ncall_function)re   r  r  r  )rh   r#  r   s      rj   rk   EScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>(  sS      
.vv  VV'')tt&	 AHH * .s   A,AAz\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsc                F    U R                   S:*  =(       a    U R                  $ )Nr   )itemsizeis_floating_point)r  s    rj   low_prec_fpGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fp5  s    >>Q&B5+B+BBrr   zVprologue fusion that must be upcast to fp32 not profitable for low precision templates)r  ztorch.dtyper   r   )r   rS   rv   invoke_quant_opsr  r  r   rn   r  r?  r@  constant_pad_ndr  r  r  r&  )	r  prologue_noder  r  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERr  r/  s	            rj   (check_prologue_fusion_heuristics_fusable2Scheduler.check_prologue_fusion_heuristics_fusable  s     ,,.!''2J2JJ"88:
#::< &)"'AABRS  
",,.
 
 uyy~~55==??n 	C @@BHHII!>>@@h rr   c                  ^  [        U[        5      (       a  [        U[        5      (       d  g[        UR                  [        R                  5      (       a)  [        UR                  [        R                  5      (       d  gUR                  5       (       d  UR                  5       (       a  g[        R                  S:X  a  gUR                  UR                  pCUu  pVUu  pxUR                  5       (       d2  UR                  5       (       d  Xh:w  d  [        U5      [        U5      :w  a  g[        UR                  R                  5      S:  d#  [        UR                  R                  5      S:  a  gT R                  [        [        UR                  R                  5      5      5      n	T R                  [        [        UR                  R                  5      5      5      n
[!        X5      [        R"                  :  a  gSU 4S jjnU" U5      (       d  U" U5      (       a  g/ n[%        ['        XW5      5       H   u  nu  pX:w  d  M  UR)                  U5        M"     [        U5      S:w  a  gUS   nUU   UU   nn[*        R,                  R.                  R1                  UU5      (       a  UUU4$ [*        R,                  R.                  R1                  UU5      (       a  UUU4$ g)a?  
Fusing two small pointwise nodes significantly reduces kernel overhead
and launch overhead. However, slightly different sizes would prevent fusion.
Here, we decide if expanding sizes of one node is profitible by allowing
fusion, and returns the dimension to expand, node with smaller sizes,
and new size after expand.
Nr  r   c                  > U R                   R                   H  nUR                  TR                  ;   a  TR                  UR                     nO%TR                  R                  UR                  5      nU(       d  Me  [        R                  R                  R                  X 5      (       d  M  [        UR                  [        5      (       a  M    g   g)NTF)r   r   r   r5  r6  r  rS   rv   r0  ro  rb   r  rp  )re   rv  rw  r  s      rj   has_reusable_bufferIScheduler.get_expand_dim_for_pointwise_nodes.<locals>.has_reusable_buffer}  s    ((..99 ; ;; $ ; ;DII FI $ 0 0 4 4TYY ?I I,,66yGG&y'<'<>TUU / rr   r   r   )rb   rc   re   r&   r   r  r#   r  r  rd   r   r   r  r  r  r%  r  small_memory_access_thresholdr  r  r   rS   rv   rw   statically_known_lt)r  r   r   n1_sizesn2_sizesn1_iter_sizesn1_reduce_sizesn2_iter_sizesn2_reduce_sizesnode1_write_memorynode2_write_memoryr;  mismatch_dimensionsidxn1_sizen2_sizemismatch_dimmismatch_size1mismatch_size2s   `                  rj   "get_expand_dim_for_pointwise_nodes,Scheduler.get_expand_dim_for_pointwise_nodesC  sl    %//z%7W7W uzz2#4#4555::r'8'899 ))++u/M/M/O/O ) #\\5<<()1&)1&  !!##1=!S%77 u  ''(1,E4E4E4L4L0MPQ0Q "//T%:K:K:R:R5S0TU!//T%:K:K:R:R5S0TU"7223 	  u%%)<U)C)C !'0]1R'S#C#'!#**3/ (T "#q(*1-,',' ' 77//OO66WW11..QQ66rr   c                  ^ XL a  g[        U[        5      (       a  UR                  U5      $ [        U[        5      (       a  g[        X5      nUR	                  5       (       a4  U R                  UR                  5       5      R                  X5      (       a  g[        U[        5      (       d  [        U[        5      (       a	  U" S5        g[        U[        [        45      (       a  UR	                  5       (       d	  U" S5        g[        U[        [        45      (       a  UR	                  5       (       d	  U" S5        gUR                  5       UR                  -  (       a	  U" S5        gUR	                  5       (       Gac  [        R                  (       d	  U" S5        gUR                  5       (       d  UR	                  5       (       a	  U" S5        gUR!                  5       n[        U["        R$                  5      (       d	  U" S	5        gUR'                  5       n[)        S
 UR*                   5       5      U-
  nUR-                  5       U-  (       a	  U" S5        gUR/                  5       (       d  UR/                  5       (       a	  U" S5        gUR1                  5       mTSS  HK  n	U	R3                  5       n
U
 H2  n[5        U4S jUR6                   5       5      (       a  M)  U" S5            g   MM     [        U[8        5      (       d  U/O2UR:                   Vs/ s H  oR	                  5       (       d  M  UPM     snn[=        U5      S:X  d   eUS   n[=        TS   R>                  5      S:X  aU  [=        TS   R>                  S   R6                  5      S:X  a,  TS   R>                  S   R6                  S   R@                  UL d	  U" S5        gU RC                  XU5      (       d  gUR	                  5       (       aH  UR/                  5       (       d*  UR                  5       (       d  [        RD                  (       d	  U" S5        gUR-                  5       [F        RH                  RJ                  -  (       d0  UR-                  5       [F        RH                  RJ                  -  (       a	  U" S5        gUR                  5       nUR                  5       nUU:w  a  U" SUU5        gAU RM                  XUS9n[        U[N        5      (       d   eU(       aB  U[        RP                  :  a.  [        RR                  (       a  U RU                  X5      nUS:  a  Un[        RV                  (       aX  U RY                  X5      =n(       a@  Uu  nnnUR[                  UU5        U RM                  X5      n[        U[N        5      (       d   e[        R\                  (       a-  U[        RP                  :  a  U R_                  X5      nUS:  a  Un[`        Rc                  [d        Rf                  5      (       a4  [`        Ri                  SURk                  5       URk                  5       U5        [F        Rl                  Ro                  XUU5      (       d  gUR                  5       UR                  -  (       a`  U Rq                  X5      =(       aH    [F        Rl                  Rq                  XUU5      =(       a     U R                  U5      Rq                  X5      $ [F        Rl                  Rs                  XUU5      =(       a     U R                  U5      Rs                  X5      $ s  snf )zR
Determine if it is possible to combine node1 and node2 into a
single fused node.
FTz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz2prologue fusion only supported for TritonTemplatesc              3  @   #    U  H  oR                  5       v   M     g 7fra   r  )rh   inps     rj   rk   %Scheduler.can_fuse.<locals>.<genexpr>  s     E_c<<>>_r  z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesNr   c              3  @   >#    U  H  oR                   T;   v   M     g 7fra   ro   )rh   r"  prologue_nodess     rj   rk   rS    s     QytyyN:ys   z7template prologue can only fuse nodes with a single user   r   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)r  z%s and %s has %s shared data):rb   r  r  rZ  rI  r  r   can_fuse_multi_outputs_templaterL  r  rp  r   r   r#   prologue_fusionrd   r  r&   rd  get_allowed_prologue_inpsr   r6  r  r  rn   rv  rm   r	  r   r   r   rd  re   r7  rE  rS   rv   no_fuse_buffer_namesr  r   score_fusion_memory_thresholdr  r#  $expand_dimension_for_pointwise_nodesrN  r  r  r  r  r,  r-  r  rc  r  choicesr   can_fuse_verticalcan_fuse_horizontal)r  r   r   can_reorderr  r  r
  r5  unsupported_prologue_argsre   	node_outsr   r#  template_snodestemplate_snoder:  device2shared_data_scorenew_shared_data_scoreexpand_analysis
expand_dimsmaller_nodeexpand_sizerU  s                          @rj   r   Scheduler.can_fuse  s.    >e455&&u--e455 %4#3#3$

)
)%
7$8 e122j'7
 7
 ABu8:PQRR%%''()u8:PQRR%%''()$$&8,-))01!!##u'8'8':':HI779Hh(?(?@@HI$,$F$F$H! EX__EE'( &
 %%'*CCQR--//53Q3Q3S3SPQ"__.N&s+ ,,.	$CQsyyQQQUV$ % , "%);<< !&AAaA 
 '1,,,,Q/N N2&../14r*2215;;<A"2&..q177:??>Q[ @@sSS**,,!!##))12""$qww'C'CC""$qww'C'CC56!!#""$W,fg> 444M 5 
 +S1111 !F$H$HH11$($J$J5$X!$)$9!66#FFuTTOT6E3Z{<<ZU $ 8 8 F/5555 11!F$H$HH$($M$M%! %)$9!))'--88##.  !	 yy!!$u6GHH$$&8 &&u4 MII//UDUVM$$V,>>uL 9900U$5 M""6*>>uLMs Bs   ]/]c                   UR                  5       n[        X5      n[        [        5      nUR                   Ht  nU R
                  R                  UR                  UR                  5      n[        U[        5      (       a  U R                  XaU5      (       a  Ma  XW   R                  U5        Mv     UR                  R                   H  n[        U[        5      (       d  M  UR                  U R
                  R                  UR                  UR                  5      5      n	U	(       d  Mb  U	 H,  n
U R                  X5      (       d  M  U	R!                  U
5        M.     M     [#        S [$        R&                  R)                  UR+                  5       5       5       5      nX-  (       a	  U" S5        gUR-                  5       nU HJ  nU R.                  U   R1                  5       nXR2                  U   R4                  -  (       d  MB  U" S5          g   g)z
Check if it is legal to fuse a consumer (node2) into a producer (node1).

We can fuse them if all the reads of node2 either match
corresponding writes in node1, or are written by nodes that can
be scheduled before the fusion of node1 and node2.
c              3  :   #    U  H  nUR                   v   M     g 7fra   r  r  s     rj   rk   .Scheduler.can_fuse_vertical.<locals>.<genexpr>{  s      $
U HHUr  zmemory deps did not matchFz(intermediate nodes between node1 & node2T)r  rZ  r   r   rk  rb  r  r   rb   r1   r  r   r   r  r/   fusable_read_and_writer  r   r  r  r	  r   r   r6  r  r  r   )r  r   r   node1_buf_namesr  remaining_deps_by_namer   r   cd	remainingr  remaining_depsnode1_op_namesr  s                 rj   r]  Scheduler.can_fuse_vertical\  s     002%7B47H++C((,,SXXsxx@D#w''D,A,A#e,T,T"(//4	 , ##**Bb),,.22%%))"''277;I y#B222::!((, $ + $ $
 445K5R5R5TU$
 

 +
 +,224"D&&t,==?G 7 7 @ J JJJ>?	 # rr   c                  ^ UR                   UR                  5       ;  a  gUR                  R                   Vs/ s H!  nUR                   UR                  :X  d  M  UPM#     nn[        U5      S:w  a  gUS   m[        T[        5      (       a  g[        T[        5      (       d   e[        TR                  [        R                  5      (       a  gU R                  UR                     nU/n[        U[        5      (       a  UR                  nSnU He  n	U	R                  R                    V
s/ s H  n
U
R                   U:X  d  M  U
PM     nn
U(       d  MD  US-  n[#        U4S jU 5       5      (       a  Me    g   US:*  $ s  snf s  sn
f )NFr   r   c              3  $  >#    U  H  n[        U[        5      =(       ai    [        UR                  [        R
                  5      (       + =(       a9    UR                  TR                  :H  =(       a    UR                  TR                  :H  v   M     g 7fra   )rb   r/   r   r   r    TMPr   )rh   rv  rR  s     rj   rk   -Scheduler.fusable_weak_dep.<locals>.<genexpr>  sn      
 +D	 4+ ,+DJJAA,JJ%++-, II+, +s   BB)r   r  r   r  rZ  r   rb   r0   r/   r   r   r    ry  r  r  r   r   rm   )r  weak_depr   r   rR  mutating_writesr
  relevant_reading_nodesnum_concurrent_readsreading_noderv  relevant_readss       `       rj   r  Scheduler.fusable_weak_dep  s    == 6 6 88 **11
1zzX222 1 	 

 1$"eW%%%++++u{{DHH55++H,A,AB	"'e788%*\\" 2L )44:::D99	) :  
 " A%  
 +   ! 3" $q((K
*s   E>E>+FFc                8   [        U[        5      (       Gab  U R                  R                  UR                  UR                  5      nX2R                  :w  dR  [        UR                  [        R                  5      (       d)  [        UR                  [        R                  5      (       a  g[        R                  (       a:  UR                  UR                  :w  a   UR                  5       nUR                  5       nUR                  UR                  :H  =(       aa    [        UR                  5      [        UR                  5      :  =(       a/    UR                  S [        UR                  5       UR                  :H  $ [        U[        5      (       a  U R                  R                  UR                  UR                  5      nU R                  R                  UR                  UR                  5      nUR                   UR                   :X  a  UR                   b  X4:X  a  ggr=  )rb   r/   rb  r  r   r   r   r    ry  r#   r  r  r  r   r   r0   r+  )r  rv  rR  	read_name
write_names        rj   ro   Scheduler.fusable_read_and_write  sh   dI&&--11$))TYYGI ZZ'&tzz488<<&u{{DHH==00T]]enn5T ~~') 

ekk) ?		Nc%**o5?II/EJJ0EJJ>
 g&&--11$))TYYGI..225::uzzJJ		UZZ'JJ*+rr   c                @    [         R                  R                  X5      $ ra   )rS   rv   get_dep_size_hint)r  r   r  s      rj   r  Scheduler.dep_size_hint  s    ww((::rr   c                  ^ ^^ U4S jnU(       a8  [         R                  X5      (       a  [         R                  X5      nU" US5      $ [        UR                  R
                  5      [        UR                  R                  5      -   n[        UR                  R
                  5      [        UR                  R                  5      -   n	[        X5      S-  [        X5      :  a  X:  a  X!p!UR                  R
                  UR                  R                  -   V
s/ s H9  n
XR                  R
                  ;   d  XR                  R                  ;   d  M7  U
PM;     nn
U" [        UU 4S jU 5       5      S5      $ UR                  R
                  UR                  R                  -  UR                  R
                  UR                  R                  -  -  nU" [        U 4S jU 5       5      S5      $ s  sn
f )zV
The first term in our fusion score that estimates number of saved
memory operations.
c                   > T(       a  X4$ U $ ra   r   )r  is_mix_order_reductionreturn_is_mix_order_reductions     rj   _construct_return_value>Scheduler.score_fusion_memory.<locals>._construct_return_value  s     1 / rr   Trm  c              3  H   >#    U  H  nTR                  UT5      v   M     g 7fra   r  )rh   r   r  r  s     rj   rk   0Scheduler.score_fusion_memory.<locals>.<genexpr>  s!     IDSD&&sK88D   "Fc              3  F   >#    U  H  nTR                  U5      v   M     g 7fra   r  r  s     rj   rk   r    s!     F3EC""3''3Er  )
r]   r   r   r   r   r   r  r  r  r   )r  r   r   r  r  r  r  r  node1_dep_lennode2_dep_lenr   r  common_memory_depss   `  ``        rj   r  Scheduler.score_fusion_memory  s   	 %):)C)CE)Q)Q
 &66uDE*5$77E--334s5;L;L;S;S7TTE--334s5;L;L;S;S7TT },q03}3TT,$u !,,22U5F5F5M5MMMC++111S<M<M<T<T5T M   +IDII5  $//558I8I8P8PP##e&7&7&>&>>
 'F3EFF
 	
s   6G.G.c                   [        U5      S:X  a  U$ 0 nU H  u  p4UR                  5       UR                  5       :X  d   eUR                  5       n[        U R                  U5      R	                  X45      5      nXb;  a  X44/X&'   Mo  X&   R                  X445        M     [        UR                  5       [        R                  " S5      S9S   n[        U5      S:  d   eU$ )Nr   rk  r   )
r   r   r   r  get_fusion_pair_priorityr   r  r  ri  rj  )r  r  "possible_fusions_group_by_priorityr   r   r:  fusion_pair_priority&possible_fusions_with_highest_prioritys           rj   r  4Scheduler.get_possible_fusions_with_highest_priority  s    
  A%##  	+ -LE##%)9)9);;;;%%'F#&  (AA%O$  $MNL2H 3HOON - 25.446H<O<OPQ<R2

2. 9:Q>>>55rr   c                D    [         R                  R                  " U /UQ76 $ )z
Shim for list.sort(key=...)
)rS   r\  score_fusionr  s     rj   r  Scheduler.score_fusion_key>  s     yy%%d3U33rr   c                    [        [        R                  R                  5       5      n[	        U R
                  5       H9  nUR                  XR                  5        UR                  UR                  5        M;     g)zW
Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
N)
r   rS   rv   r  r   r  r  r  r  r]  )r  r  re   s      rj   r  Scheduler.compute_last_usageF  sV    
 ))A)A)CDTZZ(D 35L5LM&&t7 )rr   c                   [        U R                  [        R                  R                  -
  [        R                  R
                  R                  -
  5       GH  nXR                  ;   a[  U R                  U   nUR                  5       (       a5  [        R                  R
                  R                  UR                  5        Ml  Mn  U[        R                  R                  ;   d  M  [        R                  R                  U   n[        U[        R                  5      (       a+  [        R                  R
                  R                  U5        M  [        U[        R                  5      (       a  GM  UR                   n[        U[        R"                  5      (       a  UR%                  5       (       d   e[        R                  R
                  R                  UR                   5        GM     U R                  R'                  5         g)z*Free any buffers that are no longer neededN)rp  r  rS   rv   rn  r0  freedr6  r@  codegen_freere   r  rb   r&   r  r/  r   r  is_input_bufferclear)r  r   r   rR  storages        rj   free_buffersScheduler.free_buffersQ  sU   %%gg%%&gg""(()
D
 '''&&t,<<>>GG((55chh? "---gg**40c2#5#566GG((55c:R%6%677!hhG"7BMM::w?V?V?X?XXGG((55gllC)
, 	!!'')rr   c                    U R                   R                  5        H  nUR                  5         M     U R                  5         g ra   )r  r   flushr  )r  r  s     rj   r  Scheduler.flushk  s.    }}++-GMMO .rr   c                   [        U[        5      (       d   e[        S   S==   S-  ss'   [        R                  " [        SS95         UR                  5         UR                  5         S S S 5        UR                  n[        U[        R                  5      (       d   S[        U5      < 35       eUR                  [        R                  R                  5        U R                  5         g ! , (       d  f       N= f)Nr  extern_callsr   F)increase_kernel_countztype(node)=)rb   r  r   rS   set_kernel_handlerr+   rz  r  re   r&   rD  r   rj  rv   r0  r  )r  scheduler_nodere   s      rj   codegen_extern_callScheduler.codegen_extern_callp  s    .*CDDDD
 	^,1,!!&u"EF002##% G ""$00B[T$ZM2BB0QWW))* GFs   	!C++
C9c                |   [        UR                  5      (       a  UR                  c
   U S35       e[        R                  R                  U5        [        UR                  5      nUc  [        SUR                   35      e[        5       (       d  UR                  S:X  aN  [        R                  R                  U5      =nR                  S:  a  [        U[        R                  " 5       5      e[        UR                  5      (       a.  UR                  S:X  d  [!        [        R                  " 5       5      eU" U 5      $ )Nz( should have been normalized in loweringzUnsupported device type: r      r   )rN   r   r   rS   rv   add_device_infor*   r  r!   r  r   get_device_propertiesmajorr2   inspectcurrentframer3   )r  r:  device_schedulingdevice_propss       rj   create_backendScheduler.create_backend  s    &++&&&,,*B 	
h>?	
B 	
'5fkkB$!:6;;-HII||v%%*ZZ%E%Ef%MM\TTWXX(w7K7K7MNN$$V[[E-A#G$8$8$:;; &&rr   c                    Uc   eXR                   ;  a  U R                  U5      U R                   U'   U R                   U   $ ra   )r  r  r9  s     rj   r  Scheduler.get_backend  s@    !!!&$($7$7$?DMM&!}}V$$rr   c                  ^  SU 4S jjnUR                  5        VVs0 s H?  nUR                  c  M  UR                  R                  5         H  nU" U5      U4S _M     MA     nnn[        UR	                  5       5      nU(       aJ  [        U[        R                  " S5      S9u  pg[        R                  R                  R                  U5        g g s  snnf )Nc                   > U TR                   ;  aM  TR                   R                  [        U R                  R                  5       VV s0 s H  u  pX_M	     sn n5        TR                   W    $ s  sn nf ra   )r  r  r  rv   r  )r#  r  r  s     rj   	get_order*Scheduler.enter_context.<locals>.get_order  s^    ,,,$$++i>V,W>VdaQT>V,WX''** -Xs   	A.
r   rk  )r#  ztorch.fx.Noder   r   )rn   re   r  r   r   r  ri  rj  rS   rv   r0  enter_context)r  re   r  r#  r   r  rn  lasts   `       rj   r  Scheduler.enter_context  s    	+ ^^%
%vv  VV'') q\1t# * % 	 
 w||~&'x':':1'=>GAGG  ..t4 
s
   C1Cc                   ^  U R                   U   R                  n[        U4S jU 5       5      =(       a#    XR                  ;  =(       a    XR
                  ;  $ ! [         a     gf = f)NFc              3  n   >#    U  H*  oR                   =(       d    UR                  5       T;   v   M,     g 7fra   )r  r  )rh   r"  r  s     rj   rk   AScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>  s)     VPUC3C CCPUr  )r6  r	  KeyErrorrm   rb  r  )r  r   r  r	  s     ` rj   r  .Scheduler.can_buffer_be_removed_through_fusion  sj    	$$T*00E VPUVV 41114333	
  		s   A 
A('A(c                   UR                   n[        U[        R                  R                  R
                  5      (       a  UR                  =n(       a  UR                  5       n[        U[        R                  R                  5      (       a  U SUR                   3OUnU[        R                  ;   d  U[        R                  ;   a0  [        U[        R                  R                  5      (       d   eSU 3$ [        R                  R                  R                  R                  (       d  [        R                   c  g[        U["        5      (       a0  UR$                   H  nU R'                  U5      nU(       d  M  Us  $    gUR                   c   eUR)                  5       (       d  UR+                  5        S3$ [        UR                   [        R,                  5      (       a  g[        UR                   [        R.                  5      (       a  g[1        UR                   SS5      (       a  g	[3        UR                   5      (       a  g
[        R                  R4                  (       a  [7        U5      (       a  gg)zr
Return the reason why we should partition the inductor graph on this node,
or None if the node is cudagraphable.
r  zcustom partition op: Nz6partition includes all ops when cudagraphs is disabledz opszDeviceCopy opszConditional opsunbacked_bindingszunbacked binding opszCUDAGraph-unsafe custom opszdynamic shape ops)re   rb   r  r  r&   rr  r  r   _ops
OpOverload_overloadnamer#   custom_should_partition_opsr   rW   rB   wrapperr   r   should_partitionrN   r   
DeviceCopyConditionalr  rM   cudagraph_skip_dynamic_graphsr  )r  re   r  r  op_overload_packet_nameop_overload_namer;  r]  s           rj   r  Scheduler.should_partition  s    ))gu11@@AA%%%B%&(ggi# b%**"7"788 ++1R-=-=,>?,  (6+M+MM#v'I'II!"ejj&;&;<<<<./?.@AA &&--886>>FKd.//..u56!M % yy$$${{}}oo'(--dii//#dii00$4991488)!$)),,0 ==66-d33*rr   c                    0 nUR                  [        R                  R                  5        U R                   H4  nUR
                  R                  5        H  u  p4UR                  X'   M     M6     U$ )zf
Return a mapping from name strings to the corresponding graph inputs or
base scheduler node outputs.
)r  rS   rv   r  r  rf  r  re   )r  r  re   r   scheduler_buffers        rj   get_name_to_nodesScheduler.get_name_to_nodes  sd     UWAGG001JJD*.*>*>*D*D*F&%5%:%:" +G  rr   c           	        [        [        R                  R                  5       VVs0 s H  u  p#X2_M	     nnn[        [        R                  R	                  5       5       VVs0 s H  u  p#X2_M	     nnn/ [        R                  l        [        U5       H  u  pgUR                  (       a  M  / nUR                   H#  nUR                  UR                  U5      5        M%     / n	UR                   H1  n
U	R                  UR                  U
R                  5       5      5        M3     [        R                  R
                  R                  [        UUU	UR                  5      5        M     gs  snnf s  snnf )zj
computes a mapping from partition input/output indices to graph input/output
indices for each partition.
N)r  rS   rv   r  r  partition_mapsskip_cudagraphinput_nodesr   r  output_nodesr  rJ   constant_names)r  
signaturesrH  r   name_to_graph_input_indexname_to_graph_output_indexpartition_id	signatureinput_mappingoutput_mappingre   s              rj   compute_graph_partition_maps&Scheduler.compute_graph_partition_maps
  s;    (11E1E'F%
'F)#DI'F 	" %
 (11I1I1K'L&
'L)#DI'L 	# &
 "$'0'<#L''
 M!--$$%>%B%B4%HI .  N!..%%&@&D&DT]]_&UV / GG""))! !",,	! (=%
&
s   E'"E-c                  ^     SS jm    SS jn[        5       R                  " S U 5       6 nUR                  " U4S jUR                  5        5       6   U" U5      n[        5       nU HG  n[        R                  R
                  R                  U5      nUR                  UR                  5        MI     [        [        U[        R                  " S5      S95      $ )	a9  
Returns all symbol inputs which are required to be in scope to successfully
perform codegen for this graph partition, including:
- free symbols used in partition nodes
- free symbols in partition input/node shapes, strides, and offsets. This is needed
  for recording cudagraphs for tensors with dynamic shapes.
c                    [        U [        R                  5      (       a
  [        5       $ [        U [        R                  5      (       a  [        U 5      $ [        S[        U 5       35      e)z?
Gets symbols used in input node shapes, strides, and offsets.
zUnsupported input node type: )rb   r&   r  r   r  r  r  r   ro   s    rj   get_input_node_symbolsKScheduler.get_graph_partition_symbol_inputs.<locals>.get_input_node_symbols@  sT     $ 2 233!|#D")),,)$// *,I$t**VWWrr   c                &    [        S U  5       5      $ )z
Filters a set of symbols that are required for codegen. Skip symbols
that are always internal to kernels, such as SymT.TMP, SymT.INDEX,
and SymT.R0_INDEX.
c              3     #    U  HV  n[        U[        R                  [        R                  [        R                  [        R
                  45      (       d  MR  Uv   MX     g 7fra   )r   r    SIZEFLOATUNBACKED_INTUNBACKED_FLOATrh   r  s     rj   rk   VScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols.<locals>.<genexpr>X  sI       A!		

))++	  s   AA 	A r   )symbolss    rj   filter_symbolsCScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbolsP  s         rr   c              3  8   #    U  H  n[        U5      v   M     g 7fra   r  r  s     rj   rk   >Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>g  s     Iyt,T22yr  c              3  8   >#    U  H  u  pT" U5      v   M     g 7fra   r   )rh   rn  re   r  s      rj   rk   r  j  s     N:Mwq$T**:Ms   r   rk  )re   z0Union[ir.IRNode, sympy.Expr, ir.TorchBindObject]r   OrderedSet[sympy.Symbol])r  r	  r   r	  )r   rC  r  rS   rv   rw   rx   r  r   rp  ri  
attrgetter)	r  	partitionr  r  candidate_symbolsrA  r  symplified_sr  s	           @rj   !get_graph_partition_symbol_inputs+Scheduler.get_graph_partition_symbol_inputs3  s    	XB	X%	X 	-	%	, 7Al6H6HIyI7
 	N+:K:K:MN	
 ++<=(2"A77++44Q7LJJ|001 #
 &(*=*=f*EFGGrr   c           	       ^ ^ / n[        [        R                  R                  5       5      nT R	                  5       nSUU 4S jjm[        [        U5      [        U5      5       GHl  u  pg[        5       nU H,  n	UR                  U	R                  R                  5       5        M.     UR                  U5      n
[        R                  R                  U V	s/ s H  oR                  PM     sn	5      n[        UR                  UR                   -   Vs/ s H(  nT" UR"                  5      (       a  M  UR"                  PM*     sn5      U-
  n[        U 4S jU 5       5      n[        5       nU H  n	UR                  U	R$                  5        M      X-
   Vs/ s H  nX;   d  M
  UPM     nnUR                  U5        U Vs0 s H  nX;   d  M
  XU   _M     nnU Vs0 s H  nX;   d  M
  XU;   _M     nnU Vs/ s H  nX;   d  M
  X;  d  M  UPM     nnU
R                  U5        [        U 4S jU
 5       5      n
U
 Vs/ s H  nT" U5      (       a  M  X_   PM     nnU Vs/ s H$  o[        R                  R&                  ;   d  M"  UPM&     nnT R)                  UU5      n[+        UUUUUU5      nUR-                  U5        UR/                  XJ-
  5      nGMo     USSS2   $ s  sn	f s  snf s  snf s  snf s  snf s  snf s  snf s  snf )z
Gets signature for each graph partition, including input nodes, output nodes, and
whether deallocating an input within graph partition.
c                B  > TR                   R                  U S5      nUc  g[        UR                  R                  [
        5      (       aU  [        UR                  [        R                  5      (       a+  TR                  R                  U S5      =n(       a  T" U5      $ gg)z
Checks if buf_name is NoneLayout. Buffers with NoneLayout is not allocated
so graph partition should not take it as inputs or outputs.
NFT)	r6  r  rb   re   r  r:   r&   MutationOutputr  )r1  r   r
  is_none_layoutr  s      rj   r	  ?Scheduler.get_graph_partition_signature.<locals>.is_none_layout  s    
 ""&&x6C{#((//:66chh(9(9::!%!8!8!<!<Xt!LLIL))44rr   c              3  Z   >#    U  H   nTR                   R                  X5      v   M"     g 7fra   r  r  rh   r   r  s     rj   rk   :Scheduler.get_graph_partition_signature.<locals>.<genexpr>  ,      /1D ''++D771   (+c              3  Z   >#    U  H   nTR                   R                  X5      v   M"     g 7fra   r	  r	  s     rj   rk   r	    r	  r	  Nr   )r1  r   r   r   )r   rS   rv   r  r  r  r   r  rf  r   r  r%   rA  rB  r   r   r  r   r]  r  r	  r7   r   rC  )r  
partitionsskip_cudagraphsr  unmet_output_namesr  r	  r  output_namesre   returned_output_namesr   rx  partition_input_namesr  r   extra_input_namesr  input_deallocationextra_output_namesr  r  symbol_inputspartition_signaturer	  s   `                       @rj   get_graph_partition_signature'Scheduler.get_graph_partition_signaturew  s\    
'(@(@(BC--/	 	( *-Z (?";*
%I -7LL!##D$8$8$=$=$?@ " %1$=$=>P$Q! '11<<.78id!!i8K  "-!2!2[5G5G!G!GA-aff5 !G  " %/ /1/ %!
 5?L !$++DOO< " 2@!@D' @  !
 "(():; 21D' )4((1   2"1D' 32221  " 2"1D' ,0,L 1  " "(();<$. /1/ %! 21D%d+ #"1   "7!6!''BSBS:S!6   !BB;M #:"# 12!6!<!<":"K*
R $B$y 9*!
""s`   K
K!
.K!
	K&K&;	K+	K+	K0$	K03	K5 K5K59K:K:!K?K?c                   UR                   R                  5        VVs0 s H'  u  p#U[        R                  R                  ;  d  M%  X#_M)     nnnUR
                  R                  5        VVs0 s H'  u  p%U[        R                  R                  ;  d  M%  X%_M)     nnnUR                   Vs/ s H3  nUR                  5       [        R                  R                  ;  d  M1  UPM5     nnUR                   Vs/ s H%  nU[        R                  R                  ;  d  M#  UPM'     n	n[        UR                  UUUUR                  U	5      $ s  snnf s  snnf s  snf s  snf )z
Updates the partition signature by removing buffers specified in
V.graph.removed_buffers. See [Note: Removed Graph Partition Arguments]
)r  r  rS   rv   rn  r	  r  maybe_get_namer  r7   r	  r  )
r  r  r   r  r  rl  r	  re   r  r  s
             rj   .clean_removed_buffer_from_partition_signatures8Scheduler.clean_removed_buffer_from_partition_signatures  sR    !* 5 5 ; ; =
 =177222 DL = 	 
 '99??A
A	177222 DIA 	 
 "..
.""$AGG,C,CC . 	 
 "00
0177222 0 	 

 '##$$
 	
)






s/   $EE,$EE+0EE5"EEc                  ^ ^^	^
^^^ SSK m	[        5       m/ m/ m[        U5       VVs0 s H  u  p#X2_M	     snnmSUU	UUU 4S jjm
SU
U4S jjnU H8  n[        UR                  R
                  5      TU'   TU   S:X  d  M0  T
" U5        M:     / nSnU[        U5      :  a  T(       d  T(       a  T(       a5  T	R                  T5      u  psUR                  U5        U" U5        T(       a  M5  T(       a5  T	R                  T5      u  psUR                  U5        U" U5        T(       a  M5  US-  nU[        U5      :  a  T(       a  M  T(       a  M  U[        U5      :  a  [        S5      eU$ s  snnf )ad  
Reorder nodes to minimize the number of partitions via a bfs
topological sort. This is the optimal reordering such that the
number of partitions cannot be reduced further. This may be
sub-optimal for other metrics such as peak memory. This does not
change relative orders of two cudagraphable nodes, nor the
relative order of two non_cudagraphable nodes.
r   Nc                   > TU    U 4nTR                  U 5      (       a  TR                  TU5        g TR                  TU5        g ra   )r  heappush)re   node_with_indexcudagraphable_nodesheapqnode_to_indexnon_cudagraphable_nodesr  s     rj   insert_pending_nodesHScheduler.reorder_for_minimizing_partition.<locals>.insert_pending_nodes9  sA    ,T2D9O$$T**6H2ODrr   c                   > U R                   R                   H.  nTU   S:  d   eTU==   S-  ss'   TU   S:X  d  M&  T" U5        M0     g )Nr   r   )r`  
succ_nodes)re   	succ_noder-	  node_to_indegrees     rj   update_indegreeCScheduler.reorder_for_minimizing_partition.<locals>.update_indegree@  sO    !]]55	'	2Q666 +q0+#I.!3(3	 6rr   r   z
                Failed to schedule, while loop ran too long when
                reordering for minimizing the num of partitions
                re   rX   r   rQ  )	r*	  r  r  r   r`  
pred_nodesheappopr   r  )r  r  rH  re   r3	  r  	num_itersrn  r)	  r*	  r-	  r2	  r+	  r,	  s   `       @@@@@@rj    reorder_for_minimizing_partition*Scheduler.reorder_for_minimizing_partition&  s_    	9=CEGI4=e4DE4Dys4DE	E 	E	4 	4 D%()A)A%BT"%*$T* 
 -/	#e*$#':)--(?@%% *)
 &--(;<%% &%
 NI #e*$##':': s5z!  ] Fs   E(c           	     R   SSK JnJn  [        [        R
                  R                  5       5      nU" UU R                  U R                  [        [        R
                  R                  R                  5       5      U5      u  pVU R                  U5      nU" XvU5      u  pXS-  :  a  U$ U$ )z`
Reorder nodes to minimize the number of partitions if this only slightly
increase peak memory.
r   )estimate_peak_memoryprepare_planning_infor(  )r  r<	  r=	  r   rS   rv   r  r6  r  r  r   r9	  )
r  r  r<	  r=	  ra  default_peak_memoryr  reordered_nodesreorder_peak_memoryrn  s
             rj   r  0Scheduler.maybe_reorder_for_minimizing_partitiong  s     	H"177#;#;#=>:O##qww++0023;
7 ??F!5"

 s!::""rr   c                4   / n/ n/ nSS jnU H  nU R                  U5      SLnU(       a,  [        UR                  5      S:X  a  UR                  U5        MI  U(       a   U" U5      (       a  UR                  U5        Mp  UR                  U5        M     X#-   U-   $ )z
Reorder a node if it should be partitioned and has simple dependency:
1. move a partitioned node to the front if it has no dependency
2. move a partitioned node to the back if it is only used by OutputNode
3. otherwise do not reorder
c                    U R                  5        H8  nUR                   H%  n[        UR                  [        5      (       a  M$      g   M:     gr=  )rv  r	  rb   re   r>  )re   r   r?  s      rj   only_output_userPScheduler.reorder_for_partition_with_simple_dependency.<locals>.only_output_user  s<    '')99C%chh
;;$ % * rr   Nr   r   )r  r   rk  r   )r  r  frontmiddlebackrD	  re   r  s           rj   r  6Scheduler.reorder_for_partition_with_simple_dependency  s     *,*,(*	 D#44T:$FC(?(?$@A$ET"!&6t&<&<D!d#  ~$$rr   c                   / nSn/ n/ nU R                    HY  nU R                  U5      SLnU(       a)  X&:w  a$  UR                  U5        UR                  U5        / nUnUR                  U5        M[     U(       a"  UR                  U5        UR                  U5        U R                  XS9nU R	                  U5        U R                  X5        X4$ )zz
Given a list of BaseSchedulerNodes, split into a list of
graph partitions and compute partition input/output signatures.
TN)r	  r	  )r  r  r   r	  r  _log_graph_partitions)r  r	  r  cur_partitionr	  re   node_should_partitionr  s           rj   r	  Scheduler.graph_partition  s     +-
')JJD$($9$9$$?t$K!!H!!-0&&~6 "2N  &  m,"">277! 8 

 	))*5""::%%rr   c                   [         R                  [        R                  5      (       d  g [	        S [
        R                  R                   5       5      nU(       d  g [        S U 5       5      n[        U5      U-
  n[         R                  S[        U5      UU5        [        [        X5      5       H  u  nu  px[         R                  SU[        U5      UR                  (       a  SOS[        UR                  5      [        UR                  5      5        UR                  (       d  Mw  U H  n	U R!                  U	5        M     M     g )Nc              3  8   #    U  H  n[        U5      v   M     g 7fra   )rN   )rh   r:  s     rj   rk   2Scheduler._log_graph_partitions.<locals>.<genexpr>  s     O:NVF^^:Nr  c              3  J   #    U  H  oR                   (       a  M  S v   M     g7f)r   N)r  r  s     rj   rk   rQ	    s     !PZ?O?O!!Zs   #	#zCCreated %d graph partitions: %d cudagraphable, %d non-cudagraphablez3  Partition %d: %d nodes, %s, inputs=%d, outputs=%dznon-cudagraphablecudagraphable)cudagraphs_logr,  r-  r  r   rS   rv   device_typesr   r   rc  r  r  r  r  r  _log_non_cudagraphable_node)
r  r	  r  has_gpu_devicecudagraphable_countnon_cudagraphable_countr  r	  r  re   s
             rj   rK	  Scheduler._log_graph_partitions  s   
 **7==99 O!'':N:NOO!!PZ!PP"%j/4G"GQ
O#		
 *33z3N)O%A%	  EI'0'?'?#_I))*I**+ '''%D44T: & *Prr   c                   U R                  U5      nU(       d  gUR                  5       nUR                  b  UR                  R                  5       OSnSU 3/n[	        UR                  5      R
                  nUR                  SU 35        UbF  UR                   SSR                  S UR                   5       5       S3nUR                  SU 35        [        R                  S	USR                  U5      5        Uba  UR                  R                  S
S5      nU(       a=  UR                  5       R                  S5       H  n	[        R                  SU	5        M     ggg)z)Log details for a non-cudagraphable node.Nzreason=zir=r  r  c              3  8   #    U  H  n[        U5      v   M     g 7fra   )r   )rh   r4  s     rj   rk   8Scheduler._log_non_cudagraphable_node.<locals>.<genexpr>  s     2P<a3q66<r  r{  zfx=z
    %s: %sr  r  z         %s)r  r  re   r  r   r   r   r  r  r*  rT	  rc  r  r  stripsplit)
r  re   r]  r  r  partsir_typefx_strr  lines
             rj   rV	  %Scheduler._log_non_cudagraphable_node  s,   &&t,MMO	151F$))++-D6(#$tyy/**s7)_%'q2P7<<2P)P(QQRSFLL3vh(\9dii6FG !,,**=$?K'--/55d;D"((= <  rr   c                    [        S5         [        R                  R                  R                  (       a  U R                  5       OU R                  U R                  5       sS S S 5        $ ! , (       d  f       g = f)NzScheduler.codegen)r   r  r  r#   r	  _codegen_partitions_codegenr  r  s    rj   rj  Scheduler.codegen  sO    -. ??))99 ((*]]4::. /..s   AA++
A9c                l   SSK Jn  [        R                  R                  n[        U R                  5      n[        R                  R                  5          [        R                  R                  SSU 3UUS9  U R                  U5        [        [        R                  R                  U5      (       d   eU R                  U5      nU[        R                  R                  l        [        R                  R                  R                  5         [        R                  R                  n[        R                  R                  R                  [        R                  R                   5      u  pxSSS5        [        R                  R                  R#                  WW5        [        R                  R                  R%                  XR5        [        R                  R                  R&                  R)                  UR*                   V	s/ s H  oR-                  5       PM     sn	5        g! , (       d  f       N= fs  sn	f )z,Codegen a partition given its inputs/outputsr   )SubgraphPythonWrapperCodegenT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesN)rf  rj	  rS   rv   r0  r  r  set_current_wrapper_codeinit_wrapper_coderg	  rb   r#	  ro	  write_prefixr   generateis_inferencedefine_subgraph_launcher_fncodegen_partition_call	allocatedr  r  r  )
r  r	  r  rj	  rn	  graph_partition_id
graph_namepartition_codern  re   s
             rj   _codegen_partition_wrapper$Scheduler._codegen_partition_wrapper  s    	Bgg22!$"?"?@WW--/GG%%  *+=*>?$7%.	 &  MM)$ agg224PQQQQKKIVI8AAGG  5GG  --/J ! 4 4 = =agg>R>R SN/ 02 	
88^T	334FR	&&--)2)?)?@)?]]_)?@	
9 0/: As   DH ?H1 
H.c                P   ^ ^^ [         R                  SUU U4S jj5       nU" 5       $ )Nc               3    >#    TR                  T T5        TR                  (       a  [        TR                  R                  5      (       a[  TR                  R                  c   S5       e[
        R                  R                  R                  TR                  R                  5         S v   TR                  (       aL  [        TR                  R                  5      (       a(  [
        R                  R                  R                  5         S Tl        g ! TR                  (       aL  [        TR                  R                  5      (       a(  [
        R                  R                  R                  5         S Tl        f = f7f)Ndevice should have an index)
%update_graph_partition_default_devicer  rE   r   r   rS   rv   r0  codegen_device_guard_entercodegen_device_guard_exit)r	  r  r  s   rj   ctx1Scheduler.use_default_device_context.<locals>.ctx:  s    66z:N**/@++000 0 2288D 1D $$??//553..3D//444 4 GG((BBD.2+	 ..3D//444 4 GG((BBD.2+s    B#E9'D +A%E9A&E66E9)r   zIterator[None])
contextlibcontextmanager)r  r	  r  r	  s   ``` rj   use_default_device_context$Scheduler.use_default_device_context7  s+     
	"	"	3 	3 
#	3* urr   c                N   [        U5      S:X  a  US   R                  (       d  g SS jn      SS jnS n[        X5       H   u  pgUR                  (       a  M  U" U5      n  O   Uc  g [        X5       H'  u  pgUR                  (       d  M  U" Xe5      (       a  M'    g    XPl        g )Nr   r   c                6    U S   R                  5       nUc   eU$ r   r   )r	  partition_devices     rj   get_cudagraph_partition_deviceWScheduler.update_graph_partition_default_device.<locals>.get_cudagraph_partition_devicea  s'    (|668#///##rr   c                D    U  H  nUR                  5       nX1:w  d  M    g   gr=  r	  )r	  target_devicere   r:  s       rj   all_on_target_deviceMScheduler.update_graph_partition_default_device.<locals>.all_on_target_devicef  s(     "**  " rr   )r	  rY   r   r  )r	  rY   r	  r  r   r   )r   r  r  r  )r  r	  r  r	  r	  cudagraph_partition_devicer	  r  s           rj   r	  /Scheduler.update_graph_partition_default_deviceR  s     z?a
1(D(D 	$
	$	5A		 &*"$'
$? I+++-KI-V* %@ &-$'
$? I'''0D1 1 	 %@ 'A#rr   c                   U R                  5       u  p[        U5      S:  a  [        S   S==   [        U5      -  ss'   U R                  X5         [	        X5       H\  u  p4[        U5      S:  d   S[        U5       35       eUR
                  (       a  U R                  U5        MK  U R                  X45        M^     SSS5        [        U R                  5      n[        R                  R                  R                  U5        US:  as  [        R                  R                  c   eU[        [        R                  R                  5      :X  d.   SU S[        [        R                  R                  5       35       egg! , (       d  f       N= f)	z
Split nodes into partitions and codegen each partition into separate functions.
This allows further applying different optimizations (e.g., cudagraph) to
each function.
r   r  cudagraph_partitionsz5Each partition must have at least one node but found Nr   zExpect z partition maps but got )r	  r   r   r	  r  r  rg	  r{	  r  r  rS   rv   r0  set_all_partition_namesr  )r  r	  r  r	  r  num_partitionss         rj   rf	  Scheduler._codegen_partitions  sO    "&!5!5!7
z?QZ !78C
OK8,,ZD(+J(C$	9~* KCPYNK[\* ++MM),33II )D E d;;<	44^D A77))555!S)?)?%@@ .))A#aggF\F\B]A^_@  EDs   A,E<<
F
c                   [         R                  (       a  SS Kn[        R                  " 5       n[        5       n[        U5       H  nUR                  S:X  a0  UR                  UR                  R                  R                  :X  a    OTUR                  UR                  4nXd;  d"   SUR                   SUR                   S35       eUR                  U5        M     U R                  U l        U R                   b   eU R                  (       aG  [         R"                  R$                  (       a(  [&        R(                  R*                  R-                  5         U GH  n[.        R1                  [2        R4                  5      (       a4   [.        R7                  SUR9                  5       UR;                  5       5        U R?                  U5        URA                  5       =n(       Ga  XR                  :w  d*  URC                  5       (       d  URE                  5       (       a  U RG                  5         XR                  :w  a  U R                  (       aL  [I        U R                  RJ                  5      (       a(  [&        R(                  R*                  RM                  5         Xl        [I        URJ                  5      (       aG  URN                  c   S5       e[&        R(                  R*                  RQ                  URN                  5        Xpl)        U RT                  RW                  URX                  5        URE                  5       (       aN  UR[                  []        UR_                  5       5      5      u  pnU Ra                  U5      Rc                  XU	5        GOhURC                  5       (       a.  [d        Rf                  " [h        U5      nU Rk                  U5        GO%URm                  5       (       aw  [d        Rf                  " [n        U5      nU Ra                  U5      nS	S
K8J9n  S	SK:J;n  [y        XU45      (       a  UnO[{        S[K        U 5      < 35      eUR}                  U5        O[y        U[~        5      (       a!  U Ra                  U5      R                  U5        Oc[y        U[        [        45      (       a!  U Ra                  U5      R                  U5        O'[y        U[        5      (       d   eUR                  5         [         R"                  R                  (       a  U Ra                  U5      R                  5         U R                  RW                  UR                  5       5        U R                  RW                  UR                  5       5        [y        U[        5      (       dW  URA                  5       nUbD  URJ                  S:w  a4  U Ra                  U5      R                  5       (       a  U RG                  5         [        S UR_                  5        5       5      (       a	  Xpl        GM  S U l        GM     U R                  U R                  :w  a[  U R                  c   e[I        U R                  RJ                  5      (       a(  [&        R(                  R*                  RM                  5         S U l        U RG                  5         g ! [<         a(    [.        R7                  SUR9                  5       5         GN/f = f)Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0r	  r   )CUDACombinedSchedulingr  ztype(self)=r  c              3  B   #    U  H  n[        U[        5      v   M     g 7fra   )rb   rc   r"  s     rj   rk   %Scheduler._codegen.<locals>.<genexpr>  s     J9IA:a//9Is   )Nr#   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r   r   filename_dynamoconvert_frame__file__linenor  r  r   r  r   autotune_at_compile_timerS   rv   r0  write_get_raw_stream_headerr  r,  r-  r  rc  r  r  r  r  r   rL  rI  r  rE   r   r	  r   r	  r  r  r  r]  r  r   rn   r  codegen_templater  r  r  r  rO  r   codegen.cuda_combined_schedulingr	  r  r  rb   r  codegen_combo_kernelr  codegen_mix_order_reductionr   rc   codegen_noderp  r  debug_sync_kernelcodegen_syncr  r  rm  r   ready_to_flushrm   )r  r  r  stackr  framerl  re   r:  r  r  r  backend_r	  r  r  s                   rj   rg	  Scheduler._codegen  s   44.++-E7A|D!% JJ"22%--*E*E*N*NN~~u||4 ,U^^,<Aell^ LJ J
  ) #99!!))) &&6==+Q+QGG  <<>D..
IIO224 t$**v*111~~''''))JJL000**/@++000 0 ,,FFH*0'(55%||7V9VV7,,GGU $%%,,T__=!!484W4W)*51   (99!X !!{{#<dC((.""{{#=tD++F3T8h9O(PQQ&G(KDJ=)9::,,T2D"9::  (DDTJD#5}"EFF  (55d;!$(>???? }}..  (557''..t/D/D/FG%%,,T-E-E-GHd$:;;*&v-((0??AAJJLJ9IJJJ%)"%)"s v $"="== &&222 !4!4!9!9:: $$>>@!

} ! IIPs   3Z33.[%$[%c                    US   R                  5       nU [        R                  l        X l        Uc   eU R                  U5      nUR                  U5      $ )r  r   )r   rS   rv   r  r   r  benchmark_combo_kernel)r  r  r:  r  s       rj   r	   Scheduler.benchmark_combo_kernel%  sU     1((* $!!!""6*--i88rr   c                N  ^ UnUS   R                  5       m[        U4S jU 5       5      (       d   S5       e[        R                  (       d  gSSKJn  S/ pT[        U5       H  u  pgUR                  5       nU R                  U5      (       a  [        R                  S5         U R                  U5      u  p[        R                  " U	5      (       a  [        R                  SU5          g	 XI-  nUR                  U
5        M      U R                  U5      u  pnX-
  S:  =(       d    US:  n[        R!                  ["        R$                  5      (       aS  XL:  d  U(       a$  [        R                  S['        XL-  S 5      5        O#[        R                  S[)        XL-  S 5      5        X-
  U:  =(       d    U$ ! U a0  nS
[        U5      ;   a  [        R                  S5         SnA  ge SnAff = f! U a/  nS
[        U5      ;   a  [        R                  S5         SnAge SnAff = f)r  r   c              3  H   >#    U  H  oR                  5       T:H  v   M     g 7fra   r	  )rh   re   r:  s     rj   rk   4Scheduler.speedup_by_combo_kernel.<locals>.<genexpr><  s     K?4??$.?r  z<All nodes in a combo kernel group must be on the same deviceTr  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFr[  zCComboKernel benchmark: return True because of loop-carried variableNg333333?z/can fuse (benchmark): fusing causes %sx speedupr  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   rm   r#   r	  re  r  r  rn   r  rb  rc  r  r]  r^  r   r   r,  r-  r  r?   r@   )r  r  subkernel_nodesr  r   
path1_listr  r;  r  r  rB  r   r!  	ms2_clone_path2_listsmall_kernelr:  s                   @rj   r  !Scheduler.speedup_by_combo_kernel3  s      #..0K?KKK 	
J	
K ,,;rZ!/2HA)I ##I..  R55i@::b>>$$U ! " ICd#7 3:
	*.*E*Eo*V'CK ,9c	""7==11yL  E#)C2
   I	#0
 $44M $ *c!f4$$]     	&#a&0  Y 	s=   #AF6G/ 6G,<$G'&G''G,/H$5$HHH$c                r    U R                   U   nUR                  c   eUR                  R                  5       $ ra   )r6  re   
get_layout)r  r1  r   s      rj   get_buffer_layoutScheduler.get_buffer_layout  s5    x(xx###xx""$$rr   c                   U R                    H  nUR                  5       (       d  M  UR                  R                   H  n[        R
                  R                  R                  UR                  5      nU(       d  M?  [        U5      S:X  d  MP  [        UR                  [        [        45      (       a  Mw  UR                  5       / :X  d  M  [        R
                  R                  R!                  UR                  5        M     M     g r7  )r  rN   r   r   rS   rv   r  r  r   r6   rb   r  r:   r9   r  zero_dim_cpu_tensor_listr  )r  re   rv  r  s       rj   r  $Scheduler.update_zero_dim_cpu_tensor  s    JJD{{}} ,,22DWW3377		BF+F3u< *"MMJ8I+J! ! #OO-388<<TYYG 3 rr   )r  r  r  r  rm  r   r  r  r  r  rb  r6  r5  r  r  r  r  r  r  r  r  )r  zlist[ir.Operation]r   rQ  )r   z!dict[str, SchedulerDonatedBuffer]rT  )r:  rU  r   rQ  rP  )r/  r   r   rQ  )re   r  r   rX   rG  )r;  rX   r   r(  )r   rH  r  r  r   tuple[float, str]ra   r  r  r  r   r  rb  r   r   )r  r   r:  r  r   r	  )
r  ir.OperationBufferr  ir.MultiTemplateBufferr  r   re   rc   r   rQ  )r  r  r   r   )r   rX   r   rX   r   zUnion[bool, Callable[[], bool]])re   rX   r   rX   )r  r(  r  r   r   r(  )r  rb  r   rQ  r  )r  r(  r  r   r   1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]r   )r   rX   r   rX   r  r   r   r   )r   rX   r   rX   r  z'Union[tuple[str, ...], OrderedSet[str]]r   r   r   r   )r3  rX   r  rX   r  rZ  r   r   )r   rX   r   rX   r   z/Optional[tuple[int, SchedulerNode, sympy.Expr]])FT)
r   rX   r   rX   r_  r   r  r   r   r   )r{  r1   r   rX   r   rX   r   r   )rv  r.   rR  r/   r   r   r$  )r   r.   r  r   r   r   )TFT)r   rX   r   rX   r  r   r  r   r  r   r   zint | tuple[int, bool])r  r	  r   r	  )r  z+tuple[BaseSchedulerNode, BaseSchedulerNode]r   r   )r  r  r   rQ  )r:  r  r   BaseScheduling)r:  rU  r   r	  r5	  )r   r   r  r\  r   r   )re   rX   r   Optional[str])r   ;dict[str, Union[ir.IRNode, ir.TorchBindObject, sympy.Expr]])r  list[GraphPartitionSignature]r   rQ  )r	  rY   r  r	  r   r	  )r	  list[PartitionType]r	  z
list[bool]r   r	  )r  r7   r   r7   )r   z9tuple[list[PartitionType], list[GraphPartitionSignature]])r	  r	  r  r	  r   rQ  )r	  rY   r  r7   r   rQ  )r	  r	  r  r	  r   z'contextlib.AbstractContextManager[None]r  r  r   z(tuple[float, float, list[Optional[str]]])r  r(  r   r   )r1  r   r   z	ir.Layout)Wr   r   r   r   r   rs  r  r  propertyr   setterr  r0  r  r  rV   r  r  r  r  r!  r  r  rU  r  r  r  r  r  r  r  rr  r`  r  r   r	  r  r|  r  r  r  r  r  r#  r  r7  rN  r   r]  r  ro  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r	  r#	  r9	  r  r  r	  rK	  rV	  rj  r{	  r	  r	  rf	  rg	  r	  r  r	  r  r   r  r  s   @rj   r  r  
  s   
P9d	# & & ( (7#,"HAPFKZ+#Z,	 6S*4#&$6!F	808	8, (,	*  %	
 
&
> 
>*6
>	
>?DB'8&'8 +'8 	'8
 '8 
'8R
a(&a(/@a(	(a(F>j&j j 
!	jX..`?6 &6  6  
;	6 p,&,/@,	,\7&7/@7	7r.2&.2/@.2MP.2	.2`$&$/@$	$6< < !< B	<
 
<|M&M/@M	M^Y
&Y
/@Y
	Y
v
9(9 )9 	9
 
9v`&`/@`	8`L "*.uM uM !uM 	uM
 $(uM 
uMn3&3/@3	3j-)-)(9-)BS-)	-)f D; !.3*.3
 3
 !3
 	3

 (,3
 $(3
 
 3
j6 Q6	:6@4@4	4	8*4
'*%5$

+:
	
?B	D '1' 
'RBH BH QBH 
"	BHHI -I @JI 	&I V"
0"
	 "
H?&? 
!?B& 
!>%,%	 %@ &	B &D";'"; 2"; 
	";H>0)
 )
 +)
 
	)
V-;X	06-A--A;X-A	-A^@BH949	19J5X%
H Hrr   c                  r  ^  \ rS rSrSU 4S jjrSS jrSS jr      SS jr      SS jr      SS jr	      SS jr
    SS	 jr        SS
 jr S       SS jjrS S jrS!S jrSS jrS"S jrSS jr    S#S jrS$S jr      S%S jr    S&S jr S     S'S jjrSrU =r$ )(r	  i  c                .   > [         TU ]  5         Xl        g ra   )r  rs  r  )r  r  r  s     rj   rs  BaseScheduling.__init__  s    "rr   c                \    U R                   (       a  U R                   R                  5         g g ra   )r  r  r  s    rj   free_buffers_in_scheduler(BaseScheduling.free_buffers_in_scheduler  s    >>NN'') rr   c                    [        5       $ )z0Return a set of .codegen.common.BackendFeature()r   r9  s     rj   get_backend_features#BaseScheduling.get_backend_features  s
    |rr   c                    [         e)z?
Check whether node1 and node2 can be vertically fused or not.
r  r_  s      rj   r]   BaseScheduling.can_fuse_vertical  
     "!rr   c                    [         e)zA
Check whether node1 and node2 can be horizontally fused or not.
r  r_  s      rj   r^  "BaseScheduling.can_fuse_horizontal  r	  rr   c                    g)aE  
A Multi-Output Template (referenced in #144012) is a template node
with MultiOutputLayout, and its output buffers are instances of MultiOutput.
In this context, we verify whether node1 represents the Multi-Output Template
and node2 corresponds to one of its outputs. If so, we further check if
backend supports this fusion.
Fr   r_  s      rj   rV  .BaseScheduling.can_fuse_multi_outputs_template  s     rr   c                @   UR                  5       (       d  UR                  5       (       a  [        R                  X5      $ [        R	                  X5      (       a  [        X5      $ [        U[
        5      (       a  UR                  U5      $ [        R                  X5      $ )z
Fuse two nodes
)	rO  r  rS  r]   r   r  rb   r  r   r_  s      rj   rS  BaseScheduling.fuse  s|     !1!1!3!3-225@@77EE*588677??5))%**588rr   c                    [         e)zK
Process the iteration sizes in case a transformation needs to be applied.
r  )r  r  s     rj   r  BaseScheduling.group_fn  r	  rr   c                    [         e)z
Given a template node, generate a kernel.

This function is only available for triton now. If the third-party backend behaves as a sub-class
of TritonScheduling, it can override it or reuse it.
r  )r  r  epilogue_nodesrU  s       rj   r	  BaseScheduling.codegen_template  s
     "!rr   c                    [         ez4
Generate a kernel given a list of pre-fused nodes.
r  )r  r  r  r  s       rj   r  .BaseScheduling.generate_kernel_code_from_nodes  s
     "!rr   c                    [         er	  r  r  s     rj   r	  BaseScheduling.codegen_node  
     "!rr   c                    [         era   r  r  s     rj   r	  *BaseScheduling.codegen_mix_order_reduction  r  rr   c                    [         e)zd
Generate synchronization code for the kernel. This method depends on the hardware characteristics.
r  r  s    rj   r	  BaseScheduling.codegen_sync  r	  rr   c                    g)z}
Check whether the backend is requesting the scheduler to flush the generated kernel.
If not supported, please return False.
Fr   r  s    rj   r	  BaseScheduling.ready_to_flush  s    
 rr   c                    [         e)zM
Flush the generated kernel and python wrapper code to the source code file.
r  r  s    rj   r  BaseScheduling.flush  r	  rr   c                    [         e)r  r  r  s     rj   r  $BaseScheduling.benchmark_fused_nodes	  
     "!rr   c                    [         e)zi
Benchmark a compiled module and return the execution time
in milliseconds on randomly generated inputs.
r  )r  r  s     rj   r  )BaseScheduling.benchmark_codegened_module  s
    
 "!rr   c                    g)zt
Return an unsigned integer which represents the priority of this fusion pair.
The smaller is with higher priority.
r   r   r_  s      rj   r  'BaseScheduling.get_fusion_pair_priority  s     rr   c                    [         e)z
Benchmark the list of nodes to combine and return the execution time
and memory copy time in milliseconds on randomly generated inputs.
r  r  s     rj   r	  %BaseScheduling.benchmark_combo_kernel"  r	  rr   c                    U(       a9  SSK Jn  U" UU5      n[        R                  R                  R                  X$5        g g )Nr   )'set_kernel_post_grad_provenance_tracing)r  r
  rS   rv   r0  write_provenance_debug_handle)r  node_scheduler&  r
  debug_handles        rj   codegen_commentBaseScheduling.codegen_comment+  s<    
 UBL GG  >> rr   r2  )r  zOptional[Scheduler]rP  )r:  r  r   zOrderedSet[BackendFeature]r   r  )r  r9  r   z"tuple[tuple[sympy.Expr, ...], ...])r  rX   r	  r  rU  r  r   r	  ra   r	  )re   z(Union[FusedSchedulerNode, SchedulerNode]r   rQ  )re   r  r   rQ  rR  r	  )r  r   r   r	  r   r	  )r	
  r  r&  r	  r   rQ  )r   r   r   r   rs  r	  r	  r]  r^  rV  rS  r  r	  r  r	  r	  r	  r	  r  r  r  r  r	  r
  r   r  r  s   @rj   r	  r	    s   #*"&"/@"	""&"/@"	"
&
/@
	
9&9/@9	9"3"	+""(" 4" 4	"
 
"$ (,		"*	" 	" %		"
 
	""""""0"	""&/@	"4"	1" &*2 # 
	 rr   r	  )r   z$torch._inductor.codecache.LocalCache)r;  rX   r   r   )r;  rX   r   zOptional[Callable[[Any], Any]])r;  rX   r   rg  )rs  r   r   r   )re   rX   r  r!  r6  re  r   rQ  )r>  /Union[FusedSchedulerNode, GroupedSchedulerNode]r   rQ  )r>  r
  r  r  r   r(  r   rQ  )r   )r  zlist[list[int]]r  r:  r  r7  r   z	list[int])r  r	  r	  r	  r   rQ  r&  rR  )re   z	ir.IRNoder   r	  )re   rX   r   r	  )
__future__r   r  r	  rW  r  r  r  r-  r]  ri  r(  rq  rr  r	  r  r   r   r   r   r   r	   r
   r   r   typing_extensionsr   torch.utils._ordered_setr   r&   r   collections.abcr   r   r   typesr   r   r  torch._inductor.async_compiletorch.utils._pytreerR  _pytreer9  torch._dynamo.utilsr   r   torch._inductor.codecacher   r   torch._inductor.irr   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr   torch.utils._sympy.symbolr   r   r    torch.utils._tritonr!   r  r"   r#   r$   r%   r'   analyze_preserves_zero_maskr(   codegen.commonr)   r*   r+   comm_analysisr,   r-   r.   r/   r0   r1   excr2   r3   fx_utilsr4   r5   r6   r7   r8   r9   r:   r   r;   r  r<   r=   runtime.hintsr>   runtime.runtime_utilsr?   r@   rw   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   virtualizedrS   	getLoggerr   r  _logginggetArtifactLoggerrb  r  rd  rT	  r   rY   rV  rZ   r[   r]   	dataclassr  rZ  rX   r  r  r  rF  r  rZ  r  r>  r  r  rp  rc   rD  rM  r   r  r  rL  r  r  r  r  r  r  r  r  r  r  r	  r   rr   rj   <module>r)
     s   "          	     , S S S ' /  <<    $ $ $ 6 ? 7 M > O O * D D D M M ; : 2 $    J ( 7 &    &  !^^--hA
NN44XO  >>;;$  11(LI 34y 4T]t_\ \~ h8 h8 h8V 4_ 4 4t1 t1n 2 2(' #L T"
 
 #
*  *K
*K4*K ,*K 
	*KZW 1 W"5. 5L*% L*^
@	$@ $ 
	,}** }*@PG0 PGfl:!3 l:^b, bP #%+#++  + 	+\0%01C0	08
1 
 
 
> %??, 4$
&`:H `:HFue err   