
    ȅik                      S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKJr  S SKJrJrJrJrJr  S SKJr  S SKrS SKrS SKrS SKJr  S SKJr  S SKJr  S S	KJr  S S
KJ r   S SK!J"r"  S SK#J$r$J%r%J&r&  S SK'J(r(J)r)J*r*J+r+  SSK,J-r-  SSK.J/r/J0r0J1r1  SSK2J3r3  SSK4J5r5J6r6  SSK7J8r8J9r9J:r:  \(       a  S SK;J<r<  SSK0J=r=  SSK>J?r?  SSK@JArA  SSKBJCrC  SSKDJErEJFrFJGrG  SSK1JHrHJIrIJJrJ  SSKKJLrLJMrMJNrNJOrOJPrPJQrQJRrRJSrSJTrTJUrU  SSKVJWrWJXrXJYrY  SSKZJ[r[  SS K\J]r]J^r^J_r_J`r`  SS!KaJbrbJcrc  SS"KdJereJfrfJgrgJhrhJiri  \(       a  S S#K;JjrjJkrkJlrl  S S$KJmrm  \R                  " \o5      rp\R                  R                  \oS%5      rs\R                  R                  \oS&5      rt\R                  R                  \oS'5      ru\`" 5       R                  rw\"" / S(Q5      rxS?S@S) jjry\R                   " S* S+5      5       r{ " S, S-\{5      r| " S. S/\{5      r}SAS0 jr~\" S1\]\]S29r\R                   " S3 S45      5       r " S5 S6\_\   \\   5      r " S7 S8\I5      r\R                  " S9S:9 " S; S<5      5       r " S= S>\5      rg)B    )annotationsN)Counter)AnyGenericOptionalTYPE_CHECKINGUnion)TypeVar)metrics)MultiTemplateBuffer)analyze_memory_coalescing)free_unbacked_symbols)immutable_dict)
OrderedSet)FloorDivIdentityModularIndexing)free_symbol_is_type
prefix_strsymbol_is_typeSymT   )counters   )configir	scheduler)prologue_preserves_zero_mask)	code_hashPyCodeCache)	MemoryDepStarDepWeakDep)CallableIRNode)!indexing_dtype_strength_reduction)CoordescTuner)DeviceProperties)
green_textlast_power_of_2yellow_text)BaseSchedulerNodeBaseScheduling	WhyNoFuse)
cache_property_on_selfexpr_fits_within_32bitget_dtype_sizeIndentedBufferPlaceholderprefix_is_reductionsympy_index_symbolsympy_product
sympy_subsunique)ops
OpsWrapperV   )BlockPatternMatcher)CSEVariableindex_prevent_reorderingKernelPythonPrinter)MultiKernelSizeHintMultiKernel)DisableReductionEnableReductionNodeScheduleEntryNodeScheduleMarkerSIMDKernelFeatures)IterableIteratorSequence)CoalesceVarAnalysis
perf_hintsschedulefusion)zyxr0_r1_c                l    [         R                  R                  R                  R                  nUb  U$ U $ N)torch	_inductorr   triton	max_tiles)defaultr[   s     V/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/codegen/simd.pyget_max_tilesr^   ^   s-    &&--77I!-9:7:    c                     ^  \ rS rSrSr\R                  R                  \R                  R                  S.               S	U 4S jjjr\	\
S
S j5       5       rSS jr\	\
SS j5       5       rSrU =r$ )IterationRangesc   a  
Each range tree represents multiple sets of iteration indexing
in a single tiled dimension in the output kernel.

If you have two loops ranges one (4, 3, 2) and another (4, 6),
then the range tree will be:
        4 (i0)
    3 (i1)  6 (i3)
    2 (i2)
Where i0 is shared between both loops, but then the split into
different indexing vars.  All loop ranges must iterate over
the same number of elements.
)divisorlengthc                  > [         T
U ]  5         Xl        X l        X0l        X@l        XPl        Xpl        Xl        X`l	        Xl
        g rW   )super__init__namevar_list
var_rangesnumelprefixrc   rd   kernelroot)selfrh   ri   rj   rk   rl   rm   rc   rd   rn   	__class__s             r]   rg   IterationRanges.__init__s   s=     		 $
	r_   c                ,    [        U R                  5      $ rW   )r5   rl   ro   s    r]   is_reductionIterationRanges.is_reduction   s     #4;;//r_   c                ,    [        U R                  5      $ rW   )r6   rh   rs   s    r]   symbolIterationRanges.symbol   s    !$)),,r_   c                |    [         R                  " 5        VVs0 s H  u  pX!_M	     nnnX0R                     $ s  snnf rW   )r   itemsrl   )ro   symtrl   prefix_to_symts       r]   r{   IterationRanges.symt   s;     <F;K;K;MN;M<4&,;MNkk** Os   8)	rc   rm   rd   rh   rk   rl   rn   ri   rj   )rh   strri   list[sympy.Symbol]rj   dict[sympy.Symbol, sympy.Expr]rk   
sympy.Exprrl   r~   rm   
SIMDKernelrn   IterationRangesRootreturnNoner   boolr   zsympy.Symbol)r   r   )__name__
__module____qualname____firstlineno____doc__sympySOnerg   propertyr0   rt   rw   r{   __static_attributes____classcell__rp   s   @r]   ra   ra   c   s    . ww{{ % 3	
    " 
 0 0  0- +  +r_   ra   c                     ^  \ rS rSrSr S                     SU 4S jjjrSS jrSS jrSS jrSS jr	    SS jr
SS	 jr    SS
 jrSrU =r$ )r      z
Root of a iteration range tree that represents a single
tiled dimension in the output kernel. It contains multiple
sets of iteration represented with IterationRangesEntry.
c          
        > Uc  0 n[         TU ]  U/ 0 UUUU S9  X@l        0 U l        X`l        U(       a  U R
                  (       a  U	b   eXpl        Xl        Xl        Xl	        g )N)rh   ri   rj   rk   rl   rm   rn   )
rf   rg   indexnodes	pid_cachert   is_loop
tensor_dimgrid_dimhas_zdim)ro   rh   rk   rl   r   rm   r   r   r   r   r   rp   s              r]   rg   IterationRangesRoot.__init__   sx     I 	 	
 
=?
 *3
 t00X5EFF$  r_   c                >    SU R                   < SU R                   S3$ )NzIterationRangesRoot(, z, ...))rh   rk   rs   s    r]   __repr__IterationRangesRoot.__repr__   s    %dii]"TZZLGGr_   c                f    U R                   R                  5        H  nUR                  5         M     g rW   )r   valuescache_clear)ro   nodes     r]   r   IterationRangesRoot.cache_clear   s%    JJ%%'D (r_   c                2    [        U R                   S35      $ )Nr   )r6   rl   rs   s    r]   	index_symIterationRangesRoot.index_sym   s    !T[[M"788r_   c                   [         R                  R                  R                  X-  U R                  5      (       a  [        U R                  5       U5      nO[        U R                  5       X5      nX0R                  ;  a  [        U R                   [        [         R                  R                  5       3UUUU 5      nU[         R                  R                  UR                  5       '   U R                   R#                  UR                  5       5        X R$                  UR                  5       '   X@R                  U'   U R                  U   $ )z6
Lookup a given RangeTreeEntry, creating it if needed
)r<   graphsizevarsstatically_known_equalsrk   r   r   r   r   IterationRangesEntryrl   nextrm   iter_vars_countrange_tree_nodesrw   ri   appendrj   )ro   rc   rd   exprr   s        r]   lookupIterationRangesRoot.lookup   s     7733G4DdjjQQDNN,g6D"4>>#3WEDzz!';;-QXX%=%= >?@D 8<AHH%%dkkm4MM  /-3OODKKM*#JJtzz$r_   c                    [         R                  R                  n/ n[        U5       H'  nUR	                  U R                  X$5      5        X$-  nM)     / [        U5      Q$ rW   )r   r   r   reversedr   r   )ro   lengthsrc   itervarsrd   s        r]   construct_entries%IterationRangesRoot.construct_entries   sT     ''++w'FOODKK89&G ( %(#$$r_   c                j    U R                  U5       Vs/ s H  o"R                  5       PM     sn$ s  snf rW   )r   rw   )ro   r   es      r]   	constructIterationRangesRoot.construct   s+    $($:$:7$CD$Cq
$CDDDs   0c           
       ^^^	^
 SS jmUR                    Vs/ s H,  n[        R                  R                  R	                  U5      PM.     nnU Vs/ s H)  oD(       d  M  UR
                  U R
                  :X  d  M'  UPM+     nnUR                  U4S jS9  [        R                  R                  m/ m	/ m
UU	U
4S jnU H|  n[        R                  R                  R                  UR                  T5      (       d8  U" U R                  T[        UR                  T5      5      5        UR                  mU" U5        M~     [        R                  R                  R                  U R                   T5      (       d,  U" U R                  T[        U R                   T5      5      5        / [#        T	5      Q/ [#        T
5      Q4$ s  snf s  snf )z,Figure out vars from this tree used in indexc                   [         R                  R                  R                  U R                  [
        R                  S9n[         R                  R                  R                  U R                  [
        R                  S9S:H  nX(       + 4$ )z
Gets the key for sorting nodes. When two nodes have the
same divisor, the node with length as 1 should be handled
first so the current divisor is not changed after multiplied
node.length. Returns `not length_is_one_hint` for ascending
sort.
fallbackr=   )r<   r   r   	size_hintrc   r   unbacked_symint_fallbackrd   )rS   divisor_hintlength_is_one_hints      r]   get_sort_key8IterationRangesRoot.vars_and_sizes.<locals>.get_sort_key   s}     77++55		F$C$C 6 L   **HHv'F'F +    !"899r_   c                   > T" U 5      $ rW    )rS   r   s    r]   <lambda>4IterationRangesRoot.vars_and_sizes.<locals>.<lambda>  s	    ar_   keyc                   > TR                  U R                  5       5        TR                  U R                  5        TU R                  -  mg rW   )r   rw   rd   )r   rc   
index_varssizess    r]   add/IterationRangesRoot.vars_and_sizes.<locals>.add  s5    dkkm,LL%+Gr_   )rS   r   r   ztuple[int, bool])free_symbolsr<   rm   r   getrl   sortr   r   r   r   r   r   rc   r   r   rk   r   )ro   r   sr   nr   r   rc   r   r   r   s          @@@@r]   vars_and_sizes"IterationRangesRoot.vars_and_sizes   sR   
	:& <A;M;MN;Ma**..q1;MN!CEqQ188t{{+BEC

0
1''++
	, D77##;;DLL'RRDKK$,,)HIJ,,I  ww77

GLLGXdjj'%BCD&*%&(:(5/(:::/ OCs   3F=
GG;G)r   r   r   r   r   r   r   rW   )rh   r~   rk   r   rl   r~   r   intrm   r   r   Optional[dict[str, str]]r   r   r   Optional[int]r   r   r   r   r   r   r   r~   r   r   r   )rc   r   rd   r   r   r   )r   list[sympy.Expr]r   zlist[IterationRangesEntry])r   r   r   r   )r   r   r   z+tuple[list[sympy.Symbol], list[sympy.Expr]])r   r   r   r   r   rg   r   r   r   r   r   r   r   r   r   r   s   @r]   r   r      s     /3)!)! )! 	)!
 )! )! ,)! )! ")!  )! )! 
)! )!VH9 .%'%	#%E/;/;	4/; /;r_   r   c                     ^  \ rS rSr            SU 4S jjrSS jrSS jrSS jrSS jrSS jr	SS jr
SS	 jrS
rU =r$ )r   i,  c                  > [         TU ]  UUR                  U-  UR                  UR                  UR
                  UUUR                  UR                  S9	  XPl        [        R                  " S 5      " U R                  5      U l        X@l        g )N)	rh   rk   ri   rj   rl   rc   rd   rm   rn   )rf   rg   rk   ri   rj   rl   rm   rn   parent	functools	lru_cache_codegencodegenr   )ro   rh   rc   rd   r   r   rp   s         r]   rg   IterationRangesEntry.__init__-  sx     	,,'__((==== 	 
	
  **40?	r_   c                    SU R                    SU R                   SU R                   SU R                   SU R                   S3$ )NzIterationRangesEntry(r   ))rh   rc   rd   r   rj   rs   s    r]   r   IterationRangesEntry.__repr__D  sH    &tyykDLL>DKK=PRSWS\S\R]]_`d`o`o_ppqrrr_   c                N   ^ U4S jU l         S U R                   l        TU l        g )Nc                    > T $ rW   r   )rh   s   r]   r   /IterationRangesEntry.set_name.<locals>.<lambda>H  s    tr_   c                     g rW   r   r   r_   r]   r   r   I  s    4r_   )r   r   rh   )ro   rh   s    `r]   set_nameIterationRangesEntry.set_nameG  s    ##/ 	r_   c                8    U R                   R                  5         g rW   )r   r   rs   s    r]   r    IterationRangesEntry.cache_clearL  s      "r_   c                X    [         R                  R                  U 5        U R                  $ rW   )r<   rm   codegen_iteration_ranges_entryrh   rs   s    r]   r   IterationRangesEntry._codegenO  s    	//5yyr_   c                   / n[        U R                  [        R                  5      (       a  U$ [        U R                  [        [
        45      (       d   [        U R                  5      5       eU R                  R                  SS   H{  n[        U[        R                  [        R                  45      (       a  M4  UR                  n[        U5      S:  d  MQ  [        S U 5       5      (       d  Mj  UR                  U5        M}     U$ )Nr=   r   c              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7frW   )r   r   SIZE.0r   s     r]   	<genexpr>8IterationRangesEntry.precomputed_args.<locals>.<genexpr>\  s!      ,:AQN1dii00's   '))
isinstancer   r   Symbolr   r   typeargsIntegerr   lenallr   )ro   precomputed_argsargsymbolss       r]   r  %IterationRangesEntry.precomputed_argsS  s    -/dii..##$))h%@AAR4		?RA99>>!"%CcEMM5<<#@AA**w<!# ,:A, ) ) %++C0 &  r_   c                ,    [        U R                  5      $ rW   )hashrh   rs   s    r]   __hash__IterationRangesEntry.__hash__b  s    DIIr_   c                b    [        U[        5      (       d   eU R                  UR                  :H  $ rW   )r  r   rh   )ro   others     r]   __eq__IterationRangesEntry.__eq__e  s)    %!56666yyEJJ&&r_   )r   r   rh   r   )rh   r~   rc   r   rd   r   r   r   r   ra   r   r   r   )rh   r~   r   r   r   )r   r   r   r   )r  objectr   r   )r   r   r   r   rg   r   r   r   r   r  r  r  r   r   r   s   @r]   r   r   ,  sk      	
    
.s
# ' 'r_   r   c                    U [        S5      :X  a  gU [        S5      :X  a  g[        R                  " U 5      (       a  g[        U 5      $ )Ninfzfloat("inf")z-infzfloat("-inf")zfloat("nan"))floatmathisnanrepr)values    r]   constant_reprr  j  s<    e	%-		E		;r_   CSEVariableType)boundr\   c                  4    \ rS rSr% S\S'   S\S'   S\S'   Srg)	PartialAccumulateiw  r~   buffer_namereduction_typer   r  r   N)r   r   r   r   __annotations__r   r   r_   r]   r!  r!  w  s    Jr_   r!  c                    ^  \ rS rSr% Sr\rS\S'   S\S'   SrS\S'   S	\S
'        S>               S?U 4S jjjr	S@S jr
S r\\SAS j5       5       rSBS jrSCS jr\SDS j5       rSES jr            SFS jrSGS jrSHS jrSIS jrSES jrSES jrSJS jrSAS jrS@S jrSKS jrSDS jrSDS jrSLS jr      SMS  jr       SMS! jr!SNS" jr"SOS# jr#\$      SPS$ j5       r%\&\'RP                  RR                  4       SQS% jj5       r*\&\'RP                  RR                  4       SRS& jj5       r+    SSS' jr,\&      STS( j5       r-SUS) jr.SUS* jr/SVS+ jr0    SLS, jr1SWS- jr2SXS. jr3SYS/ jr4S0 r5 SZ       S[S1 jjr6\7Rp                        S\S2 j5       r9S]S3 jr:\$S4 5       r;S^S5 jr<S6 r=S7 r>S8 r?S9 r@S: rAS; rBS_S< jrCS=rDU =rE$ )`r   i~  zg
Common base class for Triton/Halide codegen which both use flattened indexing rather than loop nests.
zCallable[[sympy.Expr], str]sexprkexprFr   allow_block_ptrr~   kernel_namec                  >^  Uc  0 n[         TT ]  5         UT l        UR                  5       T l        [        5       T l        [        5       T l        UR                  5        VV	s0 s H/  u  pU[        R                  R                  R                  U	5      _M1     sn	nT l        / T l        0 T l        [         R"                  " 5       T l        UR'                  5       T l        Ub  UOT R+                  5       T l        UT l        UT l        Ub  UOT R3                  5       T l        UT l        T R9                  5       T l        S T l        [         R"                  " 5       T l        ST l         [B        RD                  RF                  (       a  T R                  RH                   Hv  n
[K        U
[L        RN                  5      (       d  M$  [K        U
RP                  [R        RT                  5      (       d  MO  U
RP                  RW                  5       S:X  d  Mo  ST l           O   [X        RZ                  SU 4S jj5       nUT l.        T R_                  U5        ST l0        / T l1        g s  sn	nf )NFdotTc                   > [         R                  R                  R                  U TR	                  5       5      n TR
                   H  nTR                  X5      n M     TR                  U 5      $ rW   )r<   r   r   simplify_with_rangesrj   range_treescombine_contiguous_dimscombine_modular_indexing_pairs)r   treero   s     r]   simplify_indexing.SIMDKernel.__init__.<locals>.simplify_indexing  sY    GG$$99%ARSE((44UA ) 66u==r_   r   )r   r   )2rf   rg   featuresget_mutations	mutationsr3   bodyindexing_coderz   r<   r   r   simplifynumelsr.  r   	itertoolscountr   rt   inside_reduction should_use_cooperative_reductioncooperative_reductiontiling_scorestilingshould_use_persistent_reductionpersistent_reductionmix_order_reductionwant_no_x_dimno_x_dimr   store_output_ctris_native_matmulr   rZ   native_matmulnode_scheduler  r   SchedulerNoder   r   ComputedBufferget_reduction_typer   cacher2  initialize_range_treersplit_sizesaved_partial_accumulate)ro   rA  r4  r   override_persistent_reductionoverride_cooperative_reductionr@  rD  rl   valr   r2  rp   s   `           r]   rg   SIMDKernel.__init__  s    I !//1"$	+-FLlln
FT{vFAGG$$--c22n
 79JL(0 ( 5 5 7 .9 +668 	"
 ?L-3 -8 *557 	!
 *= **,(, ) 1 %==&&33tY%<%<=="499b.?.?@@		446%?,0D) 4 
	> 
	> "3""9-AC%a
s   #6Ic                    SU S3$ )Nz<STORE_OUTPUT_>r   )ro   is     r]   _get_store_output_subgraph_name*SIMDKernel._get_store_output_subgraph_name  s    s!$$r_   c                j    [        U R                  5      n[        R                  " US-
  SS9U l        U$ )Nr=   )startstep)r   rG  r;  r<  )ro   totals     r]   get_store_output_count!SIMDKernel.get_store_output_count  s.    T**+ )eaia Hr_   c                :    [        S U R                   5       5      $ )Nc              3  8   #    U  H  n[        U5      v   M     g 7frW   )r5   )r   rl   s     r]   r   0SIMDKernel.num_reduction_dims.<locals>.<genexpr>  s     I[6&v..[   )sumr:  rs   s    r]   num_reduction_dimsSIMDKernel.num_reduction_dims  s     IT[[IIIr_   c                    [         erW   NotImplementedError)ro   dtypes     r]   dtype_to_strSIMDKernel.dtype_to_str      !!r_   c                6    U R                   R                  5       $ rW   )r4  select_index_dtypers   s    r]   get_index_dtype_as_torch_dtype)SIMDKernel.get_index_dtype_as_torch_dtype  s    }}//11r_   c                @    U R                  U R                  5       5      $ rW   )rl  rq  rs   s    r]   index_dtypeSIMDKernel.index_dtype  s      !D!D!FGGr_   c                    gNFr   rs   s    r]   rE  SIMDKernel.want_no_x_dim      r_   c                  ^ [        U4S j[         5       5      nU(       + =(       d    U(       + nS	S jn/ SQn	[        [        U	5      5      n
SS/nU(       a  UnOU(       a  U
nOX-   nU" X5      nU" U	[        5      n/ n[	        U5       H|  u  nn[        U5      nUR                  U5      nUR                  U5      nUc  UOUnUR                  [        U S3TU   UUU UU=(       a    U R                  (       + UUST;   S9
5        M~     U$ )
Nc              3  6   >#    U  H  oT;   d  M
  Uv   M     g 7frW   r   )r   rl   r:  s     r]   r   3SIMDKernel.construct_range_trees.<locals>.<genexpr>  s      %
!-v61AFF   		c                d   ^ [        U4S jU  5       5       VVs0 s H  u  p#X2_M	     snn$ s  snnf )Nc              3  6   >#    U  H  oT;   d  M
  Uv   M     g 7frW   r   )r   rT  masks     r]   r   OSIMDKernel.construct_range_trees.<locals>.filtered_index_map.<locals>.<genexpr>  s     2U#3PT33#r}  )	enumerate)seqr  idxrT  s    `  r]   filtered_index_map<SIMDKernel.construct_range_trees.<locals>.filtered_index_map  s4    )22U#2U)U)UXS)U  s   ,)rS   rR   rQ   rT   rU   r   rQ   )r   r   r   r   r   )r   zdict[Any, int])
r   all_prefixeslistr   r  r5   r   r   r   rC  )ro   r   r=  rt   r:  rF  active_prefixesno_r_dimr  	grid_dimspointwise_tensor_dimsreduction_dimstensor_dimstensor_dim_mapgrid_dim_mapr.  rX  rl   r   r   r   s       `                r]   construct_range_trees SIMDKernel.construct_range_trees  s*    % %
!-%
 
 (';|+;	
 $	 $Xi%8 9(K/K/@K ,KI))\B"?3IAv.v6L'++F3J#''/H!)AxE#he$6N'(J1J1J-J)% F] 4& r_   c                    U R                  UU R                  U R                  R                  5       U R                  U R
                  5      nU R                  R                  U5        g rW   )r  r=  r4  rt   r:  rF  r.  extend)ro   r   r.  s      r]   rO   SIMDKernel.initialize_range_tree  sR    00!!MM&&(KKMM
 	,r_   c                    g)zZ
Hook called right before codegen with every index that will be
used in the fused kernel.
Nr   )ro   indicess     r]   finalize_indexingSIMDKernel.finalize_indexing'  s    r_   c                p    U R                   nSU l          U R                  XU5      X@l         $ ! X@l         f = frw  )r=  store)ro   rh   r   r  priors        r]   store_reductionSIMDKernel.store_reduction-  s5    %% %	*::d51$)!E!s   - 5c                    grw  r   rs   s    r]   r>  +SIMDKernel.should_use_cooperative_reduction5  ry  r_   c                    grw  r   rs   s    r]   rB  *SIMDKernel.should_use_persistent_reduction8  ry  r_   c                t    [        [        R                  R                  S U R                   5       5      5      $ )Nc              3  T   #    U  H  oR                   R                  5       v   M      g 7frW   )rj   rz   r   r1  s     r]   r   (SIMDKernel.var_ranges.<locals>.<genexpr>=  s"      *4DD%%''4Ds   &()dictr;  chainfrom_iterabler.  rs   s    r]   rj   SIMDKernel.var_ranges;  s4    OO)) *484D4D* 
 	
r_   c                :    [        S U R                   5       5      $ )Nc              3  P   #    U  H  n[        UR                  S L5      v   M     g 7frW   )r   r   r  s     r]   r   0SIMDKernel.triton_tensor_ndim.<locals>.<genexpr>C  s#     Q@P3td233@Ps   $&)re  r.  rs   s    r]   triton_tensor_ndimSIMDKernel.triton_tensor_ndimB  s    Q@P@PQQQr_   c                \    S/U R                  5       -  nSX!'   SSR                  U5       S3$ )Nr   :[r   ])r  join)ro   rX  r   s      r]   indexing_size_strSIMDKernel.indexing_size_strE  s7    42244499U#$A&&r_   c                   S/U R                  5       -  nU R                   H_  nUR                  c  M  UR                  (       a  U R                  (       d  M6  UR
                  R                  5        S3XR                  '   Ma     U$ )N1BLOCK)r  r.  r   rt   r=  rl   upper)ro   r   r1  s      r]   dense_size_listSIMDKernel.dense_size_listJ  sp    //11$$D&$$(=(=(=,0KK,=,=,?+@)Foo& % r_   c                    UR                   nUR                  c  U R                  5       nU SU S3$ S/U R                  5       -  nSXAR                  '   SR	                  U5      nU SUR                  5        SU S3nU$ )	Nzmask = tl.full(z, True, tl.int1)r   r  r   zmask = tl.full([zBLOCK], True, tl.int1)[r  )rl   r   dense_size_strr  r  r  )ro   entryrS   sizestrr   suffixouts          r]   create_constant_maskSIMDKernel.create_constant_maskT  s    LL#))+GSy0@AA42244"%5!#AGGI;.EfXQO
r_   c                L    U R                  5       nSSR                  U5       S3$ )Nr  r   r  )r  r  ro   r   s     r]   r  SIMDKernel.dense_size_str_  s)    $$&499U#$A&&r_   c                   [        U[        5      (       d  U$ UR                  S   nU R                  R	                  U5      =nc  U$ [        XUR                  05      n[        R                  R                  R                  U5      n[        UUR                  R                  5       UR                  R                  [        R                  R                   UR                  R"                  5      R%                  5       05      $ Nr   )r  r   r  r   r   r8   r   r<   r   r   r0  rn   r   r   r   r   r   rk   rw   )ro   r   rS   	tree_node	new_indexs        r]   r0  )SIMDKernel.combine_modular_indexing_pairsc  s    %11LJJqM..22155I>Lu)..&9:	GG$$CCIN	((*INN,A,AGGKK!5!5-&(
 	
r_   c                    [         R                  R                  R                  U5      =n(       a  Uu  pE[	        U R                  XB5      U5      $ U R                  X5      $ rW   )r<   r   r   expand_floor_divr   _combine_contiguous_dims)ro   r   r1  
expand_resr  denominators         r]   r/  "SIMDKernel.combine_contiguous_dimsu  sU     ))::5AA:A%/"ID99)JKXX00==r_   c                   [        U[        R                  [        R                  45      (       a  U$ UR	                  U5      u  p4[        U5      S::  a  U$ [        R                  R                  R                  X4[        U/X45      5      u  pVnXT:X  a  U$ UR                  U5      n[        U[        [        X6" U5      5      5      5      n	U	$ )z9
More aggressive simplification to merge contiguous dims
r=   )r  r   r  r  r   r  r<   r   r   _simplify_loopsr@   r   r8   r  zip)
ro   r   r1  r   r   	new_sizesreindex_prunenew_index_varsr  s
             r]   r  #SIMDKernel._combine_contiguous_dims~  s     eemmU\\:;;L //6
u:?L%&WW%5%5%E%E7S&
"	F L	2ud3z7>;R+S&TU	r_   c                   ^ ^ T R                   S   R                  =(       d    T R                  m[        R                  U U4S j5       nU" 5       $ )Nc               3    >#    T R                   R                  5       (       d  T R                  (       a   eS v   g T(       a  T R                  5         ST l         S v   T(       a  T R                  5         ST l        g ! ST l        f = f7f)NFT)r4  rt   r=  codegen_body)ro   should_flushs   r]   ctx)SIMDKernel.disable_reduction.<locals>.ctx  sn     ==--//0000 !!#$)D!-%%'(,%%s   AB	A= 5B	=	BB	)r.  r   r?  
contextlibcontextmanager)ro   r  r  s   ` @r]   disable_reductionSIMDKernel.disable_reduction  sE    ''+33Qt7Q7Q		"	"	- 
#	-$ ur_   c                    [        U5      [        U R                  5      :X  d   e[        XR                  5       VVs/ s H  u  p#UR                  U5      PM     snn$ s  snnf rW   )r  r.  r  r   )ro   r   rd   rangess       r]   
set_rangesSIMDKernel.set_ranges  s^    7|s4#3#34444 #&g/?/?"@
"@ V$"@
 	
 
s   Ac                  ^^^^ [        S U 5       5      (       a  U  Vs/ s H  n/ PM     sn/ 4$ [        R                  R                  mU  Vs/ s H  n/ PM     snmU  Vs/ s H  nTR	                  U5      PM     snm[
        R                  " 5       mSUUUU4S jjn      SS jn/ nSnU GHt  n	/ n
U	 GHV  nTR                  US5      (       a  U
R                  S 5        M/  U[        T5      :  aJ  TR                  TU   S5      (       a0  US-  nU[        T5      :  a  TR                  TU   S5      (       a  M0  [        T5      S:H  =(       a    TS   S:H  nUS	-   [        T5      :  a  TR                  UTU   TUS-      -  5      (       a  U(       a}  TR                  UTU   TUS-      -  5      (       d  [        eTU   nTUS-      n[        XU-  5      nU
R                  U" X/U" X5      U" US-   U5      U" US	-   U5      /5      5        GM^  US-   [        T5      :  a  TR                  UTU   5      (       d$  TR                  [        UTU   5      S5      (       ak  TR                  UTU   5      (       d  [        UTU   5      eTU   n[        UTU   5      nU
R                  U" U/U" X5      U" US-   U5      /5      5        GM  U[        T5      :  d  GM+  U
R                  [        R                  " U" X5      5      5        GMY     UR                  U
5        GMw     [        S
 T 5       5      (       d   ST SU 35       eTU4$ s  snf s  snf s  snf )Nc              3  >   #    U  H  n[        U5      S :H  v   M     g7fr   Nr  )r   rd   s     r]   r   5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>  s     6gFs6{ags   c                   > TR                  U5      nTR                  TU    U5      (       d  [        TU    U5      e[        TU    U5      TU '   TU    R	                  U5        [        T5      $ rW   )r9  statically_known_multiple_of	CantSplitr   r   r   )rX  r   
new_ranges	remainingsv	var_counts     r]   	add_range5SIMDKernel._split_iteration_ranges.<locals>.add_range  si    ;;t$D229Q<FF	!d33#IaL$7IaLqM  &	?"r_   c                V   ^ ^ [        T5      [        T 5      S-   :X  d   eSUU 4S jjnU$ )z`
Builds the nested expression:
  ((...((s1*v[i1] + v[i2]) * s2 + v[i3]) ... ) * sk + v[i(k+1)])
r=   c                Z   > U TS      n[        TTSS  5       H  u  p#X!-  X   -   nM     U$ )Nr   r=   )r  )	flat_varsr   r   r  idxsr   s       r]   getterISIMDKernel._split_iteration_ranges.<locals>.make_combined.<locals>.getter  s=     a)!%ab2FA8in4D 3r_   )r  r   r   r   r  )r   r  r  s   `` r]   make_combined9SIMDKernel._split_iteration_ranges.<locals>.make_combined  s0     t9E
Q...  Mr_   r   r=   c                6    [         R                  R                  $ rW   )r   r   Zero)_s    r]   r   4SIMDKernel._split_iteration_ranges.<locals>.<lambda>  s    EGGLLr_      r  r   c              3  z   #    U  H1  n[         R                  R                  R                  U5      S :H  v   M3     g7f)r=   Nr<   r   r   r   r   s     r]   r   r  ;  s*     Iy!177##--a0A5ys   9;zfailed to set ranges  )rX  r   r   r   r   r   )r   r   r  z	list[int]r   z(Callable[[list[sympy.Expr]], sympy.Expr])r  r<   r   r   r9  r;  r<  r   r   r  statically_known_gtr  r  r   operator
itemgetter)groupsr   groupr  gr  r  return_getters_groupscurrent_grouplength_groupreturn_getterssizeis_bmm_then_pwsize1size2size3r  r  r  r  s                   @@@@r]   _split_iteration_ranges"SIMDKernel._split_iteration_ranges  s    6g666$*+F5BF+R//WW:@-A&Qb&-A
-34VR[[^V4	OO%		# 	#	#	+4	5	" !##LN$--dA66"))*@A#c)n49S9Sm,: :
 "Q&M $c)n49S9Sm,: :$ "%Y1!4!K2!9K!A%I6..i6=STCT9UU  ' ::i6=STCT9UU  (%m4E%ma&78E$T5=9E"))%"N )- ? )-!*;U C )-!*;U C	 #Q&Y7**4=1IJJ **8D)M:R+SUVWW ::i6  (i.FGG%m4E$T9]+CDE"))%"G )- ? )-!*;U C %s9~5&--$//	-0NOq %v "((8{ $~ IyIII 	
#I;ay9	
I 000K , .B4s   MM"Mc                *   [         R                  R                  n[        US   5      S:X  af  UR	                  U[
        R                  R                  5      (       d7  UR	                  [        U5      [        US   5      U-  5      (       a  US   U/4$ U$ )z1Fill in the reduction numel of lengths if missingr=   r   )	r<   r   r   r  r   r   r   r   r7   )clsr  r   reduction_numelr   s        r]   prepare_split_iteration_lengths*SIMDKernel.prepare_split_iteration_lengths@  s     77##wqz?a00%''++NN00f%gaj)O; 
 AJ 122r_   c                l    U R                  XU5      n U R                  X5        g! [         a     gf = fNTF)r  r  r  )r  r  r   r  s       r]   is_compatibleSIMDKernel.is_compatibleT  s>     55fW	''8 		s   & 
33c                X   U R                    Vs0 s H  o"R                  UR                  _M     nnU R                  (       d7  U H1  n[	        U5      (       d  M  [
        R                  R                  X4'   M3     / UR                  5       QnU R                  XQU R                  5      $ s  snf )a  
Split and set iteration ranges for the kernel based on the provided lengths.

This method maps the kernel's tiling structure to the node's iteration space,
handling both pointwise and reduction dimensions appropriately.

Args:
    lengths: A sequence of sequences of symbolic expressions representing
            the sizes of different dimensions for each node.

Returns:
    A list of lists of symbolic expressions representing the mapped
    iteration variables for each dimension.
)r.  rl   rk   r=  r5   r   r   r   r   map_kernel_groups_to_node_sizesr  )ro   r   rtrA  rl   r  s         r]   split_and_set_rangesSIMDKernel.split_and_set_rangesc  s    $ 150@0@A0@"))RXX%0@A $$ &v..%*WW[[FN !
 $6==?# 33FT__UU Bs    B'c           
     T   [        U5      [        U5      :X  a%  [        S [        X!5       5       5      (       a  U" U6 $ U R                  X5      u  pE/ [        R
                  R                  U" U6 5      QnU VVs/ s H  ow Vs/ s H
  o" U5      PM     snPM     snn$ s  snf s  snnf )aY  
We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1).

To do this we need to split up the iteration space of i0 into something like:
    for i1 in s0:
      for i2 in s1:
        i0 = i1*s1 + i2
        ....

This function matches and resplits lengths to the groups of
this kernel to enable tiled + non-tiled fusions.
c              3     #    U  H?  u  p[         R                  R                  R                  [	        U5      U-
  5      S :H  v   MA     g7fr  r<   r   r   r9  r7   )r   rS   r  s      r]   r   =SIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<genexpr>  s=      /
, GG%%mA&6&:;q@,s   AA	)r  r  r  r  r;  r  r  )	r  r  r   r  r  r  r   fnsfns	            r]   r#  *SIMDKernel.map_kernel_groups_to_node_sizes  s    & w<3v;&3 /
G,/
 ,
 ,
 w'',/,G,G,X)
LY__22:z3JKL8MN8M,"H,8MNN,Ns   :	B$BB$B$c                6    [        U[        R                  5      $ rW   )r   r   TMPro   r   s     r]   is_indirect_indexingSIMDKernel.is_indirect_indexing  s    "5$((33r_   c                  ^ U R                  U5      (       a  gS/[        U R                  5      -  nUR                   Hn  nX0R                  ;  a  M  U R                  U   n[        UR                  [        5      (       d   eX$R                  R                  ==   UR                  -  ss'   Mp     [        R                  R                  R                  m[        U4S j[        X R                  R!                  5       5       5       5      $ )NFr=   c              3  J   >#    U  H  u  pT" U5      T" U5      :g  v   M     g 7frW   r   )r   	idx_range
iter_ranger9  s      r]   r   ,SIMDKernel.is_broadcasted.<locals>.<genexpr>  s*      
)P%	 Y8J#77)Ps    #)r1  r  r:  r   r   r  r   r   r   rd   r<   r   r   r9  anyr  r   )ro   r   index_numelsrw   r  r9  s        @r]   is_broadcastedSIMDKernel.is_broadcasted  s    $$U++sS--((F222))&1Eell,?@@@@++,<, ) 77##,, 
),\;;;M;M;O)P
 
 	
r_   c                    [        U[        5      (       a)  SSR                  [        U R                  U5      5       S3$ U R                  U R                  U5      5      $ )a`  
Convert an index expr to a string that can be used in output code.
e.g. a sympy expression "s2" may actually appear as "ks1" in the generated kernel.

Index expressions often need to be passed in as arguments to the triton kernel.
Rename_indexing and codegen_indexing keep track of the needed indices and add
new parameters to the function signature.
r  r   r  )r  r  r  mapindex_to_strr'  rename_indexingr0  s     r]   r>  SIMDKernel.index_to_str  sQ     eT""tyyT%6%6!>?@BBzz$..u566r_   c                   U R                  U5      n[        U[        R                  R                  R
                  5      n[        UR                  [        R                  5      5      (       d-  [        UR                  [        R                  5      5      (       a3  UR                  [        R                  R                  R
                  5      n[        UR                  [        R                  5      5      (       a  UR                  [        R                  5       Ho  nUR                  n[        U5      S:  d  M   [        S U 5       5      (       d  M9  U[        R                  R                  R                  U5      0n[        X5      nMq     U R                  U5      n[        U[         5      (       d  UOUR"                  S   nU R%                  U5      $ )Nr   c              3  v   #    U  H/  n[        U[        R                  [        R                  45      v   M1     g 7frW   )r   r   r   PRECOMPUTED_SIZEr   s     r]   r   .SIMDKernel.prepare_indexing.<locals>.<genexpr>  s0      ,$ #1tyy$2G2G&HII$s   79)r2  r8   r<   r   r   precomputed_replacementsr  atomsr   floorceilingsubsr   r  lookup_precomputed_sizer  r   r  codegen_indexing)ro   r   ar
  replacements
simp_indexs         r]   prepare_indexingSIMDKernel.prepare_indexing  sQ    &&u-5!''"2"2"K"KLu{{5;;'((CEMM0J,K,KJJqww//HHIE u{{5==)**[[/ ..w<!# ,$, ) ) %&qww'7'7'O'OPQ'R#SL&u;E 0 ++E2
 )X>>JJOOTUDV 	 $$Z00r_   c                    U R                    Vs/ s H(  oR                  (       a  U R                  (       d  M&  UPM*     sn$ s  snf rW   )r.  rt   r=  )ro   ts     r]   active_range_treesSIMDKernel.active_range_trees  s6    ''
'!~~AVAVA'
 	
 
s
   %AAc                8   [         R                  R                  R                  XR	                  5       5      n[        UR                  [        S9 H  nX R                  ;   d  M  0 nU R                  U   R                  5        H.  n[         R                  R                  R                  U5      X4'   M0     [        U5      S:  a5  [        U R                  U   R                  U5      U R                  U   l        U R                  U   R                  5         M     U$ )Nr   r   )r<   r   r   r-  rj   sortedr   r~   r   r  rJ  r  r8   r   r   )ro   r   symrM  pss        r]   rK  SIMDKernel.codegen_indexing  s    ww44T??;LM$++5C+++  "//4EEGB'(ww'7'7'O'OPR'SL$ H|$q(6@--c277$7D))#.3 %%c*224 6 r_   c                    [        S5      e)NzNYI: codegen_nan_checkri  rs   s    r]   codegen_nan_checkSIMDKernel.codegen_nan_check  s    !":;;r_   c                    [         R                  R                  n[        U R                  R
                  5       H  nUR                  U5        M     g rW   )r<   r   wrapper_coder   r  workspace_argsgenerate_workspace_deallocation)ro   wrapperwss      r]   deallocate_workspaces SIMDKernel.deallocate_workspaces  s8    ''&&499334B33B7 5r_   c                    [        S5      e)NzNYI: call_kernelri  )ro   rh   r   deallocate_wss       r]   call_kernelSIMDKernel.call_kernel  s     ""455r_   c              #     #    U R                   nU R                  nU(       a  [        R                  " X5      n[        R
                  " U5      nXl         X l         Uv   X0l         X@l        g! X0l         X@l        f = f7f)z:Context manager to add an additional mask to tl.load/storeN)
_load_mask_load_otherr:   logical_andr;   _unwrap)ro   r  r  r  	prior_vals        r]   
mask_loadsSIMDKernel.mask_loads  sj     
 $$	??4/D!!$' 	)J#O( $O(s   AA=A, A=,A::A=c                &   U R                   R                  5        VVs0 s H  u  p#X#R                  _M     nnn[        X5      n0 nU R                   H5  n[        UR                  5      n[        XXS05      [        XXS05      -
  Xh'   M7     U$ s  snnf )a  
This gets the stride of the index for each of the tiling variables
(technically, it does it at index 0)

For example, if
xindex = x0 + 512*x1 + 1024*r0
x0 = (xindex//512)
x1 = (xindex % 512)
r0 = rindex // 1024

this function would return
{xindex: 512, rindex: 1024}
r=   r   )r   rz   r   r8   r.  r6   rh   )	ro   r   kvindex_to_tile_indexesindex_in_tile_varsstrides
range_treer   s	            r]   get_strides_of_loadSIMDKernel.get_strides_of_load&  s     8<7L7L7R7R7T U7TtqFF7T U'E**J":??3A#$6A?*"FC GJ +
  !Vs   Bc                d    [        U[        5      (       a  [        [        X5      5      $ U " U5      $ rW   )r  tupler=  )r,  r  s     r]   _map_tuple_or_scalarSIMDKernel._map_tuple_or_scalar>  s(    eU##R((%yr_   c                    [         R                  " U R                  R                  5       Vs/ s H  nUR	                  5       PM     nn[        [        S U5      5      $ s  snf rW   )rH   
only_nodesr4  rJ  estimate_flopsre  filter)ro   r   flopss      r]   r  SIMDKernel.estimate_flopsD  s[     +55dmm6Q6QR
R !R 	 
 6$&''	
s   Ac           	     V   / n[        [        U R                  R                  R	                  5       5      5      nU R                  R                  5       u  p4  nU R                  R                  5       n[        R                  R                  R                  [        U R                  R	                  5       5      [        R                  S9n[!        U5       GHH  u  pxX;  a  UR#                  S5        M  [        R                  R%                  U5      n	[        R                  R                  R                  U	[        R                  S9n
X:  a  [&        [(           " 5       nSnXX    HT  n[+        U[,        [.        45      (       a  UR1                  SU 35        US-  nM9  UR1                  UR2                  5        MV     [        U5      U-  nOU
n[        R                  R5                  U5      n[7        U5      nUR#                  UU-  S[9        Xr:  5      -   -  5        GMK     [;        U5      $ )a  
Try the best to estimate the total size (in bytes) of the
kernel's inputs and outputs, which is used for estimating the memory
throughput of this kernel. This information is used for checking how
far we are from the peak memory bandwidth. It's important that
we want to avoid overestimating the sizes of the inputs and outputs,
because it can wrongfully give us a very large memory traffic value,
which may be even larger than the theoretical bandwidth and thus
become very misleading. This is particularly problematic for cases
where we slice some inputs. In those cases, we should only count
the size of the "slices" instead of the original inputs, because
only the slices contribute to the real memory traffic.
r   r   no_index_dep_r=   )r  r9   r  inplace_buffersr   python_argdefsr4  buf_accessesr<   r   r   r   r7   r:  r   r   r  r   	get_numelr   r   r  r"   r#   r   r   	get_dtyper2   r   re  )ro   nbytesninplace_argsr  	call_argsr  	out_numelrX  r	  	arg_numelbuf_sizer  no_index_dep_countdeprk   rk  
dtype_sizes                    r]   estimate_kernel_num_bytes$SIMDKernel.estimate_kernel_num_bytesK  s    F499#<#<#C#C#EFG!YY557a}}113 GG$$..$++,,./44 / 
	  	*FA &a ))#.Iww''11F$C$C 2 H # %S/+%&"',C!#'9::m4F3G$HI*a/*CII. - Gy0 GG%%c*E'.JMM%*,C8I4J0JKL? +@ 6{r_   c           	        [        U R                  R                  5      S:X  aG  [        U R                  R                  5      S:X  a$  [        U R                  R                  5      S:X  a  gU R                  R                  5       u  p#pESnU GHr  n[        R                  R                  U5      nU(       d  M,  UR                  5       n	[        U	R                  5      S:X  d  MW  [        U	R                   V
s/ s H  oS:X  d  M
  U
PM     sn
5      S:X  a  M  [        R                  " U	R                  5      nUc  UnM  Xk:w  d  M  [        SU S3SU S	U 3-   5      n[        R!                  U5        U Vs/ s Ht  n[        R                  R                  U5      (       aK  [        R                  " [        R                  R#                  U5      R                  5       R                  5      OSPMv     nnU Vs/ s H`  n[        R                  R                  U5      (       a7  [        R                  R#                  U5      R                  5       R                  OSPMb     nnU Vs/ s HE  nU[        R                  R$                  ;   a  S
O!U[        R                  R&                  ;   a  SOSPMG     nnU V
s/ s H  oR(                  PM     nn
[        SU SU SU 3SU SU S3-   5      n[        R!                  U5          g   [+        SU S35      n[        R!                  U5        gs  sn
f s  snf s  snf s  snf s  sn
f )zZ
Print message if the kernel have mixed layout inputs.
Only care about 4D tensor for now.
r=   r   Nr  r   zExpected stride order z, but found stride orderr  z for kernel 
GraphInputIntermediateBufferz  param names z
  buf names z
  strides z	
  sizes z
  sources 
z%All the inputs for the triton kernel z have uniform layout)r  r  input_buffersoutput_buffersr  r  r<   r   try_get_buffer
get_layoutr  r   get_stride_orderstrider,   logwarning
get_buffergraph_inputsname_to_bufferrh   r*   )ro   r)  argdefsr  
_signaturer  uniform_stride_orderarg_namebuflayoutrS   stride_ordermsgrh   stride_order_list	size_listsource_listargdef_namess                     r]   warn_mix_layoutSIMDKernel.warn_mix_layout  s    		''(A-DII,,-2DII--.!3
 ,0II,D,D,F)J#!H''((2C^^%F6;;1$6;;9;aq&;9:a?!226==A'/+7()9%01E0FF^_l^<}EFC KK$ %.) %.D 7711$77 ++GG..t4??AHH "	"
 %. & ) %.	! %.D 7711$77 **40;;=BB!" %.	  ! %.# %.D	  177#7#77 %  177#9#99 2!	"
 %.   # 5<#<GqFFGL#<%(nYK|\m[no&ykk]"MNC KK$a "b 3K=@TU
 	C[ :)!# $=s'   6	L(
L(
5A;L-6A'L2#AL75L<c                   [         R                  " XSU5      nSU l        [         R                  " U R                  R
                  U5      n[         R                  " X45      nSU l        [         R                  " X%5      n[         R                  " Xf5      n[         R                  " XSU5      n[        R                  " XXU45      $ )Nre  FT)r:   	reductionr=  
index_exprr4  r  truedivsubmulr;   rm  )	ro   rk  r  sum_rnumelmeandxdx2m2s	            r]   welford_reduce_fallback"SIMDKernel.welford_reduce_fallback  s    }}U5%8 % = =uE{{4( $WWU!ggbo]]54!!4V"455r_   c                    [         R                  " XSU5      n[         R                  " X#5      n[         R                  " U5      n[         R                  " XSU5      n[        R
                  " X645      $ )Nmaxre  )r:   r  r  expr;   rm  )ro   rk  r  vmaxr  r  vsums          r]    prepare_softmax_twopass_fallback+SIMDKernel.prepare_softmax_twopass_fallback  sT    }}U5%8gge"ggcl}}U5#6!!4,//r_   c                    [         erW   ri  rs   s    r]   codegen_kernelSIMDKernel.codegen_kernel  rn  r_   c                    g rW   r   rs   s    r]   r  SIMDKernel.codegen_body      r_   c                    g rW   r   )ro   r  s     r]   r   )SIMDKernel.codegen_iteration_ranges_entry  r  r_   )rj  rk  r7  r   r?  r4  r8  r=  rH  r   rD  r6  rF  r:  rC  r   r.  rP  rQ  r2  rG  rA  r@  )NNNNF)rA  dict[str, sympy.Expr]r4  rI   r   r   rR  Optional[bool]rS  r  r@  Optional[dict[str, sympy.Expr]]rD  r   r   r   )rX  r   r   r~   r  )rk  torch.dtyper   r~   )r   r  r   r   )r   r   r=  r   rt   r   r:  r  rF  r   r   list[IterationRangesRoot])r   zdict[str, str]r   r   )r  Sequence[sympy.Expr]r   r   )rh   r~   r   r   r  r?   r   r   )r   r   )r   z	list[str])r   r   r   r   )r   r   r1  r   r   r   )r   z'contextlib.AbstractContextManager[None])r   r   r   r   )r  Iterable[sympy.Expr]r   Sequence[Sequence[sympy.Expr]]r   zStuple[list[list[sympy.Expr]], list[list[Callable[[list[sympy.Expr]], sympy.Expr]]]])r  r  r   r  r  r   r   r  )r  r  r   r  r  r   r   r   )r   r  r   list[list[sympy.Expr]])r  r  r   r  r   r  )r   r   r   r   )r   r   r   r~   )r   r  )r   r   r   r   r   )NT)rh   r~   r   zOptional[IRNode]rf  r   r   r   )r  zUnion[str, OpsWrapper]r  Union[int, float]r   zIterator[str])r   r   r   r   )r   r   )r  r   )Fr   r   r   r   r   pexprr&  r$  r(  rg   rY  r_  r   r0   rf  rl  rq  rt  rE  r  rO  r  r  r>  rB  rj   r  r  r  r  r  r0  r/  r  r  r  staticmethodr  classmethodr   r   r   r  r   r%  r#  r1  r:  r>  rO  rS  rK  r[  rc  rg  r  r  ro  rx  r|  r  r  r  r  r  r  r  r   r   r   r   s   @r]   r   r   ~  s    */E&.&&!OT! /38<9=9=$)AD%AD %AD ,	AD
 (6AD )7AD 7AD "AD 
AD ADF%
 J  J"2 H H5+5 5 	5
 &5 5 
#5n-*
R'
	'
$>>':>	>':	(0
 L1$L1/ML1
L1 L1\ 
 ',ggkk	$ 0 $	
 
( & 
 ',ggkk	$ 0 $	
 
  V5 V	 VD O$O 0O
 
 O O84
,7$1$1 
$1L

"<8 OS66/6GK6	6
 )*)3D)	) )&0  
(CJFP
60" r_   r   c                  H   \ rS rSr% Sr\rS\S'   S rS r	\	r
\	rS rS rS	 rS
 r S/   S0S jjrS r S1   S2S jjr  S3S jr\      S4S j5       rS5S jr    S6S jrS rSS.S jr    S7S jrS8S jr      S9S jrSSS.   S:S jjrS r S;           S<S jjrS r \!\"RF                  " S5      S=S  j5       5       r$\!      S>S! j5       r%\!      S?S" j5       r&\!        S@S# j5       r'\!  SAS$ j5       r(\!          SBS% j5       r)\!        SCS& j5       r*\!        SDS' j5       r+\!\,RZ                  R\                  S4   SES( jj5       r/\!\,RZ                  R\                  S4   SFS) jj5       r0S* r1SGS+ jr2 SH SIS, jjr3S- r4S.r5g)JSIMDSchedulingi  zc
Single Instruction Multiple Data parent class used for fusion across
multiple different backends.
z	type[Any]kernel_typec                &    [        S U 5       5      $ )Nc              3     #    U  H7  n[         R                  R                  R                  [	        U5      5      v   M9     g 7frW   r)  r   s     r]   r   *SIMDScheduling.group_fn.<locals>.<genexpr>  s-     P%QQWW%%..}Q/?@@%s   ?Ar{  r  s     r]   group_fnSIMDScheduling.group_fn  s    P%PPPr_   c                
  ^^^^ [        U[        R                  5      (       d  [        U[        R                  5      (       a  [        R                  R                  X5      $ UR                  u  nu  nmUR                  u  nu  mm[        X5      nUR                  5       (       a3  UR                  5       (       d  UR                  5       (       a  U" S5        OGUR                  5       (       a2  UR                  5       (       d  UR                  5       (       a  U" S5        UR                  5       (       a  UR                  5       (       a  UT:H  =(       a    TT:H  nU(       d  SSKJ	n  UR                  X5      nU(       d  U" SUTTT5        U(       a  UR                  5       (       d  UR                  5       (       aj  UR                  5       (       d  X!p!U R                  UR                  5       UT5      m[        UU4S jUR                  5        5       5      (       d	  U" S5        gU$ UR                  5       (       Gd  UR                  5       (       Gd  UT:X  a  TT:X  d  UR                  5       (       d  U" SUTTT5        gUR                  5        Hl  nUR                  5       (       a    OVUR                  5       UR!                  5       -  (       d  MB  UR                  u  nu  pXI:X  a  TU
:X  a  M`  U" S	UU	TU
5          g   X4 H  nUR                  5       (       d  M    g
   U R                  UR                  5       UT5      nU R                  UR                  5       UT5      nU R                  UR                  5       UR                  5       -   UT5      n["        R$                  R&                  (       a`  S
n[)        U5      S:  a)  [)        U5      S:  a  Xs=:H  =(       a    U:H  Os  nOX:H  nO[)        U5      S:  a  X:H  nU(       d  U" SUUU5        gg
UR                  5       (       d  UR                  5       (       a  TS:X  a  TS:w  d   eUTT-  :X  a  [        UU4S jUR                  5        5       5      (       d	  U" S5        g["        R$                  R*                  (       ag  UR                  5       (       dR  [-        U R                  UR                  5       U5      R/                  5       5      US4TTS44;   nU(       d  U" S5        U$ g
UT:w  a  U" S5        UT:H  $ UR                  5       (       a  UR                  5       (       a   eU R1                  X!5      $ )z
Hook called by Scheduler to determine if the Triton backend
can fuse node1 and node2.  These nodes might already be
FusedSchedulerNodes.
z&Split scan cannot fuse with reductionsr   )MixOrderReductionz1numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)c              3     >#    U  H6  n[         R                  TR                  5       UR                  5       TS 9v   M8     g7f)r  N)r   r   r   
get_ranges)r   n2rnumel1rA  s     r]   r   *SIMDScheduling.can_fuse.<locals>.<genexpr>8  s?       0 ,,' -  0s   >Az/invalid loop order and tiling for native matmulFz5numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)z:numel/rnumel mismatch prologue mismatch (%s, %s), (%s, %s)Tr   ztiling mismatch (%s, %s, %s)r=   c              3  p   >#    U  H+  n[         R                  TT4UR                  5       5      v   M-     g 7frW   )r   r   r  )r   r   numel2rnumel2s     r]   r   r    s3      . ,,fg->OO.s   36z"nodes numel/rnumel incompatibilityzinvalid tiling for reductionznodes numel incompatibility)r  r   ForeachKernelSchedulerNodecan_fuser  r/   is_split_scanrt   torch._inductor.schedulerr  rH  select_tiling	get_nodesr  is_templateused_buffer_namesget_buffer_namesr   rZ    tiling_prevents_pointwise_fusionr   tiling_prevents_reduction_fusionr{  r   can_fuse_horizontal)ro   node1node2r  numel1whyreduction_can_fuser  r   	pro_numel
pro_rnumelr   tiling1tiling2tiling3condis_reduction_tiling_validr  r  r  rA  s                    @@@@r]   r  SIMDScheduling.can_fuse   s    eYAABBj977G
 G
 77@@NN${{FG${{FG%  )<)<)>)>!!##<=  ""5+>+>+@+@!!##<=E$6$6$8$8!'6!1!Hg6H%G%6%?%?%M"%G "&&((E,B,B,D,D --//#(5 ++EOO,=vwO  $oo/	   IJ %%!!##E,>,>,@,@f$G);((**O ! !& 1++--!  $557%:P:P:RR$59ZZ22I & 3:8M \ & ) ' * $)# !2& ^==?? $
 (():FGLG(():FGLG((!EOO$55vwG }}==w<!#7|a'&<<W<&1\A%"-D6	 !!!##(:(:(<(<a<GqL00')) "__.   <= MMBB!--//05**5??+<fELLN1  !,1- 5:;4412V##!!##E,>,>,@,@@@''55r_   c           
       ^^^^^^^ / m[         [        R                     " 5       m[        5       m[        5       mS mUU4S jnUU4S jnU4S jnUUUU4S jn[        R                  UUUU4S j5       nUU4S jn	U H  n
U
T;   a  M  TR                  U
5        U" U
5      (       aT  U	" U
T5      (       a  U" 5           S S S 5        T(       a"  U" U
5      (       d  T=(       d    [        T5      mOS mU" U
5        M}  U" U
5      (       a#  U" 5          TR                  U
5        S S S 5        M  [        ST ST S	U
R                  S
    35      e   T$ ! , (       d  f       N= f! , (       d  f       M  = f)Nc                ~   > U R                   u  nu  p#UT:H  =(       a    UT:H  =(       d    UTT-  :H  =(       a    US:H  $ Nr=   r  r   r  
node_numelnode_rnumelrk   r  s       r]   fits_in_main_body@SIMDScheduling.generate_node_schedule.<locals>.fits_in_main_body  sF    +,77(A(
%'AK6,A efn,A1Ar_   c                `   > U R                   u  nu  p#UT:H  =(       a    US:H  =(       a    TS:g  $ r  r  r  s       r]   fits_outside_reductionESIMDScheduling.generate_node_schedule.<locals>.fits_outside_reduction  s2    +,77(A(
&K;!+;K!Kr_   c                d   > U R                   R                   H  nUR                  T;   d  M    g   gr  )read_writesreadsrh   )r   readcurrent_loop_buffer_usages     r]   expect_improved_memory_usageKSIMDScheduling.generate_node_schedule.<locals>.expect_improved_memory_usage  s,    ++99 99 , r_   c                  > TR                  U 5        TR                  U 5        TR                  U R                  R                   Vs/ s H  oR
                  PM     sn5        U R                  5       (       a  [        U [        R                  5      (       a|  [        U R                  [        R                  5      (       aS  [        U R                  R                  [        R                  5      (       d   TR                  U R                  5       5        g TR                  U R                  R                    Vs/ s H  oR
                  PM     sn5        g s  snf s  snf rW   )r   r   updater  r  rh   rt   r  r   rK  r   r   rL  dataScanget_namewrites)r   rS   r  donerJ  not_ready_yet_nodess     r]   schedule_node_in_loopDSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loop  s    HHQK  #%,,amm>Q>Q-R>Qff>Q-RS
   q)"9"9::qvvr'8'899"166;;88#''

5)00!--BVBV1WBVQ&&BV1WX .S 2Xs   E6Ec               3  b  >#    T(       a  TS   [         L a  TR                  5         OTR                  [        5        T(       a1  TR	                  T[        5        TR	                  TS-   [         5        S mS v   TR                  [         5        TR                  5         T R                  5         g 7f)Nr  r=   )rF   popr   rE   insertclear)r  maybe_split_indexrJ  r$  s   r]   end_current_reduction_loopISIMDScheduling.generate_node_schedule.<locals>.end_current_reduction_loop  s      r!2o!E!!#$$%56 $$%68HI$$%6%:OL$(!  1%%'%++-s   B,B/c                   > TS:X  a  gTU R                   -  (       d  gU(       a  [        US   [        [        45      (       a   e[	        T5      $ )Nr=   Fr  )	ancestorsr  rF   rE   r   )r   rJ  r$  r  s     r]   #requires_closing_previous_reductionRSIMDScheduling.generate_node_schedule.<locals>.requires_closing_previous_reduction  sS    {&7 b!O5E#F* *   +,,r_   zunexpected group: (r   z) != r=   )
r   r   r-   r  r  r   r  r   rj  r  )ro   r   rk   r  r  r  r  r%  r,  r0  r   r  r#  r+  rJ  r$  s     ``       @@@@@r]   generate_node_schedule%SIMDScheduling.generate_node_schedule  sT   #%)5568 0:|5?\!+/		L		Y 	Y" 
	"	"	. 
#	.	- Dt|HHTN &&6t]KK35 6 -5QRV5W5W(9(OS=O% )-%%d+'--/1!((. 21 *)%6(%

1O - 4 ' 65 21s   <EE!
E	!
E0	c                    UR                   UR                  p2UR                  UR                  5       -  (       d"  UR                  UR                  5       -  (       a   eU R	                  X#5        g rW   )r  r  r/  get_operation_names_codegen_mix_order_reduction)ro   r   r  r  s       r]   codegen_mix_order_reduction*SIMDScheduling.codegen_mix_order_reduction  sW    zz4::u OOe&?&?&AAOOe7799	
 
 	))%7r_   c                    UR                  5       n/ n/ nU H<  nUR                  5       (       a  UR                  U5        M+  UR                  U5        M>     X44$ rW   )r  rt   r   )ro   r   r   
reductions	epiloguess        r]   #_split_mix_order_reduction_epilogue2SIMDScheduling._split_mix_order_reduction_epilogue  sX     
	D  ""!!$'  &	 
 $$r_   c           	        UR                   UR                  pTUR                  nU R                  UXES./USSSS.5      S   nUR                  (       d   eUR
                  (       d   eX'l        U R                  Xg5        UR                  R                  [        UR                  5      UR                  S   -  UR                  S   UR                  -   S-
  UR                  -  -  S	[        R                  S
9u  pn
U
S:X  d   SU
< 35       eU   UR                  5         SSS5        [         R"                  " 5       n[$        R&                  " U5         U   U(       a#  UR)                  [*        R,                  " SS95        UR/                  5       nSSS5        SSS5        U(       a)  WR1                  [3        [4        R6                  5      S5      nXyW4$ ! , (       d  f       N= f! , (       d  f       N[= f! , (       d  f       Nd= f)z
for_benchmark:
    True if the generated code is for benchmarking. We need make
    sure benchmark harness code is generated.
)rS   rT   NT)r4  r@  rD  rR  r   rT   rS   r=   F)rk  zws_off=)benchmark_kerneltriton_)rk   r  rJ  create_kernel_choicesrC  rD  rP  !codegen_node_schedule_with_kernelr  	workspacer  rQ  r:  rX   r  r  r  	ExitStackr<   set_kernel_handlerenter_contextr   patchr  replacer~   r4   KERNEL_NAME)ro   kernel_features
split_sizefor_benchmarkrk   r  rJ  rm   r  ws_namews_offstacksrc_codes                r]   -_generate_kernel_code_for_mix_order_reduction<SIMDScheduling._generate_kernel_code_for_mix_order_reduction  s    (--/N/Nv'55++()+!%'+15		
 	 ****))))'..}E $[[22//0mmE"#c"V%7%77!;@R@RRT ++ 3 
F {(wviL({!  $$&!!&)5##FLL$$GH,,.H ,1)
 
  ''K,C,C(DiPH(( V ,15))s0   GG6;G%G6
G"%
G3	/G66
HNc                    [         erW   ri  )ro   modn_spills_threshold
node_namess       r]   benchmark_codegened_module)SIMDScheduling.benchmark_codegened_moduleO  s
     "!r_   c                  ^ ^^!^" [         R                  R                  T5      u  m"n[        R                  R
                  R                  [        R                  " T"U5      5      (       d  T R                  UT5      $ UU"4S jnU" 5       n[        =R                  S-  sl        [        R                  R
                  R                  [        R                  " T"U5      5      (       d   eT R                  U5      u  pg/ nU HD  n	U	R                  5         U	R                  5       n
U
R                  5         UR!                  U
5        MF     T R#                  TR%                  5       U-   T"U5      n['        UT"U5      m![(        R*                  R,                  R.                  (       d  [,        R0                  R2                  ch  [,        R0                  R4                  (       d*  [,        R6                  (       d  [,        R8                  (       a  U!U 4S jn[:        R<                  " UUS5      nT R?                  T!USS9u  pn[A        US   RB                  RD                  5      n0 nU(       Gac  U GH  n	U	RG                  5       S   RB                  RI                  5       nU	RG                  5       S   RJ                  S   RB                  RG                  5       S   RB                  RI                  5       nUUU'   T R                   (       d   eT R                   RL                  RO                  U	RG                  5       S   RJ                  S   RB                  RI                  5       5        [        R                  RP                  RO                  U5        GM!     URR                   H.  nURU                  URV                  URV                  5      Ul+        M0     T RY                  XU5      nUUl-        []        U5      Ul.        [        R^                  " U5         T!Ra                  5        HD  nURG                  5       S   RB                  RI                  5       U;  d  M4  URc                  5         MF     S S S 5        [        R                  Rd                  Rg                  S5        T Ri                  US 5        URk                  URZ                  SS	9  [        R                  =RP                  URP                  -  sl(        [        R                  =Rl                  URl                  -  sl6        [o        U5      [o        URR                  5      :X  d   e[        R                  Rd                  Rq                  T"U-   S-
  U-  5      n[s        URR                  5       GH  u  nnURV                  nU S
U 3nU S
U 3nSU SU 3nSSS.nURU                  URt                  URt                  5      nU SU SU SU SU SU SU S3n[        R                  Rw                  U5      =n [(        Rx                  :w  a	  USU  S3-  n[        R                  Rd                  R{                  U5        [        R                  Rd                  R|                  RO                  U5        GM     UR                  5         U(       a  T R                  U5        T R                  5         g ! , (       d  f       GNj= f)Nc                 t  > [         R                  R                  b  [         R                  R                  $ [        R                  " TR                  5       5      n U R                  nUS-  n[        R                  R                  R                  T5      n[        [        X2-  5      S5      n[        US5      nU$ )N         )r   rZ   mix_order_reduction_split_sizer)   create
get_devicemulti_processor_countr<   r   r   r   r  r+   min)device_propnum_smestimated_num_splits
numel_hintrK  r  rk   s        r]   _pick_split_sizeESIMDScheduling._codegen_mix_order_reduction.<locals>._pick_split_sizeZ  s    }};;G}}CCC +11%2B2B2DEK 66F#)A:  ))33E:J_Z-OPRTUJZ-Jr_   r=   c                   > TR                  TU SS9u    p[        R                  " U5      nTR                  U5      u  pAU$ )NTrK  rL  )rQ  r    loadrW  )candidate_split_sizer  rP  rT  msrJ  ro   s        r]   _bench;SIMDScheduling._codegen_mix_order_reduction.<locals>._bench  sO    !%!S!S#3"& "T "1
 "&&x077<	r_   r[  Frj  r   z!# Call mix order reduction kernel)rf  z * (z + 1) * aminamax)rb  r  z = r  z : z].view(r   z).z(dim=0)z.to(r   )Br   r  get_numel_rnumelr<   r   r   evaluate_exprr   Gtr6  r   r7  r<  cancel_reduction_splitextract_pw_from_reductionswap_pw_red_dimensionr   r2  r  rI   rX   rY   r   deterministicrZ   r^  'mix_order_reduction_autotune_split_sizemax_autotunecoordinate_descent_tuningr(   autotune_single_fieldrQ  r   r   _split_sizeget_outputsr!  usersremoved_opsr   removed_buffersrQ  r   r"  define_kernelr)  r   rE  scheduler_nodesmark_runr^  make_commentcodegen_commentrg  inplaced_to_remover  codegen_python_sizevarr  r#  r  r  	writeline	allocatedrc  _codegen_nodesfree_buffers_in_scheduler)#ro   r  r  r  rg  rK  node2_reductionsnode2_epilogueconverted_nodessubnode	convertedrJ  rn  rm   rM  rP  is_split_reductionrenamebufnameusernamepartial_accumr)  r   nsplitr  r"  
stride_strr\  endreduction_type2opopnamefinal_reducebuffer_dtyperJ  rk   s#   ``                               @@r]   r6  +SIMDScheduling._codegen_mix_order_reductionT  s   !33DDUKvww--ehhuf.EFF44UEBB	  &'
 	++q0+ww--ehhuf.EFFFF ,0+S+S,
( 'G**,99;I++-""9-	 (
 33OO/
 -]E6J &&44<<DEE&&33 '<<J %)$V$V! %W %
! ""21"5":":"F"FG+!--/277@@B'')!,U1T++-+ T((*	  #+w~~%~**..'')!,2215::CCE ''++G4 , "(!@!@,2JJ!--}/H/H-) "A
 ((&I($X.!!&)'779 ##%a(--668FMMO : * 	
))*MN]D16--UC	6#9#99	""f&?&??" ?#s6+J+J'KKKK%%<<Z!#
2
 #,F,K,K"LC'33K"83vh/Je3zl+EcU(:,/C! '**,,m.J.JF *]#gYawc#gfXUWX^W__abhaiipqL !" 1 1+ >>5;;N$|nA 66GG  **<8 GG  **..{;- #M0 	$$&/&&(c *)s   A[[
[c                r   U R                   (       d   eU Vs/ s H.  o3R                  5       U R                   R                  ;  d  M,  UPM0     nnU(       d  g [        US S9R                  u  nu  pVU R                  XU5      n[        R                  SU5        U R                  [        XuXb5      5      $ s  snf )Nc                4    [        U R                  5       5      $ rW   r   rt   rS   s    r]   r   /SIMDScheduling._codegen_nodes.<locals>.<lambda>  s    c!..:J6Kr_   r   zSchedule:
 %s)
r   r!  r  r  r  r2  schedule_logdebugcodegen_node_schedulerI   )ro   r   coalesce_analysisr   r  rk   r  rJ  s           r]   r  SIMDScheduling._codegen_nodes  s    
 ~~~"
"TmmoT^^=W=W&WDU 	 
  ,KLRR?E33E&I+];))}VO
 	

s   +B4B4c                >   U R                   (       d   eUR                  5        Vs/ s H/  nUR                  5       U R                   R                  ;  d  M-  UPM1     nn[	        U5      S:X  a  g[
        R                  R                  R                  R                  (       af  [	        U5      [	        WR                  5       5      :w  a4  U R                   (       d   e[         R                  " U R                   U5      n[        U5      nOSnU R                  X#5      $ s  snf )z;
Given a set of pre-fused nodes, generate a Triton kernel.
r   N)r   r  r!  r  r  rX   rY   r   rZ   coalesce_tiling_analysisFusedSchedulerNoder   r  )ro   r   r   r  s       r]   codegen_nodeSIMDScheduling.codegen_node  s     ~~~ (
(}}dnn&@&@@ ( 	 

 u:???!!((AA5zS!122~~%~ 33DNNEJ 9$ ? $""5<<!
s   ,DDc                :   [         R                  " [         R                  5      R                  n[	        U 5      (       d  gU Vs/ s H8  nUR                  5       (       d  M  UR                  5       R                  5       PM:     nnU H  nUR                  5       (       a  M  [        U[        R                  5      (       d  M;  UR                  5       nUU Vs/ s H8  nUR                  5       (       d  M  UR                  5       R                  5       PM:     sn-  nM     [        S U 5       5      (       d  g[        R                  R                  R!                  X5        U H,  n[        R                  R                  R!                  Xb5        M.     gs  snf s  snf )NFc              3  8   #    U  H  n[        U5      v   M     g 7frW   )r1   )r   r  s     r]   r   8SIMDScheduling.can_use_32bit_indexing.<locals>.<genexpr><  s     FID)$//Ird  T)rX   iinfoint32r  r1   has_tensor_outputr  storage_sizer  r   MutationOutputget_mutation_buffersr  r<   r   r   	check_leq)rk   buffersint_maxr  	buf_sizesmutated_bufsr  s          r]   can_use_32bit_indexing%SIMDScheduling.can_use_32bit_indexing  sG    ++ekk*..%e,, 
$$& ,CNN))+ 	 
 C((**z#r?P?P/Q/Q"779++,,. 4CNN$113+ 	  FIFFF 	
""52DGG&&t5 /
s   F!"FF4"Fc                   UR                   nU R                  UUR                  UR                  UR                  5      u  p4U R                  UU/XS.5      nU H  nU R                  X&5        M     [        R                  " U5        U Hp  n[        R                  " U5         UR                  5       nSSS5        U R                  WX&5      n[        R                  SU5        Xl        [!        U5      Ul        Mr     A[#        U5      S:  a  [        U5      n	OUu  n	[        R                  " U	5         UR%                  5        H  n
U
R'                  5         M     SSS5        U V
s/ s H  n
[)        U
[*        5      (       d  M  U
PM     nn
U R-                  XR                  5        [.        R0                  R2                  (       a\  [        R4                  R6                  R9                  5         [        R4                  R6                  R;                  U	R                  U5        U	R=                  U	R                  5        [.        R0                  R2                  (       a(  [        R4                  R6                  R?                  5         [.        R@                  (       a  U	RC                  5         [.        RD                  (       a  U	RE                  US   R                  5        [        R4                  =RF                  U	RF                  -  sl#        [        R4                  =RH                  U	RH                  -  sl$        [        R4                  R6                  RJ                  (       a  [.        RL                  (       a  US   RN                  RQ                  5       nUR%                  5        H  n
U
RS                  5       nX;  a  M  U
RT                  c   eU
RT                  RW                  5       nUc  MH  [X        S   S==   S-  ss'   [        R4                  R6                  R[                  SUR\                  < S	U S
35        M     U R_                  5         g! , (       d  f       GN= f! , (       d  f       GN= fs  sn
f )z,
Generate code for nodes in kernel_features
)r4  r@  Nz+Generating kernel code with kernel_name: %sr=   r   inductorintermediate_hookszrun_intermediate_hooks(r   r   )0rJ  get_tiling_and_scoresrk   r  r  rA  rB  rC   merge_workspaces_inplacer<   rE  r  r  r  r  r)  r   r  r  r  r  r-   r  r   cppenable_kernel_profiler   r^   write_kernel_context_guard_beginwrite_kernel_context_guardrg  write_kernel_context_guard_endnan_assertsr[  r  r  r  supports_intermediate_hooksgenerate_intermediate_hooksr  live_output_buffersr!  r   get_origin_noder   r  rh   r  )ro   rJ  rJ  rA  tiling_scorekernelsrm   rP  r)  final_kernelr   base_scheduler_nodes	live_outsrh   origin_nodes                  r]   r  $SIMDScheduling.codegen_node_scheduleF  sV    (55#99!!++--	 
 ,,H(H

 F22=I ,,W5F%%f-!002 .,,X}MKIIC[Q!,(2F   w<!&w/L%O\!!,/'779 : 0 + 
*Tj?P.QD] 	  
 	13K3KL::++GG  AACGG  ;;(($ 	  !9!9:::++GG  ??A**,!!(()?)?@	<#?#??	""l&E&EE" GG  <<22  
;;=I'779}}(yy,,,"ii779*Z()=>!C>GG((221+2B2B1ERvQO : 	&&(y .- 0/
 
s$   "Q3(Q'Q1Q1
Q	
Q.c                (    U R                   " U0 UD6/$ rW   )r  )ro   rJ  kernel_argskernel_kwargss       r]   rA  $SIMDScheduling.create_kernel_choices  s'     
 	
r_   c           	     ^   U   [         R                  " 5       n0 nU H  nU[        L a!  UR                  UR	                  5       5        M-  U[
        L a  UR                  5         MH  UR                  5         UR                  UR                  5       5      nUR                  [        R                  UR                  R                  U5      R                  5       5      5        M     UR!                  UR#                  5       5        U H  nU[        L a!  UR                  UR	                  5       5        M-  U[
        L a  UR                  5         MH  [%        UR                  5        UR                  UR                  5       5      nUR'                  U5        M     S S S 5        g ! , (       d  f       g = frW   )r  rD  rE   rF  r  rF   closedecide_inplace_updater%  r  r  r  fromkeys_bodyindexing_from_argsr   r  keysr'   r   )ro   rJ  rm   rO  all_indexingr   r   s          r]   rB  0SIMDScheduling.codegen_node_schedule_with_kernel  s>   ((*EL &++''(@(@(BC_,KKM..0!'!<!<T__=N!OJ '' JJ99*ELLN & $$\%6%6%89 &++''(@(@(BC_,KKM 6djjA!'!<!<T__=N!OJLL, &- VVs   FF
F,Fonly_gen_src_codec               "
   0 nUR                  5       n/ n	U H  n
U
R                  5       nU	R                  U
5        X-  (       d  M/  [        U5      S:X  d   eX[	        [        U5      5      '   UR                  R                  [	        [        U5      5      5        / n	M     [        U	5      S:X  d   eU   U(       d  U/UQ H  nUR                  5         M     U" 5       nUR                  5       n[        U5       H  nUR                  U5      nUR                  U5         U H1  nUR                  UR                  UR                  5       5      5        M3     UR                   R#                  [%        5       5        SSS5        M     UR&                  R)                  5        GH@  u  nnSU S3nUR+                  UR-                  5       / 5      =n	(       d  M6  [/        S U	 5       5      n[0        R2                  " SU(       + 5         UR                  U5         U	 H  n[        UR                  5       5      S:X  aB  [        U	5      S:X  a3  [5        U5      (       a#  U=R6                  UR                  5       -  sl        UR                  UR                  UR                  5       5      5        M     UR                   R#                  [%        5       5        SSS5        SSS5        GMC     SSS5        [8        R:                  " U5         [=        W[>        5      (       d]  [@        RB                  RE                  URF                  RH                  5         URK                  S5        SSS5        URK                  S	S
S9  UR&                   H  nSU S3nURK                  US
S9  M     UR                  5       n[        U5       H%  nUR                  U5      nURK                  U5        M'     [=        U[>        5      (       a  UnOURM                  5       n/ UQUPUQn[0        RN                  (       aH  URQ                  5       S-  nURS                  5        SU SURU                  U5      RW                  5        3nU(       a  UsSSS5        $ U RY                  UUU5      Ul-        UsSSS5        $ ! , (       d  f       GM  = f! , (       d  f       GN= f! , (       d  f       GM:  = f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       g= f)z;
Helper method to codegen a single template kernel variant
r=   r   Nz<LOAD_INPUT_rW  c              3  @   #    U  H  oR                  5       v   M     g 7frW   )can_codegen_without_upcasts)r   p_ns     r]   r   :SIMDScheduling._codegen_single_template.<locals>.<genexpr>  s      5ESc7799^   ztriton.codegen_upcast_to_fp32z<DEF_KERNEL>z	<ARGDEFS>F)strictg    eAr  ).r  r  r   r  r   iterprologue_fused_inputsr   r  r_  rangerY  set_subgraph_bodyr   r%  r  cse
invalidater   named_input_nodesrz   r   r!  r  r   rG  r   #prologue_fused_inputs_preserve_zeror<   rE  r  r~   r   r&   current_originsr   originsfinalize_hookfinalize_remainingr?  r  imports_for_benchmark_kernelcodegen_kernel_benchmarkgetvaluer  r)  )ro   rm   rendertemplate_nodeepilogue_nodesprologue_nodesr  buf_name_to_prologue_grouptemplate_readsprologue_groupprologuenamesr   partial_codenum_store_subgraphsrX  subgraph_name
input_namebuffercan_codegen_without_upcastprologue_noderP  rJ  num_gbs                           r]   _codegen_single_template'SIMDScheduling._codegen_single_template  sj    &("&88:&H--/E!!(+%%5zQ&@N4U+<=,,00d5k1BC!# ' >"a'''$ +<^<DMMO = "8L"("?"?"A./ & F Fq I--m< .V%@%@AR%ST !/JJ))*,7 =< 0 '-&>&>&D&D&F"
F".zl! <%?%C%COO%r& >  25 5ES5 2.  7=W9W $55mD1?$'(F(F(H$IQ$N(+N(;q(@'CM'R'R(.(R(R,9,J,J,L)*(R !. 5 5$*$?$?(5(@(@(B%&!" 2@ #JJ11*,?! E  'G# p !!&)lC00YY..}/A/A/I/IJ ..~> K**;u*E %66
".zl! <**=*G 7
 #)"?"?"A./ & F Fq I**=9 0
 ,,,' (::<MnMmMnMM&&99;cA::<=Rj66v>GGIJL  !M *)P "&!3!3HmV!TFU *)Y =<& ED 9 Vv KJ *)s   3A+SAR$9AS.SS	B:R7	S	SA	T S.DT T $
R4.	S7
SS		
S	S
S+.
S=	8T  
Tc                (  ^^ SSK Jm  U4S jm/ n[        UR                  5      U/-    H[  n[	        U[        [
        45      (       a&  UR                  [        U4S jU 5       5      5        MD  UR                  T" U5      5        M]     [        U5      $ )Nr   r%   c                   > [        U T5      (       d  g [        U [        R                  5      (       a  U R                  5       n U R	                  5       =nc  g [        S U 5       5      $ )Nc              3  $   #    U  H  ov   M     g 7frW   r   r   s     r]   r   KSIMDScheduling._get_multikernel_shapes.<locals>.get_size.<locals>.<genexpr>R  s     )DqDs   )r  r   BaseViewunwrap_viewmaybe_get_sizer{  )r	  r  r&   s     r]   get_size8SIMDScheduling._get_multikernel_shapes.<locals>.get_sizeK  sX    c6**#r{{++oo'**,,5)D)))r_   c              3  4   >#    U  H  nT" U5      v   M     g 7frW   r   )r   _argr  s     r]   r   9SIMDScheduling._get_multikernel_shapes.<locals>.<genexpr>W  s      @CD$Cs   )r   r&   r  inputsr  r{  r   )ro   r   r  r	  r&   r  s       @@r]   _get_multikernel_shapes&SIMDScheduling._get_multikernel_shapesF  sr     	 	* $v-C#e}--

5 @C @@A

8C=)	 .
 Szr_   c                H    U R                  U5      n[        S U 5       5      $ )Nc              3  F   #    U  H  n[        S  U 5       5      v   M     g7f)c              3     #    U  HE  n[        U[        R                  5      =(       a    [        U[        R                  5      (       + v   MG     g 7frW   r  r   Exprr  r   s     r]   r   FSIMDScheduling._kernel_has_dynamic_shapes.<locals>.<genexpr>.<genexpr>_  s8      A 1ejj)N*Q2N.NNs   AAN)r8  )r   shapes     r]   r   <SIMDScheduling._kernel_has_dynamic_shapes.<locals>.<genexpr>^  s2      

  	      s   !)r  r8  )ro   r   shapess      r]   _kernel_has_dynamic_shapes)SIMDScheduling._kernel_has_dynamic_shapes\  s.    --d3 

  
 
 	
r_   c                P   ^ U R                  U5      n[        U4S jU 5       5      $ )z[
Returns cache key for hint-based multi-graph; key is tuple of shapes with hint filled in.
c              3  N   >#    U  H  n[        U4S  jU 5       5      v   M     g7f)c              3     >#    U  HG  n[        U[        R                  5      (       a!  [        U[        R                  5      (       d  TOUv   MI     g 7frW   r  )r   r   hints     r]   r   ASIMDScheduling._make_shape_cache_key.<locals>.<genexpr>.<genexpr>n  sG       A a,,Z5==5Q5Q  s   AANr  )r   r"  r*  s     r]   r   7SIMDScheduling._make_shape_cache_key.<locals>.<genexpr>m  s5      
     	    s   "%)r  r{  )ro   r   r*  r$  s     ` r]   _make_shape_cache_key$SIMDScheduling._make_shape_cache_keyf  s1     --d3 
  
 
 	
r_   r  hint_overridec          
        UR                   u  nu  pxUS:X  d   e[        UR                  [        5      (       Ga#  UR                  R                  (       Ga  [        UR                  R                  5      S:  Ga  U R                  UR                  5      (       Ga  0 n	/ n
UR                  R                  R                  5        H  u  nnU" UR                  US9u  pU(       a>  U R                  UUUUUSS9n[        U[        5      (       d   eU
R                  U5        M^  Uc  Mc  U R                  UUUUUSS9nUc  SOU R                  UR                  U5      nXU'   M     U(       a  SR                  U
5      $ [        R                  " [        U	R!                  5       5      5        [#        U	5      n/ UQUPUQnU R%                  UUR&                  5        UR)                  UR&                  5        [*        R,                  =R.                  UR.                  -  sl        [*        R,                  =R0                  UR0                  -  sl        U R3                  5         gUR                  R5                  UR                  US9u  pU(       a  U R                  UUUUUSS9$ U R                  UUUUUSS9n/ UQUPUQnU R%                  UUR&                  5        UR)                  UR&                  UR                  5        [*        R,                  =R.                  UR.                  -  sl        [*        R,                  =R0                  UR0                  -  sl        U R3                  5         g)z
Codegen a triton template with multi-kernel dispatch support

If `only_gen_src_code=True` the src code will be returned instead of being
codegenned into the wrapper
r=   )r0  Tr  NFz

)r  r  r   r   _make_kernel_rendersr  r%  rz   r  r~   r   r-  r  rC   r  r  r   rD   r  r)  rg  r<   r   r  r  r  make_kernel_render)ro   r  r  r  r  r0  r  _numelr  r  	src_codesr   r3  rm   r  rP  shape_cache_keymulti_kernelrJ  s                      r]   codegen_templateSIMDScheduling.codegen_templatew  sD     ,11F{{ }))+>??""777M&&;;<q@//0B0BCCGI
 ##88>>@"!3!&&m" %#<<%&&*.  =  H &h4444$$X. ( !::%&&*/ ; F %, !778J8JIV $
 06O,C AF !{{9--00gnn6F1GH.w7LMnMmMnMM  0H0HI$$\%=%=>GG##|'C'CC#GG&&,*I*II&**,*//BB""- C NF !44!""&* 5   66!""&+ 7  !R. Q- Q. Q$$]F4F4FG""6#5#5}7I7IJ''6+A+AA'**f.G.GG*..0r_   c                    [         R                  R                  R                  [         R                  R                  R                  5       5        g rW   )r<   r   r^  r  
device_opssynchronizers   s    r]   codegen_syncSIMDScheduling.codegen_sync  s-    	&&qww'9'9'E'E'GHr_   c           
        SSK Jn  U Vs/ s H  owR                  5       PM     nn0 0 p[        X5       Hl  u  p[	        US S9R
                  u  nu  pU R                  XU5      nU R                  UX5      nUUX4X'   UR                  U[        UX5      U(       + S9X'   Mn     UR                  UU UU	U
S9n[        R                  S[        U5      U Vs/ s H  n[        U5      PM     sn5        / nU GH)  n[        U5      S:X  a  M  U" UUS	9nU H  nU R                  X   S   UR                  X   5      5        X   nX   S   nU(       dL  [         R"                  " U5         [$        R&                  " U5       H  nUR)                  5         M     S S S 5        [         R*                  =R,                  UR,                  -  sl        [         R*                  =R.                  UR.                  -  sl        M     UR1                  5       nUR3                  UUU45        GM,     U$ s  snf s  snf ! , (       d  f       N= f)
Nr=   )ComboKernelc                4    [        U R                  5       5      $ rW   r  r  s    r]   r   ;SIMDScheduling.generate_combo_kernel_code.<locals>.<lambda>      #ann>N:Or_   r   )r4  optimize_mask)r   triton_schedulingcustom_algorithm
kernel_mapnode_info_mapz1ComboKernels: %d nodes partitioned into %s groupsr   )enable_autotunemixed_sizes)triton_combo_kernelr@  r  r  r  r  r2  r  create_triton_kernelrI   horizontal_partitionr  r  r  rB  create_sub_kernelr<   rE  rH   r  r  r   r  r  r  r   )ro   subkernel_nodescustom_part_algorithmrI  rJ  r  r@  r   fused_node_listssubkernel_mapnode_schedule_mappnr   r  rk   r  rJ  rA  
partitionspkernel_code_list
node_grouprm   	subkernelrP  s                            r]   generate_combo_kernel_code)SIMDScheduling.generate_combo_kernel_code  s6    	59HINN,I+-r(_?IB!$U0O!P!V!VA 77fMM''uEF$165$H! + @ @+M5I"-o !A !M @ !55!"2$+ 6 

 			? '(ZSVZ(	

 $J:!#  /'F
 !66%)!,,,]->? *-	 1 5a 8(--i8$6$A$A-$PD MMO %Q 9 ''9+D+DD'**i.J.JJ* ! ,,.H##Xvz$BC/ %0  e J. )& 98s   H%H*&.H//
H=c                   UR                  5       nUR                  nUR                  n[        R                  S:  =(       d    [        R                  S:H  =(       a    UnU R                  X#XE5      nU Hu  u  pxn	U R                  Xq/U5      n
U R                  UR                  U
5        [        R                  SU
5        UR                  [        R                  R                  U
5        Mw     U R                  5         g )Nr=   z"ComboKernels: generated kernel %s.)get_subkernel_nodesuse_custom_partition_algorI  r   combo_kernel_allow_mixed_sizesrZ  r  r  snodesr  r  rg  r<   r   r^  r  )ro   combo_kernel_noderO  rP  rI  rJ  rW  rP  rm   r  r)  s              r]   codegen_combo_kernel#SIMDScheduling.codegen_combo_kernel'	  s    +??A 1 K K+;;;;a? 
11Q6P;P 	  ::O
 $4Ha,,X7JFSK  !2!9!9;GII:KHqww33[A	 $4 	&&(r_       c           
       ^ ^^
 TS:H  nSU UU
4S jjnUR                  5       u  nm
[        U5      S::  a  [        T
5      S::  d  [        UT
-   5      (       a  / $ UR                  5       u  nm
U" UU(       a  UOT
UR                  U5      5      nU Vs/ s H=  n[	        T R                  UR                  UT5      UR                  UR                  S9PM?     n	nU	$ s  snf )Nr=   c                  > [        UR                  5      [        U5      :X  d   SUR                  < SU< 35       eUR                  UR                  /n[	        S [
        R                  R                  U5       5       5      (       d   e[
        R                  R                  U5       Vs/ s HF  nUR                  [        R                  R                  ;  d  M-  [        U[        5      (       d  MD  UPMH     nn[        UR                   Vs/ s H  oDR                  PM     sn5      nSS jn[        TR!                  U" U5      /U 5      SSS9/nU GH  n[        R                  R"                  R%                  UR&                  UR                  5      n	[        U	5      [        U5      :X  d   e U	R'                  S5      S-   n
U
[        U5      :X  a  M  [	        S	 XS
  5       5      (       a  M   U" US
U
 5      U" XS
 5      4n[        R                  R"                  R+                  [-        S [/        X5       5       5      5      nUR                  U;   a  US-  n[        R1                  US   5      (       a  US-  n[        R1                  US   5      (       a  US-  n[        R                  R"                  R+                  U[-        [
        R                  " UT5      5      -
  5      S:  d  GM  UR3                  [        TR!                  U" US
U
 5      U" XS
 5      /T5      UUR                  S95        GM     U$ s  snf s  snf ! [(         a     GM  f = f)z@
Compute tiling candidates by dividing up the iteration ranges.
zrw.range_vars=z ranges=c              3  N   #    U  H  n[        U[        [        45      v   M     g 7frW   )r  r!   r"   )r   r  s     r]   r   HSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>K	  s&      EC 3G 455Es   #%c                f    [         R                  R                  R                  [	        U 5      5      $ rW   r)  )r  s    r]   collapse_rangesNSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.collapse_rangesW	  s"    ww''00v1FGGr_   noner   )rA  rh   scorer=   c              3  *   #    U  H	  oS :H  v   M     g7fr  r   r   s     r]   r   rh  m	  s     ;?a6?s   Nc              3  :   #    U  H  u  pUS :w  d  M  Uv   M     g7fr  r   )r   r  r  s      r]   r   rh  |	  s      "1EST1Es   	r   rA  rm  rh   )r  r  r   r   )r  
range_varsr  r"  r  r;  r  r  rh   r<   r   r  r  r!   r   CandidateTilingcreate_partial_tilingr   stride_hintsr   
ValueErrorr   r7   r  is_good_sizer   )is_pointwiser  rwdep_sourcesr  depswrite_namesrj  tilingsrv  splittiled_groupsrm  r  r  reduction_rangess                r]   tile_ranges5SIMDScheduling.candidate_tilings.<locals>.tile_ranges@	  s#    r}}%V4S8H	&6SS4 88RYY/K $??88E     %??88EEC88177#:#::  sI. E   %"))%D)3hh)%DEKH
  44(01<  G ''**77		2==Q7|s6{222
#MM!,q0EF+ ;76?;;; ! < $F6EN3#F6N3  ((22! "14V1E" 
 88{*QJE"//Q@@QJE"//Q@@QJE GG$$..ioofFV.W XX 
 NN'#&#<#<$3F6EN$C$3F6N$C!" !0$ #(!$
Q l N[ &E: " s0   ,,MM3MM1#MM
MMrp  )rw  r   r   list[CandidateTiling])	r  r  r   "pointwise_or_reduction_read_writesrr  complete_partial_tilingrA  rm  rh   )r  r   rk   r  rw  r  pointwise_rangespartial_tilingsrA  full_tilingsr  s   `  `      @r]   candidate_tilings SIMDScheduling.candidate_tilings;	  s     '!+\	 \	| .2__->** !Q&$%*$%58H%HIII .2__->**% ,2B33LA
 *	
 * 22MM5/ ll[[ * 	 	
 	
s   ACc                    / SQ[        U5      * S nSS/S[        U5       n[        / [        X15      Q[        XB5      Q5      $ )z;
Create a tiling dict from pointwise and reduction splits.
)rQ   rR   rS   NrT   rU   )r  r   r  )r  	pw_tilingreduction_tilingpw_prefixesreduction_prefixess        r]   create_tilingSIMDScheduling.create_tiling	  sT     &s9~o&78#U^,Cc2B.CDVc+)VC0B,UV
 	
r_   c                R    U R                  U(       a  UO/ U(       d  U5      $ / 5      $ rW   )r  )r  rA  rw  s      r]   rs  $SIMDScheduling.create_partial_tiling	  s0       "F&F
 	
,.
 	
r_   c                    [        UR                  5       5      nSU;   nX#-  nU[        U5      -  /nU(       a  XG4OXt4nU R                  " U6 $ )zR
Given a tiling for only pointwise or reduction dimensions, adds the missing one.
rS   )r  r   r7   r  )	r  rA  rk   r  splitsrw  total_numelmissing_tilingtiling_argss	            r]   r  &SIMDScheduling.complete_partial_tiling	  s^     fmmo&f}-%f(==> )5V$>:R 	   +..r_   c           
     
   US:H  n[         [        [        [        R                  4      " 5       n[
        R                  " U5       GH   n[        U[        R                  5      (       d  M%  UR                  5       nU(       d  [        US   5      S:X  a  MP  Xt(       a  SOS   nU/n	UR                  R                  5        V
s/ s H7  n
[        U
[        5      (       d  M  [        U
R                  5      S:  d  M5  U
PM9     nn
U GH  n
/ U
R                  R!                  5       Qn[        R"                  R$                  n[&        R(                  R*                  nSn[-        U5       H(  u  nu  nnUU-  nUnUR/                  X5      (       d  M(    O   UR1                  X5      (       d  M  US-   nU(       a  USU OUUS n/ nU H  u  nn[2        R4                  " U
R6                  U5      n[9        SUR;                  [<        5      UR;                  [>        5      -   [        U5      5      n[2        R@                  " UUUU5      nUb  US   OU/nURC                  U5        M     U Vs/ s HN  n[&        R(                  R*                  R1                  U[        R"                  R$                  5      (       a  ML  UPMP     nn[        U5      S:  d  GM  U	RE                  U5        GM     U	 H{  n[9        S[        U5      [G        S5      -
  5      nUS-   n[I        USU 5      nU4[K        UUS 5      -   n URM                  U RO                  U RQ                  U U5      UU5      5        M}     GM#     [S        U[        SS9n!U!$ s  sn
f s  snf )z
Creates N-dimensional tiling candidates, attempting to simplify loads/stores
by tiling the kernel into higher dimensions.

Returns a list of tilings ranked by dimensionality.
r=   r   Nr   T)r   reverse)*r   r   r~   r   r   rF   r  r  r   rK  r  r  r  reads_and_writesr!   r  rz   r   r   r<   r   r   r  statically_known_geqr   r>   get_subexpr_involving_symbolr   r  r<  r   r   match_mod_div_block_exprr  r   r^   r7   r{  r   r  rs  rV  )"r  rJ  pointwise_numelr  rw  r|  r   node_rangesranges_to_tilenode_tilingsr  memory_depsall_var_rangespointwise_vars_numelr   pointwise_end_idxr  _varrk   reduction_start_idxrj   index_tilingvarr   num_dimsmatch_resultdimsdimnode_tilingnum_leading_dimsfirst_trailing_dimcollapsed_leading_dimcollapsed_splitsranked_tilingss"                                     r]   get_nd_tilingsSIMDScheduling.get_nd_tilings	  s    '!+^CO<=?#**=9DdI$;$;<< //+KCA$71$< )lBN*+L  ++<<>>Cc9- 25cjj/A2E >  
 # "73::#3#3#5!6',ww{{$77++$%!*3N*C&C$(E1((+%44,   +D  77(   '8!&;# $ ##7$78'(;(<=   "",JC/LL		3E
  #H-O0LLN+ H $7#O#OsE8$L /;.F<?UGD ''-% #-.  , +77++CCCU +    |$q( ''5w #|  ,#&q#k*:]1=M*M#N %5%9"(5kBUCU6V(W%$9#;e 2 34? $  //112BLQ''  ,i :J  
 ur s   M;M;9M;%AN 4N c                  ^^^^^^^^^^^^^ TR                   (       d  SOTR                   R                  mTR                  R                  mTR                  R                  mTR                  R
                  nT Vs/ s H  oeU   PM	     snmT Vs/ s H  oeU   PM	     snm[        R                  " [        R                  R                  R                  [        R                  S9n[        R                  " U" [!        T5      5      U" T5      :H  UUU4S j5        [        R                  " U" [!        T5      5      U" T5      :H  UUU4S j5        0 m/ n   S       SUUUUUUUUU4	S jjjn	UR#                  U	" SS9U	" SS945        T(       a  UR#                  U	" T4SSS	9U	" SS945        TTR$                  R'                  5       -  n
U
 H   nUR#                  U	" U4SS9U	" SS945        M"     [)        S
S9S
:X  a@  TS:X  a:  [*        R,                  " U
S5       H  nUR#                  U	" USS9U	" SS945        M!     / nU H^  u  u  pu  nn[/        U R1                  X5      [3        U5      [3        U5      -   S9nU R1                  UU5      nUR#                  UU45        M`     U R1                  T/T/5      nSmSm[3        TR4                  R7                  5       5      mUUU4S jn[9        UUS9 H  u  nnU R;                  TTTUR<                  5      (       d  UR<                  U:X  a  [?        UR<                  5      TS:X  a  SOS-
  nU[)        S
S9:  aE  [@        RC                  SU[        RD                  R                  RF                  RH                  5        M  UR<                  U4s  $ UR<                  U:X  d  M  UR<                  U4s  $    US4$ s  snf s  snf )zb
Generates a tiling, and a score of each tile according to each tile's coalesced memory accesses.
Nr   c                    > T ST ST  3$ Nr   r   )rJ  r  	pw_rangess   r]   r   8SIMDScheduling.compute_tiling_strategy.<locals>.<lambda>
  s    ykO#4B}oFr_   c                    > T ST ST  3$ r  r   )rJ  
red_rangesr  s   r]   r   r  
  s    zl"_$5RGr_   Fc                  >	 U(       a  TOTnU(       a  TOTnU(       d  U(       a  U// 4$ / / 4$ [        U 5      X4nTR                  U5      =n(       a  U$ U(       a  TOTn/ n/ n	Sn
Sn[        Xs5       GH  u  pX;  a"  X-  n
TR                  R                  US5      nM-  U(       a  UT:X  a  TR                  nUc   eUR
                  n[        XR
                  5      nUR                  U
U-  5        U	R                  UR                  5        UR                  U5        U	R                  TR                  R                  US5      5        Sn
SnM  X-  n
UR                  U
5        U	R                  TR                  R                  US5      5        Sn
GM"     U
S:w  d  U(       a1  [        U5      S:X  a"  UR                  U
5        U	R                  U5        [        [        U5      5       HQ  n[        R                  R                  R                  UU   SS9n[        US5      n[!        U	U   U-  S-  5      U	U'   MS     X4TU'   X4$ )zE
Generate a tiling, and a tiling score, given vars to use as splits.
r=   r   rd  r   r[  )r  r   r  coalesced_by_varsuggested_splittiling_factorr   r   rm  r  r  r<   r   r   r   rb  r   )vars_to_useuse_split_varrw  r  target_numelr   r  splitting_varsr  split_scoresprodprev_var_coalesced_scorers  v_range
var_tilingtile	remainderrX  r   all_iter_varsall_red_varsr  r  r  r  r  scored_sub_split
tiling_vars                      r]   process_node_varsASIMDScheduling.compute_tiling_strategy.<locals>.process_node_vars
  s<    #/YJF.:?L)NB//8O$mBC&**3//s/
.:]NFLD'($ ".9
'OD/@/Q/Q/U/U10,  Q*_!2!B!BJ%111%33D (2J2J KIMM$"23 ''
(8(89MM$' ''(9(J(J(N(NqRS(TUD/0,d###$5$F$F$J$J1a$PQ; :> qy\c&kQ.>d###$<= 3v;'GG$$..vay2.F1I"%l1o&9A&=">Q (
 &,$:S!))r_   T)rw  )r  rw  r   r\   r=   r   )rm  gffffff?gGz?c                   > SnU S   R                   R                  5        H)  n[        R                  U5      (       d  UT-  nM$  UT-  nM+     TS-  nU S   R                  U-   * U-  $ )Ng      ?r   g?)rA  r   rr  rv  rm  )rR  score_factor	tile_sizeuncoalesced_penalty"bad_size_additional_tiling_penaltygood_size_tiling_penaltytotal_uncoalesceds       r]   	score_mod9SIMDScheduling.compute_tiling_strategy.<locals>.score_mod  ss    LqT[[//1	&33I>>#/2T#TL#/2J#JL	 2 #4d":qTZZ"556EEr_   r   r   zmFound optimal tiling with %s tiles but torch._inductor.config.triton.max_tiles set to %s. Consider increasing)r   FF)r  ztuple[sympy.Expr, ...]r  r   rw  r   r   ztuple[list[int], list[int]])%r  r  norm_read_writesr   reduce_varsrj   r   partialr<   r   r   r   r   r   rX   _checkr7   r   r  r  r^   r;  combinationsrr  r  re  uncoalesced_addrsr   rV  tiling_is_compatiblerA  r  perf_hint_loginforY   rZ   r[   ) r  rJ  r  r  r  r  rs  get_hintscore_splitr  overlapping_iter_varsr  r|  pw_splitpw_score	red_split	red_score	candidater  default_tilingr  cand
tiling_lenr  r  r  r  r  r  r  r  r  s     ````                  @@@@@@@@@r]   compute_tiling_strategy&SIMDScheduling.compute_tiling_strategye
  s    %44 "2266 	 *::EE(99EE"33>>(561AY6	)56AQi6
 $$GG&&1P1P
 	]9-.(?2KKF	

 	]:./8O3LLG	
 DF  	
 35"'!&K	*/K	*K	* K	* )	K	* K	* K	*\ 	!t4!u5	
 %#T &59	 ->>CCEE 	 'A%qd>%59 ' #q(_-A(556KQO"")+DI)u=  P RT<G8 X"89i'!!(6(mc)n4I ,,XyALNNI|45 =H **O+<>OP .3*#(  1 C C J J LM	F #)i"@D,((!?OT[[  ;;.0 !-o6JPQR
a 88!&&9"..55??	 {{L00 {{n,{{L00/ #A2 t##O 76s   =OOc                `   ^^ [        T[        5      (       d   e[        UU4S jU 5       5      $ )Nc              3     >#    U  HW  n[        U[        R                  5      (       d  M$  [        R	                  TR                  5       UR                  5       TS 9v   MY     g7fr  )r  r   rK  r   r   r   r  )r   r   r  rA  s     r]   r   6SIMDScheduling.tiling_is_compatible.<locals>.<genexpr>L  sR      
 &$	 7 78	J$$!2O %  &s
   #A"8A")r  r  r  )r  rJ  rk   r  rA  s      ``r]   r  #SIMDScheduling.tiling_is_compatibleC  s4     &$'''' 
 &	
 
 	
r_   c                L    U H  nU R                  XX55      (       d  M  Us  $    g rW   )r  )r  rJ  rk   r  r  rA  s         r]   get_first_compatible_tiling*SIMDScheduling.get_first_compatible_tilingT  s+     %F''oVV % r_   c                ,    U R                  XX45      S   $ r  )r  )r  rJ  rk   r  r  s        r]   r  SIMDScheduling.select_tilingb  s$     ((/

 	r_   c                   US:H  nU R                  U/U/5      n[        R                  " U5       H  n[        UR                  [
        R                  5      (       d  M.  UR                  R                  5       S:X  d  MN  [        R                  R                  (       d  Mo  UR                  5       nUS   n	US   n
U R                  X5      nUS4s  $    [        R                  R                  R                  R                  (       a8  U(       a1  [        R                  R                  (       d  U R!                  XX45      $ U(       d  [        R                  R"                  (       a  [%        SS9S::  a  [&        R(                  [*        R,                  ::  a  [        R                  " U5       Hq  n[        R                  R"                  (       a  M$  [/        U R1                  XrU5      5      S:  d  ME  [&        R3                  [4        R6                  " S5      5          US4$    US4$ [9        5       n[:        R<                  " 5       n[        R                  " U5       Hl  nU R1                  XrU5       HS  nUR>                  U;   a  M  UR>                  b  URA                  UR>                  5        X==   URB                  -  ss'   MU     Mn     URE                  5        VVs/ s H  u  pURF                  PM     nnn[%        SS9S:  aH  U(       aA        SS	 jn[I        S[/        U5      5       H  nU" US   UU   5      nUc  M  U/U-   n  O   [/        U5      S:  a  [&        R3                  S
U5        [        R                  R                  (       a  U RK                  XU5      U-   nU RM                  XUU5      =n(       a  US4$ US4$ s  snnf )z
Heuristics to decide how to tile kernels.
Currently, we tile based on stride-1 dimensions.

Returns:
    `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

r=   r+  r   Nr   r  z
                                Reduction over non-contiguous dims.
                                Consider setting config.triton.tile_reductions to True.
                                r   c                :   U S   U R                  SS5      p2US   UR                  SS5      pT[        X5/5      (       d/  [        R                  R                  R                  X5-
  5      S:X  a  g [        R                  R                  R                  X5-
  5      S:  a  XE4X#4su  p#u  pE[        R                  R                  R                  X5-
  5      S:  d   e[        R                  R                  R                  X55      (       d  g U[        X55      UU S   S.nU$ )NrS   rR   r=   r   rT   )rQ   rR   rS   rT   )r   r   r<   r   r   r   r  r   )tiling0r  a0a1b0b1
new_tilings          r]   convert_tiling_to_3dBSIMDScheduling.get_tiling_and_scores.<locals>.convert_tiling_to_3d  s     !w{{3':B w{{3':B *2(33ww''11"':a?77##--bg6:*,B8&HRhrww''11"':Q>>>ww''DDRLL !")"5>	
 "!r_   zpossibly bad tiling: %s)r  r  r  r  r   r  )'r  rF   r  r  r   r   rL  rM  r   rZ   rI  r  rX   rY   r  prefer_nd_tilingr  tile_reductionsr^   r  levelloggingWARNINGr  r  r  textwrapdedentr   collectionsr   rh   r   rm  most_commonrA  r  r  r  )r  rJ  rk   r  r  rw  r  r   r  	range_y_xrange_rrA  
seen_namescandidate_tilescandidate_tilingrm  r  r  rX  new_3d_tilings                       r]   r  $SIMDScheduling.get_tiling_and_scoresn  sl   " '!+ **E7_4EF $**=9D$))R%6%677II002e;333 #'//"3K +AI)!nG ..yBF!4<' :  OO""))BB!MM22..o  V]]%B%B}H
H ""goo5+22=AD"MM999 5 5d? STWXX%**$OO!$ !4'' B "4''&0l
4?4G4G4I#**=9D$'$9$9$$W #((J6%**6NN#3#8#8915E5K5KK1 %X : ,;+F+F+H7
+H'  ##+H 	 7

 #q(\"."9N"0"8 1c.12 4"1%~a'8! !,&3_~%EN 3 ~"8.I ==))""=I ! 
 44/>
 
6 
 4<t##A7
s   Oc                    g rW   r   rs   s    r]   flushSIMDScheduling.flush   r  r_   c                    grw  r   rs   s    r]   ready_to_flushSIMDScheduling.ready_to_flush  ry  r_   c           	        [        S U 5       5      (       d  [        US S9R                  u  nu  pVU R                  XU5      nU R	                  XuU5      nU R                  U[        XuU5      S9n	U R                  Xy5        [        R                  " SU5         [        R                  " U	5         U	R                  5       n
S S S 5        S S S 5        OJUS   R                  U5      u  pn[        R                  " SU5         U R                  UUUSUS9n
S S S 5        W
R                  [!        ["        R$                  5      S	5      n
U
$ ! , (       d  f       N= f! , (       d  f       NJ= f! , (       d  f       N[= f)
Nc              3  @   #    U  H  oR                  5       v   M     g 7frW   )r  )r   r   s     r]   r   ASIMDScheduling.generate_kernel_code_from_nodes.<locals>.<genexpr>	  s     2Eq==??Er  c                4    [        U R                  5       5      $ rW   r  r  s    r]   r   @SIMDScheduling.generate_kernel_code_from_nodes.<locals>.<lambda>
  rC  r_   r   )r4  r?  r   Tr/  r@  )r8  r  r  r2  r  r  rI   rB  r   rG  r<   rE  r  get_prologue_template_epiloguer8  rH  r~   r4   rI  )ro   r   r?  r0  r  rk   r  rJ  rA  rm   rP  r  templateepilogues                 r]   generate_kernel_code_from_nodes.SIMDScheduling.generate_kernel_code_from_nodes  sZ    2E222!$U0O!P!V!VA 77fMM''fEF%%+M&I & F 22=I/1AB$$V,!002 - CB
 ,18+R+R,(H 02BC00&*"/ 1  D ##C(?(?$@)L% -, CB DCs0   E/E E?E(
E	E
E%(
E6c                    [         erW   ri  )ro   rP  rJ  rm   s       r]   r  SIMDScheduling.define_kernel(  rn  r_   r   )r[  N)rV  zOptional[OrderedSet[str]]r   ztuple[float, str]rW   )r   z!Sequence[scheduler.SchedulerNode]r  Optional[CoalesceVarAnalysis])r   z<Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode])rk   r   r  zGIterable[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject, ir.IRNode]]r   r   )rJ  rI   )rJ  rI   r   zlist[SIMDKernel])r   r   r   tuple[tuple[int, ...], ...])r   r   r   r   )r   r   r*  r   r   r)  )r0  r   r   Optional[str])F)rO  zlist[BaseSchedulerNode]rP  r   rI  r   rJ  r   r  r   r   zlist[tuple[str, Any, Any]])r   r  )r  r  r  r  r   immutable_dict[str, sympy.Expr])rA  r  rw  r   r   r+  )rA  r  rk   r   r  r   r   r+  )r   z%list[immutable_dict[str, sympy.Expr]])
rJ  list[NodeScheduleEntry]r  r   r  r   r  rM   r   =tuple[dict[str, sympy.Expr], Optional[dict[str, sympy.Expr]]])rJ  r,  rk   r   r  r   rA  r  )rJ  r,  rk   r   r  r   r  zlist[dict[str, sympy.Expr]])r  r(  r   r  )r  r(  r   r-  r   )FN)r0  r   )6r   r   r   r   r   r   r  r$  r  r  can_fuse_verticalr  r2  r7  r<  rQ  rW  r6  r  r  r  r  r  rA  rB  r  r  r%  r-  r8  r=  rZ  rb  r  r   r   r  r  rs  r  r  r  r  r  r   r   r   r  r  r  r  r$  r  r   r   r_   r]   r  r    s   
 (K'Q`6D !"^@8
%2)j RV"5N"	"
\)B <@
0
 9
(=P=2 $$
$
 
$ $LQ)f
1
	
 -T  B'	$,

'
/2
	$
.  '+m %m 
m^I #(< 0<   $<  	< 
 <   <  
$< |)( }  }~ 

,

@T

	(

 

 
$
 
 
)	
 
 /%/ / $	/
 
)/ /( y
 
/y yv [$.[$ $[$ $	[$
 /[$ 
G[$ [$z 
.
 
 $	

 &
 
  .  $	
 4  
 ;?	
 9	 
	 	 
 ;?O$
 9O$ 
GO$ O$b MQ <I D"r_   r  T)frozenc                  H    \ rS rSr% S\S'   S\S'   SrS\S'   \S	 5       rS
rg)rr  i,  r  rA  r   rm  Nr*  rh   c                z    [         R                  R                  R                  U SS9n U S:  =(       a    U S-  S:H  $ )z@Somewhat arbitrary heuristic used to boost scores for some sizesi    r   rd  r   r  )r   s    r]   rv  CandidateTiling.is_good_size2  s:     GG&&q4&8Bw(AFaK(r_   r   )	r   r   r   r   r$  rh   r  rv  r   r   r_   r]   rr  rr  ,  s)    !!JD-) )r_   rr  c                  .   ^  \ rS rSrU 4S jrS rSrU =r$ )r  i9  c                :   > [         TU ]  5         Xl        X l        g rW   )rf   rg   r   r  )ro   r   r  rp   s      r]   rg   CantSplit.__init__:  s    	"r_   c                8    U R                    SU R                   3$ )Nz not divisible by r   r  rs   s    r]   __str__CantSplit.__str__?  s    )).t~~.>??r_   r7  )r   r   r   r   rg   r8  r   r   r   s   @r]   r  r  9  s    #
@ @r_   r  )r   )r\   r   r   r   )r  r  r   r~   )
__future__r   r  r  dataclassesr   r;  r	  r  r	  r  r   typingr   r   r   r   r	   typing_extensionsr
   r   rX   torch._loggingtorch._inductorr   torch._inductor.irr   torch._inductor.tiling_utilsr   %torch.fx.experimental.symbolic_shapesr   torch.fx.immutable_collectionsr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   r   _dynamo.utilsr    r   r   r   analyze_preserves_zero_maskr   	codecacher   r    dependenciesr!   r"   r#   collections.abcr$   r&   optimize_indexingr'    runtime.coordinate_descent_tunerr(   runtime.hintsr)   runtime.runtime_utilsr*   r+   r,   r-   r.   r/   utilsr0   r1   r2   r3   r4   r5   r6   r7   r8   r9   virtualizedr:   r;   r<   block_analysisr>   commonr?   r@   rA   rB   r7  rC   rD   simd_kernel_featuresrE   rF   rG   rH   rI   rJ   rK   rL   rM   	getLoggerr   r  _logginggetArtifactLoggerr  r  
fusion_logdoprintr  r  r^   	dataclassra   r   r   r  r  r!  r   r  rr  	Exceptionr  r   r_   r]   <module>r]     s   "           ? ? %    # 2 B G 9 / L L  & $ $ F . 6 6 ( A < , L L D D   - , / P P :  <<@ !00<H~~//*E^^--hA
 	78;
 3+ 3+ 3+lO;/ O;d;'? ;'| +;T   t('/*B tnt"^ t"n9 d#	) 	) $	)@	 @r_   