
    ȅi'o                   	   S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKJr  S SKJrJr  S SKJrJrJrJr  S SKrS SKJr  S SKrS SKrS SKJs  Jr  S SKJ r!  S SK"J#r#J$r$J%r%  S S	K&J'r'  S S
K(J)r)  S SK*J+r+  S SK,J-r-J.r.  S SK/J0r0  S SK1J2r2J3r3J4r4J5r5J6r6  S SK7J8r8  S SK9J:r:  S SK;J<r<  S SK=J>r>J?r?  SSK@JArAJBrBJCrC  SSKDJErE  SSKCJFrFJGrG  SSKHJIrI  SSKJJKrK  SSKJLrLJMrMJNrNJOrOJPrPJQrQJRrRJSrSJTrTJUrUJVrVJWrW  SSKXJYrY  SSKZJ[r[J\r\J]r]J^r^J_r_J`r`  SSKaJbrb  SSKcJdrd  SSKeJfrfJgrgJhrh  \(       a$  S S KJiriJjrj  S SKkrkSS!KlJmrm  SS"KCJnrn  SS#KoJprp  SS$KqJrrr  \R                  " \t5      ru\^" 5       R                  rw\x\R                  \R@                  \z\{4   r|\\CR                  \_4   r~\S%/S4   rSkS& jrSlS' jrS( r\\z\4   r\\x\\\R2                  4   S)4   \\/\x\S)4   4   4   r  Sm           SnS* jjrSoS+ jr\GR                   " S, S-5      5       r " S. S/5      r " S0 S%5      r\GR                   " S1 S2\5      5       r\GR                   " S3 S4\5      5       r\GR                   " S5 S6\5      5       r\GR                   " S7 S8\5      5       r\GR                   " S9 S:\5      5       r\GR                   " S; S<\5      5       r " S= S>\5      r\GR                   " S? S@\5      5       r\GR                   " SA SB\5      5       r\GR                   " SC SD\5      5       r\GR                   " SE SF\5      5       r\GR                   " SG SH\5      5       r\GR                   " SI SJ\5      5       r " SK SL5      r\GR                   " SM SN\5      5       r\GR                   " SO SP\5      5       r\GR                   " SQ SR\5      5       r\GR                   " SS ST\5      5       r " SU SV\5      r\GR                   " SW SX\5      5       r\GR                   " SY SZ\5      5       r\GR                   " S[ S\\5      5       r\GR                   " S] S^\5      5       r\GR                   " S_ S`\5      5       r\GR                   " Sa Sb\5      5       r\GR                   " Sc Sd\5      5       r\GR                   " Se Sf\5      5       r\zr\\\S4   r " Sg Sh\\5      r " Si Sj\5      rg)p    )annotationsN)Callable)chaincount)AnyOptionalTYPE_CHECKINGUnion)Expr)dtype)countersdynamo_timedget_debug_dir)DebugPrinterManager)MultiKernelState)	cache_dir)get_opaque_obj_repris_opaque_value_type)trace_structured)CallMethodKeyConvertIntKeyDivideByKeyresolve_unbacked_bindingsSymTypes)_get_qualified_name)
OrderedSet)SingletonInt)symbol_is_typeSymT   )async_compileconfigir)output_code_log)IRNodeReinterpretView)triton_heuristics)DeviceProperties)cache_on_selfDelayReplaceLineget_benchmark_nameget_dtype_sizeIndentedBuffer#is_codegen_graph_partition_subgraphis_using_cudagraph_partitionLineContextsympy_product	sympy_str
sympy_substriton_version_uses_attrs_dict)V   )ArgNameCodeGenDeferredLinePythonPrinterWorkspaceArgWorkspaceZeroMode)cexpr)CUSTOM_EXTERN_KERNEL_CODEGEN)	config_ofshould_unwrap_unspec_argsignature_to_meta)IteratorSequence)GraphLowering)ExternKernel)BaseSchedulerNode)FxConverterWrapperLinec                :   [         R                  R                  U 5      nU R                  5       [         R                  R                  ;  nU R                  5       U R                  5       [        [         R                  R                  R                  U5      5      U4$ N)
r5   graphget_allocation_storage_sizeget_nameunaligned_buffersget_device_or_error	get_dtyper2   sizevarssimplify)nodestorage_size	alignments      Y/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/codegen/wrapper.pybuffer_reuse_keyrW   c   sr    7766t<Lqww'@'@@I  " 	!''""++L9:     c                   U R                  5       UR                  5       :w  a  gU R                  5       UR                  5       :w  a  g[        R                  R                  R                  [        R                  R                  U 5      5      n[        R                  R                  R                  [        R                  R                  U5      5      n[        U5      [        U5      :X  d`  [        R                  R                  R                  USU-  5      (       a/  [        R                  R                  R                  X25      (       a  gg)NFgffffff?T)
rO   rP   r5   rK   rQ   rR   rL   r2   statically_known_geqstatically_known_leq)	input_buf
output_buf
input_sizeoutput_sizes       rV   can_match_buffer_sizer`   q   s     $$&**H*H*JJ
 4 4 66!!**	++I6J ''""++	++J7K 	*;!77 	
--k4*;LMMGG11+JJrX   c                H   [        U [        R                  5      (       a?  U R                  5       nUR                  UR
                  UR                  UR                  S4$ / nU n[        U[        R                  [        R                  [        R                  45      (       aq  UR                  5       nUc  gUR                  U5        UR                  n[        U[        R                  [        R                  [        R                  45      (       a  Mq  [        U[        R                  5      (       d  gU H-  nUR                  UR                  5       R                  :w  d  M-    g   UR                  5       nUR                  UR
                  UR                  UR                  S4$ )z
Collapse a chain of ReinterpretView <- StorageBox
<- ReinterpretView <- StorageBox.... <- buffer wrappers if every layer
has the same offset as the innermost (base) buffer.

Returns:
    (size, stride, offset, dtype, collapsible: bool)
T)NNNNF)
isinstancer#   Buffer
get_layoutsizestrideoffsetr   	TensorBox
StorageBoxr&   appenddata)rk   laylayoutscurbase_lays        rV   codegen_reinterpret_view_helperrp      s(    $		""ooxxSZZD@@G
C
S2<<8J8JK
L
Lnn;0shh S2<<8J8JK
L
L c299%%, ::)0000  ~~H==(//8??HNNDPPrX   .c                R  ^ ^^^ [        5       mSS jm S   SUU4S jjjnSSU UU4S jjjnST  3nU" SU S35        T(       a9  [        R                  R                  (       a  TR                  R                  5       O[        R                  " 5       nTR                  5          U   [        R                  R                  (       ab  U(       a[  [        R                  R                  (       a<  U[        R                  R                  ;   a  [        R                  R                  U   n	OS /[        U5      -  n	[        U5      S:X  a!  U" US   U	S   5      u  pU" S	U
 3S	U 35        O[        U5      S:  d   e[        U5      [        U5      :X  d   e[        5       n[        [        X!U	5      S
 SS9 H  u  pn/ nUR                  (       a?  UR                   H/  nUS;  d  M  UR!                  SU SUR                  U    35        M1     U(       a  SR#                  U5      nOSnU" X5      u  pSU SU
 3nUU;   a  M  UR%                  U5        U" USU SU 35        M     S S S 5        S S S 5        UTR'                  5       4$ ! , (       d  f       N(= f! , (       d  f       N1= f)Nc                p    [        U [        R                  5      (       a  U $ [        R                  " U 5      $ rJ   )rb   sympyr   Integer)items    rV   _convert_to_sympy_expr@user_defined_kernel_grid_fn_code.<locals>._convert_to_sympy_expr   s&    !$

33tLt9LLrX   c                  > Tb  [        U 5      (       a  X 4$ [        U4S jU  5       5      nU(       d  UnTR                  U5      [        R                  R
                  (       a%  TR                  [        U4S jU 5       5      5      4$ S4$ )z
This function return a tuple of two values: the first one is for the real grid
which is used in the generated code; the second one is an example grid with
concreate values which is used in the autotune block to run the generated
kernels at compile time.
Nc              3  4   >#    U  H  nT" U5      v   M     g 7frJ    ).0grv   s     rV   	<genexpr>Kuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s     Cd1!44ds   c              3  Z   >#    U  H   nTR                  U[        U5      5      v   M"     g 7frJ   generate_example_arg_valuetype)r{   r|   wrappers     rV   r}   r~      s,      !-A  ::1d1gFF!-   (+)callabletuplecodegen_python_shape_tupler"   tritonautotune_at_compile_time)gridexample_grid
sympy_gridrv   r   s      rV   determine_grid8user_defined_kernel_grid_fn_code.<locals>.determine_grid   s     ?htnn:CdCC
%L..z: ==99 22 !- 
 	
 
 	
rX   c                   > TR                  U 5        T(       aV  [        R                  R                  (       a6  TTR                  ;  a%  TR
                  R                  U=(       d    U 5        g g g g rJ   )	writeliner"   r   r   kernel_autotune_nameskernel_autotune_calls)liner   nameoutputr   s     rV   r   3user_defined_kernel_grid_fn_code.<locals>.writeline   sW    66G999))33L4HDI : 7 rX   grid_wrapper_for_def z(meta):r6   r   zreturn c                2    [        U S   R                  5      $ Nr6   lenkwargsxs    rV   <lambda>2user_defined_kernel_grid_fn_code.<locals>.<lambda>  s    c!A$++.rX   Tkeyreverse)matrix_instr_nonkdimwaves_per_eukpackzmeta['z'] == z and Trueif z	: return )ru   Union[int, sympy.Expr]return
sympy.ExprrJ   )r   
TritonGridr   zOptional[TritonGrid])r   strr   Optional[str])r-   r"   r   r   r   indent
contextlibnullcontextr5   rK   autotuning_gridsr   r   sortedzipr   rj   joinaddgetvalue)r   configsgridsr   original_fxnode_namer   r   fn_namekernel_autotune_calls_indentexample_gridsr   r   seenc
guardslistkwargguards	statementrv   r   s   `  `              @@rV    user_defined_kernel_grid_fn_coder      sb    FM
 .2

*
 
>J J "$(GWIW%& v}}== 	%%,,.##% !
 
6MM22$(($(@(@@GG445IJM!FSZ/Mu:?!/a-:J!KDv&',(@Au:>!>u:W---$.LD *0EM2.*%
  
88!"  ) 
 '--ugVAHHUOCT.UV "* $\\*5F#F%3D%G"!&4&9	$#)s6()L>%JK1*- 7` FOO%%%a 76s,   %J(D8J$BJ%J
J	J
J&c                   ^^^^^^ [        5       mTR                  U R                  SS9  SSKmSSKJm  SSKJm  [        U R                  /5      mUUUUUU4S jmT" U 5        TR                  5       $ )z[
Given a triton kernel function pointer collect the transitive closure of
its dependencies
Tstripr   N)JITFunction)	constexprc           	     \  > [        S [        R                  " U R                  5       5       5      nU R                  R                  R                  S0 5      nU R                  R                  R                   GH  nUT;   a  M  X0R                  R                  ;   d  M'  U R                  R                  U   n[        UT5      (       aV  T	R                  5         T	R                  S5        T	R                  UR                  SS9  TR                  U5        T" U5        M  [        TS5      (       a  [        UTR                  R                   R"                  5      (       aW  T	R                  5         T	R                  S5        T	R                  UR                  SS9  TR                  U5        T" U5        GM>  [        U[$        [&        [(        T
45      (       a  T	R                  5         [        UT
5      (       a  SUR*                  < S	3nOU< nUR                  U5      =n(       aQ  [        U[,        5      (       a  S
UR.                   SUR0                   3nOS
U< 3nT	R                  U U SU 35        OT	R                  U SU 35        TR                  U5        GM)  X1;   d  GM1  US:w  d  GM:  [        US5      (       d  GMN  UR.                  R3                  S5      (       d  GMq  T	R                  SUR.                   SUR0                   SU 35        TR                  U5        GM     g )Nc              3  ^   #    U  H#  nUR                   S :X  d  M  UR                  v   M%     g7f)LOAD_GLOBALN)opnameargval)r{   insts     rV   r}   ^user_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse.<locals>.<genexpr><  s)      '
3{{m+ DKK3s   --__annotations__z@triton.jitTr   constexpr_functionz@triton.constexpr_functionztl.constexpr(): . = tl
__module__r   zfrom z import z as )r   disBytecodefn__globals__get__code__co_namesrb   newliner   splicesrcr   hasattrruntimejitConstexprFunctionintr   boolvaluer   r   __name__
startswith)
cur_kernelunqualified_loadsglobal_annotationssymbol_namesymbol
symbol_str
annotationannotation_coder   compile_wrapperr   symbols_includedtraverser   s           rV   r   Kuser_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse7  s   
 ' '
Z]]3'
 

 (]]66::;LbQ%==11::K..mm777#22;?fk22#++-#--m<#**6::T*B$((5V$V%9::zNN&&88@ @ $++-#--.JK#**6::T*B$((5V$c4(CDD#++-!&)44'4V\\4DA%F
(.z
%7%;%;K%HHzH%j$77"$Z%:%:$;1Z=P=P<Q R , 13:..AO'11*mO+<C
|L (11[MZL2QR$((54#t+55 ))44X>>
 $-- 1 12(6??:K4P[}] %((5o ;rX   )
r-   r   r   r   r   triton.languager   r   r   r   )kernelr   r   r   r   r   r   s    @@@@@@rV   9user_defined_triton_kernel_transitive_closure_source_coder   '  si    
 %&O6::T2 ") "6??"34B6 B6H V##%%rX   c                  0    \ rS rSr% S\S'   S\S'   S rSrg)	SymbolicCallArgi  sympy.Symbolinnerr   
inner_exprc                ,    [        U R                  5      $ rJ   )r   r   selfs    rV   __str__SymbolicCallArg.__str__  s    4::rX   rz   N)r   r   __qualname____firstlineno__r   r  __static_attributes__rz   rX   rV   r   r     s    rX   r   c                  F   ^  \ rS rSrU 4S jrSS jrSS jrS	S jrSrU =r	$ )
MemoryPlanningStatei  c                n   > [         TU ]  5         [        R                  " [        5      U l        SU l        g Nr   )super__init__collectionsdefaultdictlist
reuse_pooltotal_allocated_buffer_size)r  	__class__s    rV   r  MemoryPlanningState.__init__  s-    ##D) 	 12(rX   c                L    [        U R                  R                  US 5      5      $ rJ   )r   r  r   )r  r   s     rV   __contains__ MemoryPlanningState.__contains__  s    DOO''T233rX   c                f    U R                   U   R                  5       nUR                  (       a   eU$ rJ   )r  pop	is_reusedr  r   ru   s      rV   r  MemoryPlanningState.pop  s+    s#'')>>!!rX   c                f    UR                   (       a   eU R                  U   R                  U5        g rJ   )r  r  rj   r  s      rV   pushMemoryPlanningState.push  s&    >>!!##D)rX   )r  r  )r   ReuseKeyr   r   )r   r!  r   FreeIfNotReusedLine)r   r!  ru   r"  r   None)
r   r   r  r  r  r  r  r  r  __classcell__r  s   @rV   r
  r
    s    24
* *rX   r
  c                      \ rS rSrSS jrSrg)rH   i  c                0    [        S[        U 5       35      e)Nz&FX codegen not yet supported for type )NotImplementedErrorr   r  	converters     rV   
codegen_fxWrapperLine.codegen_fx  s    !$J4PT:,"WXXrX   rz   Nr*  rG   r   FxConversionFuncr   r   r  r  r+  r  rz   rX   rV   rH   rH     s    YrX   c                  H    \ rS rSr% S\S'   S\S'   SS jrSS jrSS jrS	rg
)EnterSubgraphLinei  PythonWrapperCodegenr   rD   rK   c                b    U R                   R                  U R                   R                  5        g rJ   )r   push_computed_sizescomputed_sizesr  s    rV   __post_init__EnterSubgraphLine.__post_init__  s    (()D)DErX   c                n    U R                   R                  U R                  5        UR                  5         g rJ   )r   push_codegened_graphrK   	do_indentr  codes     rV   codegenEnterSubgraphLine.codegen  s"    ))$**5rX   c                    UR                   $ rJ   )_generate_enter_subgraphr)  s     rV   r+  EnterSubgraphLine.codegen_fx  s    111rX   rz   Nr   r#  r<  r-   r   r#  r-  	r   r   r  r  r   r6  r=  r+  r  rz   rX   rV   r1  r1    s    !!F2rX   r1  c                  H    \ rS rSr% S\S'   S\S'   S
S jr\SS j5       rSrg	)ConditionalLinei  r2  r   zir.ConditionalrS   c                    [        S5      e)NzOnly supports FX codegen)r(  r;  s     rV   r=  ConditionalLine.codegen  s    !"<==rX   c                    U R                   $ rJ   )_generate_conditionalr*  s    rV   r+  ConditionalLine.codegen_fx  s    ...rX   rz   NrC  r-  	r   r   r  r  r   r=  staticmethodr+  r  rz   rX   rV   rF  rF    s'    !!
> / /rX   rF  c                  >    \ rS rSr% S\S'   SS jr\S	S j5       rSrg)
CommentLinei  r0   r   c                :    UR                  U R                  5        g rJ   )r   r   r;  s     rV   r=  CommentLine.codegen  s    tyy!rX   c                    U R                   $ rJ   )_generate_commentrK  s    rV   r+  CommentLine.codegen_fx  s    ***rX   rz   NrC  r-  rM  rz   rX   rV   rP  rP    s!    
" + +rX   rP  c                  H    \ rS rSr% S\S'   S\S'   S
S jr\SS j5       rSrg	)DynamicScalarLinei  r2  r   zir.DynamicScalarrS   c                N    U R                   R                  U R                  5        g rJ   )r   _codegen_dynamic_scalarrS   r;  s     rV   r=  DynamicScalarLine.codegen  s    ,,TYY7rX   c                    U R                   $ rJ   )_generate_dynamic_scalarrK  s    rV   r+  DynamicScalarLine.codegen_fx  s    111rX   rz   NrC  r-  rM  rz   rX   rV   rW  rW    s'    !!
8 2 2rX   rW  c                  >    \ rS rSr% S\S'   S	S jrS
S jrSS jrSrg)ExitSubgraphLinei  r2  r   c                V    U R                   R                  5       U R                   l        g rJ   )r   pop_computed_sizesr5  r  s    rV   r6  ExitSubgraphLine.__post_init__  s    &*ll&E&E&G#rX   c                X    U R                   R                  5         UR                  5         g rJ   )r   pop_codegened_graphdo_unindentr;  s     rV   r=  ExitSubgraphLine.codegen  s    ((*rX   c                    UR                   $ rJ   )_generate_exit_subgraphr)  s     rV   r+  ExitSubgraphLine.codegen_fx  s    000rX   rz   NrB  rC  r-  rD  rz   rX   rV   r_  r_    s    !!H1rX   r_  c                  >    \ rS rSr% S\S'   S\S'   S
S jrSS jrSrg	)EnterDeviceContextManagerLinei  r   
device_idxOptional[int]last_seen_device_guard_indexc                   [         R                  R                  (       Ga  UR                  S5        [         R                  R                  (       aj  U R
                  c;  UR                  [         R                  R                  R                  5        S35        g U R
                  U R                  :X  d   S5       eg U R
                  cH  UR                  [         R                  R                  R                  5        SU R                   S35        g UR                  SU R                   S35        g UR                  S[         R                  R                  R                  U R                  5       S35        UR                  5         UR                  [         R                  R                  R                  U R                  5      5        g )	N
z) stream_guard(stream, this->device_idx_);z4AOTInductor only supports running on one CUDA devicez device_guard(z);zdevice_guard.set_index(with :)r5   rK   cpp_wrapperr   aot_modern  
device_opscpp_aoti_stream_guardrl  cpp_aoti_device_guarddevice_guardr:  
set_devicer;  s     rV   r=  %EnterDeviceContextManagerLine.codegen  sM   77NN4 ww 44<NN77--CCEFFop  <<O NO 44<NN77--CCEFnUYUdUdTeegh NN%<T__<MR#PQ NNU177#5#5#B#B4??#S"TTUVWNNNN177--88IJrX   c                    UR                   $ rJ   )&_generate_enter_device_context_managerr)  s     rV   r+  (EnterDeviceContextManagerLine.codegen_fx
  s    ???rX   rz   NrC  r-  r   r   r  r  r   r=  r+  r  rz   rX   rV   rk  rk    s    O"//K:@rX   rk  c                  (    \ rS rSrSS jrSS jrSrg)ExitDeviceContextManagerLinei  c                d    [         R                  R                  (       d  UR                  5         g g rJ   )r5   rK   rs  re  r;  s     rV   r=  $ExitDeviceContextManagerLine.codegen  s     ww"" #rX   c                    UR                   $ rJ   )%_generate_exit_device_context_managerr)  s     rV   r+  'ExitDeviceContextManagerLine.codegen_fx  s    >>>rX   rz   NrC  r-  r   r   r  r  r=  r+  r  rz   rX   rV   r  r    s    ?rX   r  c                  >    \ rS rSr% S\S'   S\S'   S
S jrSS jrSrg	)ExternKernelAllocLinei  r2  r   ir.ExternKernelAllocrS   c                    U R                   n/ UR                  5       QUR                  5       QnU R                  R	                  U R                   U5        g rJ   )rS   codegen_argscodegen_kwargsr   $_generate_extern_kernel_alloc_helper)r  r<  rS   argss       rV   r=  ExternKernelAllocLine.codegen  sD    yy=""$=t':':'<=99$))TJrX   c                    UR                   $ rJ   )_generate_extern_kernel_allocr)  s     rV   r+   ExternKernelAllocLine.codegen_fx!  s    666rX   rz   NrC  r-  r~  rz   rX   rV   r  r    s    !!
K
7rX   r  c                  >    \ rS rSr% S\S'   S\S'   S
S jrSS jrSrg	)ExternKernelOutLinei%  r2  r   ir.ExternKernelOutrS   c           	     ^   U R                   n/ UR                  5       QUR                  SS9QnUR                  5       n[        R
                  R                  (       a  UR                  S:X  a  SnOUR                  5       nUR                  5       =n(       a  UR                  O[        R
                  R                  nU R                  R                  UUR                  5       UR                  (       a  UR                  R                  5       OS UUU R                   R                  5       5        g )NT)skip_outztorch::inductor::_mm_plus_mmaoti_torch__mm_plus_mm_out)rS   r  r  get_kernel_namer5   rK   rs  cpp_kernel_name
get_devicer   device_typer   "_generate_extern_kernel_out_helpercodegen_referenceoutput_viewget_stack_traces)r  r<  rS   r  kernel_nameddevices          rV   r=  ExternKernelOutLine.codegen*  s    yyJ""$Jt':':D':'IJ**,GG$$(FF 7K..0K!%!22A29L9L77""$484D4DD..0$II&&(	
rX   c                    UR                   $ rJ   )_generate_extern_kernel_outr)  s     rV   r+  ExternKernelOutLine.codegen_fx@      444rX   rz   NrC  r-  r~  rz   rX   rV   r  r  %  s    !!

,5rX   r  c                  >    \ rS rSr% S\S'   S\S'   S
S jrSS jrSrg	)FreeLineiD  r2  r   %Union[BufferLike, ir.TorchBindObject]rS   c                    U R                   R                  5       [        R                  R                  ;  d   eUR                  U R                  R                  U R                   5      5        g rJ   )rS   rM   r5   rK   removed_buffersr   r   make_buffer_freer;  s     rV   r=  FreeLine.codegenI  sF    yy!!#177+B+BBBBt||44TYY?@rX   c                    UR                   $ rJ   )_generate_freer)  s     rV   r+  FreeLine.codegen_fxM      '''rX   rz   NrC  r-  r~  rz   rX   rV   r  r  D  s    !!
//A(rX   r  c                      \ rS rSr% S\S'   S\S'   S\S'   S\S'   S\S	'   S
\S'   S\S'   S\S'   S\S'   S\S'   S\S'   SS jrSS jrSrg)KernelCallLineiQ  r2  r   r   r  ztuple[Any, ...]	call_argsraw_keysraw_args	list[str]	arg_typesr   r   zdict[str, Any]triton_metaztorch.devicer  
graph_namer   c                   U R                   R                  U R                  U R                  U R                  U R
                  U R                  U R                  U R                  U R                  U R                  U R                  S9
  g )N)r   r  r  r  r  r  r  r   )r   _generate_kernel_call_helperr  r  r   r  r  r  r  r  r  r   r;  s     rV   r=  KernelCallLine.codegen_  se    11NN;;nn]]]]((;;!%!:!: 	2 	
rX   c                    UR                   $ rJ   )_generate_kernel_callr)  s     rV   r+  KernelCallLine.codegen_fxm      ...rX   rz   NrC  r-  r~  rz   rX   rV   r  r  Q  sL    !!LO
/rX   r  c                  r    \ rS rSr% S\S'   S\S'   S\S'   SrS\S	'   S
rS\S'   SrS\S'   SS jrSS jr	Sr
g)KernelDefinitionLineiq  r2  r   r   r  kernel_bodyNr   metadataTr   gpucpp_definitionc                    U R                   R                  U R                  U R                  U R                  U R
                  U R                  S9  g N)r  r  r  )r   _define_kernel_helperr  r  r  r  r  r;  s     rV   r=  KernelDefinitionLine.codegenz  sB    **]].. 	+ 	
rX   c                    UR                   $ rJ   )_generate_kernel_definitionr)  s     rV   r+  KernelDefinitionLine.codegen_fx  r  rX   rz   rC  r-  )r   r   r  r  r   r  r  r  r=  r+  r  rz   rX   rV   r  r  q  s<    !!"Hm"C$(NM(
5rX   r  c                  >    \ rS rSr% S\S'   S	S jrS
S jrSS jrSrg)MemoryPlanningLinei  r2  r   c                    U $ )zFirst pass to find reuserz   r  states     rV   planMemoryPlanningLine.plan  s    rX   c                    g)zSecond pass to output codeNrz   r;  s     rV   r=  MemoryPlanningLine.codegen  s    rX   c                |   / n[         R                  " U 5       Hw  nUR                  S:X  a  M  [        XR                  5      nUR	                  UR                   SUR
                  [        R                  L a  UR                  5       OU 35        My     [        U 5      R                   SSR                  U5       S3$ )z6
Emits a string representation that fits on one line.
r   =(, r   )dataclassesfieldsr   getattrrj   r   r#   rc   rM   r   r   )r  r  fieldvals       rV   r  MemoryPlanningLine.__str__  s      ''-EzzY&$

+CKK::,a%**		2IsST	 . t*%%&a		$'8::rX   rz   Nr  r
  r   r  rC  r   r   )	r   r   r  r  r   r  r=  r  r  rz   rX   rV   r  r    s    !!);rX   r  c                  8    \ rS rSrS rSS jrS	S jrS	S jrSrg)
EfficientPeakEstimatei  c                   SSK JnJn  [        R                  R
                  R                  n[        [        R                  R                  R                  5       5      n[        [        R                  R                  5       5      nU" X45      nU" UUU5      u  U l        nSSKJn  U" U[        R                  [         S5      U l        g )Nr    )estimate_peak_memoryget_freeable_input_bufr6   )SegmentedTreer   )memoryr  r  r5   rK   	schedulernodesr   graph_inputskeysget_output_namesoverall_peak_memorysegmented_treer  operatorr   max)	r  r  r  scheduler_nodesr  graph_outputsnames_to_freeable_bufspeak_by_scheduler_noder  s	            rV   r  EfficientPeakEstimate.__init__  s    I''++11!!''"6"6";";"=>"177#;#;#=>!7!V;O"<
8 "8 	2+"HLL#q
rX   c                    [         R                  R                  R                  [         R                  R	                  U5      SS9[        UR                  5       5      -  $ )Nr   fallback)r5   rK   rQ   	size_hintrL   r,   rP   r  rS   s     rV   	_get_sizeEfficientPeakEstimate._get_size  sL    ww))GG//5 * 
4>>+,- 	-rX   c                n    U R                   R                  UR                  S-   UR                  S-
  5      $ r   )r  summarize_rangescheduler_node_indexr  line_aline_bs      rV   peak_between"EfficientPeakEstimate.peak_between  s6    ""22''!+V-H-H1-L
 	
rX   c                    UR                   S-   UR                   :X  a  g U R                  R                  UR                   S-   UR                   S-
  U R                  UR                  5      5        g r   )r  r  update_ranger   rS   r  s      rV   update_peak_between)EfficientPeakEstimate.update_peak_between  s^    &&*f.I.II((''!+''!+NN6;;'	
rX   )r  r  N)rS   
BufferLiker   r   )r  r"  r  AllocateLine)	r   r   r  r  r  r   r  r  r  rz   rX   rV   r  r    s    
&-



rX   r  c                  N    \ rS rSr% S\S'   S rSS jrSS jrSS jrSS jr	S	r
g
)r  i  r  rS   c                   [         R                  R                  R                  c   e[         R                  R                  R                  R                  [         R                  R                  R                  5      U l        g rJ   r5   rK   r  current_noder  indexr  r  s    rV   r6  AllocateLine.__post_init__  T    ww  --999$%GG$5$5$;$;$A$AGG**%
!rX   c                    UR                   S-   U R                   :X  a  gU R                  R                  R                  nU R                  R                  R	                  X5      nX$-   nXS:*  $ )Nr6   T)r  r   estimate_peakr  r  )r  	free_linere   r  peak_memory_in_rangenew_peak_memorys         rV   should_reuse_buffer AllocateLine.should_reuse_buffer  s`    ))A-1J1JJ"ll88LL#||99FFyW555rX   c           	     *   U R                   R                  5       [        R                  R                  ;   a  [        U R                  5      $ [        U R                   5      n[        R                  (       Ga
  X!;   Ga  UR                  U5      n[        R                  R                  R                  [        R                  R                  U R                   5      SS9[        U R                   R                  5       5      -  nU R!                  X45      (       aW  SUl        U R                  R$                  R'                  X05        [)        U R                  UR                   U R                   5      $ UR+                  X#5        U $ U R                   R-                  5       R.                  S:X  aj  U R                  R1                  U R                   5      nUbB  U=R2                  [5        [6        R8                  " [:        R<                  US5      5      -  sl        U $ )Nr   r  Tcpur6   )rS   rM   r5   rK   r  NullLiner   rW   r"   allow_buffer_reuser  rQ   r  rL   r,   rP   r  r  r  r  	ReuseLiner  rO   r   static_shape_for_buffer_or_noner  r   	functoolsreducer  mul)r  r  r   r  re   static_shapes         rV   r  AllocateLine.plan  s{   99177#:#::DLL)) tyy)$$$		#I77##--33DII> . tyy22456D ''	88&*	#**>>yO y~~tyyII

3*99((*//58<<GG		RL'11S$$X\\<C6 1 rX   c                    U R                   R                  5       [        R                  R                  ;  d   eU R
                  R                  U R                   5      nUR                  U5        g rJ   )rS   rM   r5   rK   r  r   make_buffer_allocationr   r  r<  r   s      rV   r=  AllocateLine.codegen  sK    yy!!#177+B+BBBB||22499=trX   c                    UR                   $ rJ   )_generate_allocater)  s     rV   r+  AllocateLine.codegen_fx  s    +++rX   r  N)r  r"  re   r   r   r   r  rC  r-  )r   r   r  r  r   r6  r  r  r=  r+  r  rz   rX   rV   r  r    s!    

68
,rX   r  c                  R    \ rS rSr% S\S'   SrS\S'   S rSS jrSS	 jrSS
 jr	Sr
g)r"  i   r  rS   Fr   r  c                   [         R                  R                  R                  c   e[         R                  R                  R                  R                  [         R                  R                  R                  5      U l        g rJ   r  r  s    rV   r6  !FreeIfNotReusedLine.__post_init__  r  rX   c                   [        U R                  R                  5       5      S:  a  U $ [        U R                  R                  [
        R                  5      (       a  U $ U R                  (       a   eU R                  R                  5       [        R                  R                  ;   a  [        U R                  5      $ [        R                  (       a%  UR!                  [#        U R                  5      U 5        U $ r  )r   rS   get_inputs_that_alias_outputrb   layoutr#   MultiOutputLayoutr  rM   r5   rK   r  r   r   r"   r!  r  rW   r  s     rV   r  FreeIfNotReusedLine.plan  s    tyy55781<Kdii&&(<(<==K>>!!99177#:#::DLL))$$JJ'		2D9rX   c                    U R                   R                  5       [        R                  R                  ;  d   eU R
                  (       d5  UR                  U R                  R                  U R                   5      5        g g rJ   )	rS   rM   r5   rK   r  r  r   r   r  r;  s     rV   r=  FreeIfNotReusedLine.codegen  sR    yy!!#177+B+BBBB~~NN4<<88CD rX   c                    UR                   $ rJ   )_generate_free_if_not_reusedr)  s     rV   r+  FreeIfNotReusedLine.codegen_fx      555rX   r0  Nr  rC  r-  )r   r   r  r  r   r  r6  r  r=  r+  r  rz   rX   rV   r"  r"     s'    
It

E
6rX   r"  c                  R    \ rS rSr% S\S'   S\S'   S\S'   SS jrSS jrSS	 jrS
rg)ReinterpretLinei   r  rS   	reused_asz	ir.Layoutr6  c                    U $ rJ   rz   r  s     rV   r  ReinterpretLine.plan&  s    rX   c                T   [        U R                  [        R                  5      (       d   e[        U R                  R                  [        R
                  5      (       d   eU R                  R                  U R                  R                  5       U R                  R                  5        g rJ   )
rb   r6  r#   NonOwningLayoutviewr&   r   codegen_deferred_allocationrA  rM   r;  s     rV   r=  ReinterpretLine.codegen)  sp    $++r'9'9::::$++**B,>,>????00NN##%t{{'7'7	
rX   c                    UR                   $ rJ   )_generate_reinterpretr)  s     rV   r+  ReinterpretLine.codegen_fx0  r  rX   rz   Nr  rC  r-  )	r   r   r  r  r   r  r=  r+  r  rz   rX   rV   r@  r@     s#    

/rX   r@  c                  V    \ rS rSr% S\S'   S\S'   SrS\S'   SS jrSS	 jrSS
 jrSr	g)r"  i4  r  rS   rA  Tr   
delete_oldc                |   U R                   R                  5       [        R                  R                  ;   aM  U R
                  R                  5       [        R                  R                  ;   d   e[        U R                  5      $ U R
                  R                  5       [        R                  R                  ;  d   eU $ rJ   )rS   rM   r5   rK   r  rA  r   r   r  s     rV   r  ReuseLine.plan:  s    99177#:#::>>**,0G0GGGGDLL))~~&&(0G0GGGGrX   c                x   U R                   R                  5       [        R                  R                  ;  d   eU R
                  R                  5       [        R                  R                  ;  d   eUR                  U R                  R                  U R                   U R
                  U R                  5      5        g rJ   )
rS   rM   r5   rK   r  rA  r   r   make_buffer_reuserM  r;  s     rV   r=  ReuseLine.codegenA  sz    yy!!#177+B+BBBB~~&&(0G0GGGGLL**499dnndooV	
rX   c                    UR                   $ rJ   )_generate_reuser)  s     rV   r+  ReuseLine.codegen_fxH  s    (((rX   rz   Nr  rC  r-  )
r   r   r  r  r   rM  r  r=  r+  r  rz   rX   rV   r"  r"  4  s'    
J
)rX   r"  c                      \ rS rSrSS jrSrg)r   iL  c                    UR                   $ rJ   )_generate_nullr)  s     rV   r+  NullLine.codegen_fxM  r  rX   rz   Nr-  r/  rz   rX   rV   r   r   L  s    (rX   r   c                  f    \ rS rSr% S\S'   S\S'   \SS j5       r\SS j5       r\SS j5       rS	r	g
)CommBufferLineiQ  r2  r   	ir.BufferrS   c                    SSK Jn  U R                  R                  5       nU R                  R	                  5       nU" U5      (       a  [        SU R                   35      e[        U5      UR                  -  $ )Nr   )is_symbolicz-The size of a comm buffer can't be symbolic: )torch._inductor.utilsr^  rS   	get_numelrP   AssertionErrorr   itemsize)r  r^  numelr   s       rV   re   CommBufferLine.sizeV  sd    5		##%		##%u ?		{K  5zENN**rX   c                    U R                   R                  5       n[        U[        R                  5      (       d   eUR
                  $ rJ   )rS   get_output_specrb   r#   CommBufferLayoutcomm_buffer_typer  r6  s     rV   rh  CommBufferLine.comm_buffer_typeb  s9    **,&""5"56666&&&rX   c                    U R                   R                  5       n[        U[        R                  5      (       d   eUR
                  $ rJ   )rS   rf  rb   r#   rg  
group_nameri  s     rV   rl  CommBufferLine.group_nameh  s9    **,&""5"56666   rX   rz   Nr   r   )r   zir.CommBufferTyper  )
r   r   r  r  r   propertyre   rh  rl  r  rz   rX   rV   r[  r[  Q  sG    !!
O	+ 	+ ' '
 ! !rX   r[  c                  8    \ rS rSrSS jr\S 5       rSS jrSrg)	CommBufferAllocateLineio  c                &   U R                   R                  5       [        R                  R                  ;  d   eU R                   R                  5       nU R                   R                  5       nU R                   R                  5       n[        U R                   R                  5       5      n[        U R                   R                  5       5      nUR                  U R                  U R                  U R                  U R                  UUUUU5      5        g rJ   )rS   rM   r5   rK   r  r  rP   r   get_size
get_strider   make_allocation_linerh  rl  r   )r  r<  r   r  r   shaperf   s          rV   r=  CommBufferAllocateLine.codegenq  s    yy!!#177+B+BBBByy!!#%%'		##%dii((*+tyy++-.%%%%		
rX   c                   U [         R                  R                  :X  aT  U SUR                  U5       SUR                  U5       SU SUR                   SU S[
        R                  " SS5       S3$ [        S	U  35      e)
Nz = empty_strided_p2p(r  z, torch.device("cuda:z"), group_name="z", alloc_id=r   l    r   zUnsupported comm buffer type: )r#   CommBufferTypeSYMM_MEMcodegen_shape_tupler  randomrandintr(  )rh  rl  r   r   r  r   rv  rf   s           rV   ru  +CommBufferAllocateLine.make_allocation_line  s     r00999&-..u56b..v67r' &&,ll^ 4)l +"NN1i89< &01A0BC rX   c                    UR                   $ rJ   )_generate_comm_buffer_allocater)  s     rV   r+  !CommBufferAllocateLine.codegen_fx      777rX   rz   NrC  r-  )	r   r   r  r  r=  rN  ru  r+  r  rz   rX   rV   rq  rq  o  s     
(  $8rX   rq  c                  (    \ rS rSrSS jrSS jrSrg)CommBufferFreeLinei  c                    U R                   R                  U R                  5      nUR                  U SU R                  R
                   S35        g )Nz # z buffer free)r   r  rS   r   rh  r   r+  s      rV   r=  CommBufferFreeLine.codegen  s@    ||,,TYY7$s4#8#8#>#>"?|LMrX   c                    UR                   $ rJ   )_generate_comm_buffer_freer)  s     rV   r+  CommBufferFreeLine.codegen_fx      333rX   rz   NrC  r-  r  rz   rX   rV   r  r    s    N4rX   r  c                  V    \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   SS
 jrSS jrSrg)MultiOutputLinei  zM
Given a MultiOutputLayout buffer, indexes actual buffer(s) from the result.
r2  r   r   result_namearg_nameSequence[Any]indicesc                   ^ ^ UU 4S jmT" T R                   T R                  5      nUR                  T R                  R                   T R
                   SU T R                  R                   35        g )Nc                  > [        U5      S:  a  US   u  p#[        U[        5      (       a  T" U  SU S3USS  5      $ [        U[        5      (       a;  TR                  R                  U TR                  [        U5      5      nT" XASS  5      $ [        U[        5      (       a  T" U  SU S3USS  5      $ [        SU5      eU $ )Nr   []r6   z['z']znon supported index type: )
r   
issubclassr  r   r   codegen_tuple_accessr  r   dictra  )basenamer  itypeituple_accesscodegen_list_tuple_accessr  s        rV   r  :MultiOutputLine.codegen.<locals>.codegen_list_tuple_access  s    7|a"1:eT**4z1#Q5GQRQSUUu--#'<<#D#D $"2"2CF$L 5\12;OOt,,4zA3b5I7STSU;WW()EuMMrX   r   )r  r  r   r   declarer  ending)r  r<  r   r  s   `  @rV   r=  MultiOutputLine.codegen  s]    	 $ *$--F||##$T%5%5$6c%ATAT@UV	
rX   c                    UR                   $ rJ   )_generate_multi_outputr)  s     rV   r+  MultiOutputLine.codegen_fx  s    ///rX   rz   NrC  r-  )	r   r   r  r  __doc__r   r=  r+  r  rz   rX   rV   r  r    s*     "!M
00rX   r  c                  H    \ rS rSr% S\S'   S\S'   S\S'   SS jrSS	 jrS
rg)IndexPutFallbackLinei  r2  r   ir.IndexPutFallbackrS   zlist[Optional[ir.IRNode]]r  c                   U R                   n[        R                  " UR                  5      (       d   eS UR                  S S  5       u  p4U R                   Vs/ s H0  nU(       a  UR                  5       OU R                  R                  PM2     nnU R                  R                  " UR                  5       X6U/UR                  5       Q76   g s  snf )Nc              3  @   #    U  H  oR                  5       v   M     g 7frJ   r  r{   ts     rV   r}   /IndexPutFallbackLine.codegen.<locals>.<genexpr>  s     Fo**,,o   r    )rS   r#   is_node_sequenceinputsr  r  r   none_str_generate_index_put_fallbackr  codegen_const_args)r  r<  rS   r   valuesidxr  s          rV   r=  IndexPutFallbackLine.codegen  s    yy""4;;////Fdkk"1oF ||
# (+C!!#0E0EE# 	 

 	11  "A	
9=9P9P9R	

s   7Cc                    UR                   $ rJ   )r  r)  s     rV   r+  IndexPutFallbackLine.codegen_fx  r>  rX   rz   NrC  r-  r~  rz   rX   rV   r  r    s    !!
&&
6rX   r  c                  >    \ rS rSr% S\S'   S\S'   S
S jrSS jrSrg	)ScatterFallbackLinei  r2  r   ir.ScatterFallbackrS   c           
     P   U R                   n[        R                  " UR                  5      (       d   eUR                  (       a  S UR                   5       u  p4nO$S UR                   5       u  p4UR
                  S   nUR                  5       =n(       a  UR                  O[        R                  R                  nU R                  R                  UX2R
                  S   XE/UR                  UR                  UR                  UR                  S   UR!                  5       U5        g )Nc              3  @   #    U  H  oR                  5       v   M     g 7frJ   r  r  s     rV   r}   .ScatterFallbackLine.codegen.<locals>.<genexpr>  s     Jk2244kr  c              3  @   #    U  H  oR                  5       v   M     g 7frJ   r  r  s     rV   r}   r    s     EA--//r  r6   r   r%  )rS   r#   r  r  src_is_tensorconstant_argsr  r   r5   rK   r  r   _generate_scatter_fallbackr  python_kernel_namer   r  )r  r<  rS   r   r  r   r  r  s           rV   r=  ScatterFallbackLine.codegen  s    yy""4;;////JdkkJOQsEEJQ$$Q'C!%!22A29L9L//""1%u2  ##KK!!		
rX   c                    UR                   $ rJ   )r  r)  s     rV   r+  ScatterFallbackLine.codegen_fx  r  rX   rz   NrC  r-  r~  rz   rX   rV   r  r    s    !!

(4rX   r  c                  H    \ rS rSr% S\S'   S\S'   S\S'   SS jrSS	 jrS
rg)SymbolicCallArgLinei  r2  r   r   argrD   rK   c                d    U R                   R                  U R                  U R                  5        g rJ   )r   "_generate_symbolic_call_arg_helperr  rK   r;  s     rV   r=  SymbolicCallArgLine.codegen  s    77$**MrX   c                    UR                   $ rJ   )_generate_symbolic_call_argr)  s     rV   r+  SymbolicCallArgLine.codegen_fx
  r  rX   rz   NrC  r-  r~  rz   rX   rV   r  r    s    !!	N5rX   r  c                  R    \ rS rSr% S\S'   S\S'   S\S'   S\S	'   SS
 jrSS jrSrg)UnbackedSymbolDefsLinei  r2  r   r   output_namer   outputs,Optional[dict[sympy.Symbol, pytree.KeyPath]]unbacked_bindingsc                z    U R                   R                  U R                  U R                  U R                  5        g rJ   )r   )_codegen_unbacked_symbol_defs_for_outputsr  r  r  r;  s     rV   r=  UnbackedSymbolDefsLine.codegen  s+    >>dllD,B,B	
rX   c                    UR                   $ rJ   )_generate_unbacked_symbol_defsr)  s     rV   r+  !UnbackedSymbolDefsLine.codegen_fx  r  rX   rz   NrC  r-  r~  rz   rX   rV   r  r    s#    !!LCC

8rX   r  c            	        ^  \ rS rSrSrSrU 4S jr\ S       SS jj5       rSS jr	SS jr
SS	 jrSS
 jrSS jr\SS j5       rSS jr\SS j5       rSS jr\SS j5       rSS jr  SS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrS r S r!S r"S  r#S! r$SS" jr%SS# jr&SS$ jr'SS% jr(SS& jr)SS' jr*SS( jr+SS) jr,SS* jr-S+ r.    SS, jr/ S             SS- jjr0SS. jr1SS/ jr2SS0 jr3S1 r4SS2 jr5S3 r6SS4 jr7S5 r8              SS6 jr9S7 r:SS8 jr;\<Rz                  SS9 j5       r>SS: jr?S; r@S< rAS= rBS> rCSS? jrD      SS@ jrESA rFSSB jrGSC rHSSD.SSE jjrISSD.SSF jjrJSSG jrKSSH jrLSSI jrMSSJ jrN  SSK jrO S   SSL jjrPSSM jrQSSN jrRSO rSSP rTSQ rUSR rVSS rWST rX   S         SSU jjrY\ S     SSV jj5       rZ   S         SSW jjr[SSX jr\  SSY jr]SSSZ jjr^      SS[ jr_SS\ jr`SS] jraS^ rbS_ rcS` rdSa reSb rfSc rgSd rhSe riSf rjSSg jrkSh rlSSSSSSSSi. SSj jjrmSSSSSSSkSSl. SSm jjrnSn roSo rpSp rqSSq jrrSSr jrs\Ss 5       rt SSt jruSu rvSSv jrwSSw jrxSSx jrySSy jrz S SSz jjr{SS{ jr|SS| jr}SS} jr~S~ rSS jrS rSS jrS r        SS jr        SS jrS r    SS jrSS jrS rS rS rS rS rS rSS jrS r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r    SS jrS rS rSrU =r$ )r2  i"  z:
Generate outer wrapper in Python that calls the kernels.
Tc                  >^  [         TT ]  5         [        5       T l        0 T l        [        5       T l        [        5       T l        [        5       T l        [        5       T l	        [        5       T l
        [        5       T l        [        5       T l        [        5       T l        [        5       T l        [        5       T l        0 T l        ST l        0 T l        [        5       T l        / T l        ST l        ST l        ST l        ST l        ST l        [6        R8                  R:                  (       a  SOST l        [6        R8                  R:                  (       a  SOST l        S T l         ST l!        0 T l"        [        5       T l#        [        5       T l$        S T l%        T RM                  5         / T l'        / T l(        T RS                  5         [U        T 5      (       d  T RW                  5         T RY                  5         [6        R8                  RZ                  (       dB  [6        R8                  R\                  R_                  5        H  u  pT Ra                  X5        M     [        [b           " 5       T l2        [        [b           " 5       T l3        0 T l4        [j        Rl                  " S 5      " T Rn                  5      T l7        [j        Rp                  S
U 4S jj5       nUT l9        0 T l:        [        5       T l;        [y        5       T l=        [        5       T l>        0 T l?        [        [        R                  R                  [        R                  R                  S	9T lE        / T lF        g )Nr    #r#  z
std::move(r   Tc                   > TR                   R                  U 5        [        R                  R                  (       a  TR
                  R                  U 5        g g rJ   )importsr   r"   r   r   r   )r   r  s    rV   add_import_once6PythonWrapperCodegen.__init__.<locals>.add_import_oncer  s;    LL""4(}}55**44T: 6rX   )debug_printer_leveluse_array_ref)r   r   r   r#  )Gr  r  r   _names_iterargs_to_buffersr-   r  headerprefixsuffixkernel_declarationswrapper_callkernel_autotune_defsr   subgraph_definitionsr   r   kernel_autotune_example_argskernel_autotune_tmp_arg_idxsrc_to_kernelkernel_numel_exprlinesr  declare_maybe_referencer  commentr  r5   rK   rs  
move_beginmove_endrn  supports_intermediate_hooksuser_defined_kernel_cacheunbacked_symbol_declsr5  launcher_fn_nameset_launcher_fn_namecodegened_graph_stackcomputed_sizes_stackwrite_headerr.   write_prefix!write_kernel_autotune_defs_headerrt  constant_reprsitemswrite_constant
BufferName	allocatedfreedreusesr$  	lru_cachewrite_get_raw_streamcacher  _metas
_meta_varsr   multi_kernel_statealready_codegened_subgraphsallocated_workspacesr   r"   aot_inductor debug_intermediate_value_printerallow_stack_allocationdebug_printeradditional_files)r  r   hashedr  r  s   `   rV   r  PythonWrapperCodegen.__init__)  s   */'  	 &'$&$&$&#1#3 *,$2$4!%3%5"$2$4!6@l" IK)01( .0HR!#
')$*+''*=*=,2 ww22;?)+/(QS&L 	" 9C $!!# &("$&!2488..0ww ! 6 6 < < >##D1 !? $J/1
+-
 57$-$7$7$=%%%
! 
	; 
	;
  /&(+5<"2"4<FL(46! 1 & 3 3 T T --DD
 !#rX   Nc                P    U (       a  Uc   eUc   e[        XU5      $ [        5       $ rJ   )SubgraphPythonWrapperCodegenr2  )is_subgraphsubgraph_nameparent_wrapperpartition_signaturess       rV   createPythonWrapperCodegen.create  s=      ,,,!---//C  $%%rX   c                    SU l         g )Ncall)r  r  s    rV   r  )PythonWrapperCodegen.set_launcher_fn_name  s
     &rX   c                D    U R                   R                  U SU 35        g )Nz = None  # )r  r   )r  r   r  s      rV   r  #PythonWrapperCodegen.write_constant  s    k&:;rX   c                   [         R                  R                  R                  5       nSnUb  UR                  b  SUR                   3nSn[        [        R                  R                  5      S:  a  SnO5[         R                  R                  R                  R                  (       a  SnU R                  R                  SU S[        R                   S	U S
3SS9  U R                   R                  SSS9   SSKJn  U R                   R                  SSS9  [        R*                  (       a  U R                   R-                  S5        [        R.                  R0                  (       Ga  U R                   R-                  S5        U R                   R-                  S5        U R                   R-                  S5        U R                   R-                  S5        U R                   R-                  S5        U R                   R-                  S5        U R                   R-                  S5        [        R.                  R2                  =(       d(    [4        R6                  R9                  [;        5       S5      nU R                   R-                  SU S35        SU S3nSU S3n[        R.                  R<                  n[        R.                  R>                  n	[        R.                  R@                  n
U R                   R-                  S5        U R                   R                  SU S U S!U	 S"U
 S#3	5        U R                   R-                  S$U S%35        U R                   R-                  S&5        gg! [&        [(        4 a     GNaf = f)'z>Write the header section of the generated Python wrapper code.r  Nz
# AOT ID: r   zRfrom torch._inductor.codegen.debug_utils import _print_debugging_tensor_value_infozFfrom torch._inductor.runtime.debug_utils import tracked_empty_strided
z
                aH  
                from ctypes import c_void_p, c_long, c_int
                import torch
                import math
                import random
                import os
                import tempfile
                from math import inf, nan
                from cmath import nanj
                from torch._inductor.hooks import run_intermediate_hooks
                from torch._inductor.utils import maybe_profile
                from torch._inductor.codegen.memory_planning import _align as align
                from torch import device, empty_strided
                from zq import AsyncCompile
                from torch._inductor.select_algorithm import extern_kernels
                z
            Tr   a  
                aten = torch.ops.aten
                inductor_ops = torch.ops.inductor
                _quantized = torch.ops._quantized
                assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                assert_alignment = torch._C._dynamo.guards.assert_alignment
                empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
                empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
                empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
                reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
                alloc_from_pool = torch.ops.inductor._alloc_from_pool
                async_compile = AsyncCompile()
            )_SymmetricMemoryzs
                empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
                zfrom torch.cuda import nvtxz import triton.profiler as protonz%import triton.profiler.language as plzCfrom triton.profiler.hooks import HookManager as _ProtonHookManagerzimport tritonzimport atexitz	import oszetriton.set_allocator(lambda size, align, stream: torch.empty(size, dtype=torch.uint8, device='cuda'))protonzos.makedirs("z", exist_ok=True)zos.path.join("z", "inductor")z", "inductor.chrome_trace")z^from torch._inductor.runtime.proton_utils import process_proton_trace as _proton_process_tracez
                def _proton_finalize_and_postprocess():
                    proton.finalize()
                    _trace_path = z
                    if os.path.exists(_trace_path):
                        _proton_process_trace(
                            _trace_path,
                            group_by_sm=z0,
                            split_invocations=z0,
                            per_cta_occupancy=z,,
                        )
                z5if not _ProtonHookManager.active_hooks: proton.start(z], backend="instrumentation", data="trace"); atexit.register(_proton_finalize_and_postprocess)pl.enable_semantic("triton"))!torch_guardsTracingContexttry_getaot_graph_namer   r"   r  r  	_inductortest_configstrack_memory_lifecycler  r   r!   r   r  torch._C._distributed_c10dr&  AttributeErrorImportErrorannotate_trainingr   r   proton_profilingproton_output_dirospathr   r   proton_group_by_smproton_split_invocationsproton_per_cta_occupancy)r  contextaot_config_commentinductor_debug_utilsr&  
output_dirproton_name
trace_pathgroup_by_smsplit_invocationsper_cta_occupancys              rV   r  !PythonWrapperCodegen.write_header  s7   --..6687#9#9#E#-g.D.D-E!F!v""CCDqH#w __##00GG#l #$ % $,,- .%& '!$ ' 	 	
* 	 ! 	 	
$	 DKK 	   ##KK!!"?@==)))KK!!"DEKK!!"IJKK!!U KK!!/2KK!!/2KK!!+.KK!!G  88 BGGLL=J KK!!M*=N"OP*:,nEK)*5PQJ --::K & F F & F FKK!!p KK# $., /) *5 6//@.A B//@.A B KK!!  +} -DD
 KK!!"@A[ *	 , 		s   8 M& &M:9M:c                    g rJ   rz   )r  r  s     rV   include_extra_header)PythonWrapperCodegen.include_extra_header      rX   c                    U R                   R                  S[        R                   S35         SSKJn  U R                   R                  SSS9  g ! [        [        4 a     g f = f)Na	  
                import torch
                from torch._dynamo.testing import rand_strided
                from torch._dynamo.utils import preserve_rng_state
                from torch._inductor.select_algorithm import AlgorithmSelectorCache
                from aH   import AsyncCompile

                async_compile = AsyncCompile()
                generate_example_value = AlgorithmSelectorCache.generate_example_value
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
            r   )_cuda_getCurrentRawStreamzU
                get_raw_stream = torch._C._cuda_getCurrentRawStream
                Tr   )r  r   r!   r   torch._CrK  r3  r2  )r  rK  s     rV   r  6PythonWrapperCodegen.write_kernel_autotune_defs_header  sw    !!((
 $,,- .	

	:%%,, 	 -  ^, 		s    A A#"A#c                   S[         R                   S3n[        R                  R                  (       a]  U R
                  R                  U5        U R
                  R                  [        R                  R                  R                  S5      5        [        R                  R                  (       d]  U R                  R                  USS9  U R                  R                  [        R                  R                  R                  S5      5        g g )NzU
            import triton
            import triton.language as tl
            from z+ import start_graph, end_graph
            get_raw_streamTr   )r'   r   r"   r   r   r   r   r   r5   rK   ru  import_get_raw_stream_asrs  r  r  
import_strs     rV   write_triton_header_once-PythonWrapperCodegen.write_triton_header_once.  s     $,,- .

 ==11&&--j9&&00"";;<LM ww""LL
$7LL"""";;<LM #rX   c                   [         R                  R                  R                  S5      n[        R
                  R                  (       a;  U R                  R                  U5      (       d  U R                  R                  U5        [         R                  R                  (       d=  U R                  R                  U5      (       d  U R                  R                  U5        g g g )NrO  )r5   rK   ru  rP  r"   r   r   r   containsr   rs  r  )r  import_get_raw_stream_strs     rV   write_get_raw_stream_header0PythonWrapperCodegen.write_get_raw_stream_header@  s    $%GG$6$6$O$O%
! ==11--667PQQ**445NOww""<<(()BCC&&'@A D #rX   c                $    U R                  5         g rJ   )rX  r  s    rV    write_get_raw_stream_header_once5PythonWrapperCodegen.write_get_raw_stream_header_onceK  s    ((*rX   c                   [        U5      nXR                  ;  a  S[        U R                  5       3nX R                  U'   U R                  R	                  U SU 35        [
        R                  R                  (       a;  U R                  R	                  U SU 35        U R                  R                  U5        U R                  U   $ )Nmetar   )reprr  r   r  r   r"   r   r   r   r  r   )r  r^  vars      rV   add_meta_once"PythonWrapperCodegen.add_meta_onceO  s    Dz{{"T[[)*+C #KKKK!!SETF"34}}55**44uCv5FG##C({{4  rX   c                ~    U R                  5        Vs/ s H  oR                  U R                  5      PM     sn$ s  snf rJ   )get_graph_outputsr  r  r  r   s     rV   get_output_refs$PythonWrapperCodegen.get_output_refs]  s?     =A<R<R<T
<Tq 1 12<T
 	
 
s   $:c                    g rJ   rz   r  s    rV   mark_output_type%PythonWrapperCodegen.mark_output_typec      rX   c                6    [         R                  R                  $ rJ   )r5   rK   r  r  s    rV   get_graph_inputs%PythonWrapperCodegen.get_graph_inputsf  s     ww###rX   c                6    [         R                  R                  $ rJ   )r5   rK   r  r  s    rV   rd  &PythonWrapperCodegen.get_graph_outputsk  s    ww$$$rX   c           
     6   U R                  5       R                  5        H  u  p[        U[        R                  [
        R                  45      (       a  M6  U[        R                  R                  ;  d  [        U[
        R                  5      (       a  Mu  [        UR                  5       5      S:X  a  M  U R                  UR                  5       5      nU R                  UR                  5       5      nU R                  R!                  SU SU SU S35        M     g )Nr   zassert_size_stride(r  r   )rm  r  rb   rs   r   r#   TorchBindObjectr5   rK   graph_input_namesGeneratorStater1   rs  r   rt  r  r   )r  r   bufre   rf   s        rV   codegen_input_size_asserts/PythonWrapperCodegen.codegen_input_size_assertsn  s    ..0668ID#

B,>,>?@@ 177444
R&&9 9  S\\^,1223<<>BD44S^^5EFFKK!!$7vRvRxq"QR 9rX   c                n   U R                   R                  S5        U R                  5       R                  5        Hx  u  p[	        U[
        R                  [        R                  45      (       a  M6  SU S3nU R                   R                  U5        SU S3nU R                   R                  U5        Mz     g )Nz(# make sure graph inputs are not nan/infzassert not z.isnan().any().item()z.isinf().any().item())	r  r   rm  r  rb   rs   r   r#   rr  )r  r   ru  r   s       rV   codegen_input_nan_asserts.PythonWrapperCodegen.codegen_input_nan_asserts  s    HI..0668ID#

B,>,>?@@ &;<DKK!!$' &;<DKK!!$' 9rX   c                :    U R                   R                  S5        g )NzV

            async_compile.wait(globals())
            del async_compile
            )r  r   r  s    rV   write_async_compile_wait-PythonWrapperCodegen.write_async_compile_wait  s    	
rX   c                    SR                  U5      n[        U5      S:X  a  US-  nU R                  R                  U S35        U R                  R                  S5        g )Nr  r6   ,z = argszargs.clear())r   r   r  r   )r  input_nameslhss      rV   
write_argsPythonWrapperCodegen.write_args  sP    ii${q 3JCWo.n-rX   c                    [         R                  (       a  U R                  R                  S5        SnU$ U R                  R                  SU R                   S35        SnU$ )Na  
                class Runner:
                    def __init__(self, partitions):
                        self.partitions = partitions

                    def recursively_apply_fns(self, fns):
                        new_callables = []
                        for fn, c in zip(fns, self.partitions):
                            new_callables.append(fn(c))
                        self.partitions = new_callables

                    def call(self, args):
                r    z
                def z(args):
                r6   )r"   graph_partitionr  r   r  r  prefix_indents     rV   !write_launcher_fn_call_get_indent6PythonWrapperCodegen.write_launcher_fn_call_get_indent  sm    !!KK M  KK**+ ,
 MrX   c                6    [         R                  R                  $ rJ   )r5   rK   rs  r  s    rV   get_graph_input_names*PythonWrapperCodegen.get_graph_input_names  s    ww(((rX   c                   U R                   c   eU R                  5         U R                  5       nU R                  R	                  U5         [
        R                  R                  (       aA  U R                  R                  [        R                  R                  R                  5       5        [        R                  R                  5       n[
        R                  (       a  U R                  R                  SU S35        U R                  5       =n(       a  U R!                  U5        U R#                  5         [%        5       (       a  ['        U 5      (       a  U R)                  5         S S S 5        g ! , (       d  f       g = f)Nz0training_annotation = nvtx._device_range_start(''))r  r|  r  r  r   r"   r   debug_sync_graphr   r5   rK   ru  synchronizeget_training_phaser4  r  r  codegen_inputsr/   r.   "codegen_input_size_and_nan_asserts)r  r  phasers  s       rV   r   !PythonWrapperCodegen.write_prefix  s	   $$000%%'>>@[[.}}--%%agg&8&8&D&D&FGGG..0E''%%FugRP %)$>$>$@@ @ 12!
 -..<TBB779) /..s   DE..
E<c                    [         R                  (       a  U R                  5         [         R                  (       a  U R	                  5         g g rJ   )r"   size_assertsrv  nan_assertsry  r  s    rV   r  7PythonWrapperCodegen.codegen_input_size_and_nan_asserts  s1    ++-**, rX   c                    U R                  5         SU 3n[        R                  R                  (       aB  U R                  R                  U SU S35        [        R                  R                  (       a  U$ U R                  U SU S35        U$ )Nstream = get_raw_stream(r   )	rX  r"   r   r   r   r   r5   rK   rs  )r  rl  r  r   s       rV   r
  )PythonWrapperCodegen.write_get_raw_stream  s    ((*
|$==11&&00&*:,a8 ww""$1*Q?@rX   c                     U R                   S   $ )N)r  r  s    rV   get_codegened_graph(PythonWrapperCodegen.get_codegened_graph  s    ))"--rX   c                :    U R                   R                  U5        g rJ   )r  rj   )r  rK   s     rV   r9  )PythonWrapperCodegen.push_codegened_graph  s    ""))%0rX   c                6    U R                   R                  5       $ rJ   )r  r  r  s    rV   rd  (PythonWrapperCodegen.pop_codegened_graph  s    ))--//rX   c                P    SSK Jn  U R                  R                  U" U5      5      $ )Nr   )deepcopy)copyr  r  rj   )r  r5  r  s      rV   r4  (PythonWrapperCodegen.push_computed_sizes  s!    !((//0HIIrX   c                6    U R                   R                  5       $ rJ   )r  r  r  s    rV   ra  'PythonWrapperCodegen.pop_computed_sizes  s    ((,,..rX   c                .    [        U R                  5       $ rJ   )nextr  r  s    rV   next_kernel_suffix'PythonWrapperCodegen.next_kernel_suffix  s    t''()*rX   c                   U R                  [        XR                  5      5        [        R                  R
                  (       a  U R                  5         U R                  R                  S[        R                  R                  R                  U5       S35        U R                  R                  5         [        U 5      (       a  U R                  5         U R                  R                  SU SU S35        Xl        g )Nrq  rr  r  r  r   )r   rk  rn  r"   r   r   rS  r   r5   rK   ru  rx  r:  r.   rX  )r  rl  s     rV   codegen_device_guard_enter/PythonWrapperCodegen.codegen_device_guard_enter  s    )*6W6WX	
 ==11))+&&00**77
CDAF &&0022488002&&00$6zl!D -7)rX   c                    U R                  [        5       5        [        R                  R                  (       a  U R
                  R                  5         g g rJ   )r   r  r"   r   r   r   re  r  s    rV   codegen_device_guard_exit.PythonWrapperCodegen.codegen_device_guard_exit  s6    356==11&&224 2rX   c                   U(       Ga1  [         R                  (       a  U R                  R                  SSR	                  U5      -   S-   5        U R                  R                  S5        U R                  R                  5         U R                  R                  S5        U R                  R                  5         U R                  R                  S5        U R                  R                  S5        U R                  R                  S5        U R                  R                  S	SR	                  U5      -   S-   5        g U R                  R                  S
5        g )Nzreturn_vars = (r  , )zfor var in return_vars:z!if isinstance(var, torch.Tensor):z#assert not var.isnan().any().item()z#assert not var.isinf().any().item()r    zreturn (z	return ())r"   r  r  r   r   r:  re  )r  output_refss     rV   generate_return$PythonWrapperCodegen.generate_return  s   !!!!++%		+(>>F !!++,EF!!++-!!++,OP!!++-!!++,QR!!++,QR!!--a0''
TYY{5K(Ke(ST''4rX   c                    g rJ   rz   r  results     rV   generate_before_suffix+PythonWrapperCodegen.generate_before_suffix)  rk  rX   c                    [         R                  (       aO  SR                  U R                  5      [	        U R                  5      S:X  a  SOS-   nUR                  SU S35        g g )Nr  r6   r  r  z-
                runner = Runner(partitions=[z{])
                call = runner.call
                recursively_apply_fns = runner.recursively_apply_fns
                )r"   r  r   all_partition_namesr   r   )r  r  all_partition_name_lists      rV   generate_after_suffix*PythonWrapperCodegen.generate_after_suffix,  se    !!&*ii0H0H&I43349r'# MM--D,E F "rX   c                    g rJ   rz   r  s     rV   generate_end!PythonWrapperCodegen.generate_end:  rk  rX   c                    UR                   nUb3  U[        ;   a)  [        U   R                  nUb  U" XR                  5        g U R                  [	        X5      5        g rJ   )r  r>   pythonr   r  )r  rS   op_namecustom_codegens       rV   generate_fallback_kernel-PythonWrapperCodegen.generate_fallback_kernel=  sR    ))7.J#J9'BIIN)t^^4,T89rX   c                    UR                  U 5        U R                  [        X5      5        [        UR                  [
        R                  5      (       a  UR                  U 5        g g rJ   )codegen_commentr   r  rb   r6  r#   Layoutcodegen_size_assertsr  s     rV   generate_extern_kernel_alloc1PythonWrapperCodegen.generate_extern_kernel_allocG  sJ    T",T89dkk299--%%d+ .rX   c           
        [        UR                  [        R                  5      nUR	                  5       nUR                  5       nUR                  5       nU R                  n[        R                  (       a  SU;   a  SU 3nU(       a5  U R                  U R                   U SSR                  U5       SU 35        g U R                  U R                   U SU SSR                  U5       SU 35        U R                  (       aR  [        R                  (       a<  Ub8  [        S   S==   S	-  ss'   U R                  S
UR                   < SU S35        g g g g )Nview_as_complex.clone()r  r  r   r   inductorintermediate_hooksr6   zrun_intermediate_hooks()rb   r6  r#   
NoneLayoutrM   get_origin_noder  r  r"   memory_planningr   r  r   r  generate_intermediate_hooksr   r   )r  extern_kernelr  	no_returnr  origin_noder  r  s           rV   r  9PythonWrapperCodegen._generate_extern_kernel_alloc_helperM  s;    }33R]]C	#,,.#335#335!!&7;&F  x(FNNdll^K=$))D/9J!F8TUNN<<.SQtyy>OqQWPXY 0066+$%9:a?:-k.>.>-AK=PQR , 7 1rX   c                Z    UR                  U 5        U R                  [        X5      5        g rJ   )r  r   r  r  s     rV   generate_extern_kernel_out/PythonWrapperCodegen.generate_extern_kernel_outj  s$     	T"*467rX   c                2   [         R                  R                  R                  nUR	                  XAS S S5        UR                  SU(       a  UOU 35        U   U R                  U SSR                  U5       S35        S S S 5        g ! , (       d  f       g = f)Nexternzout=r  r  r   )r5   rK   wrapper_coder  set_printer_argsrj   r   r   )r  r   outout_viewr  r  stack_tracesdebug_printer_managers           rV   r  7PythonWrapperCodegen._generate_extern_kernel_out_helperq  sw     !" 4 4 B B..tT4Rdx8S9:;"NNfXQtyy&7q9: #""s   'B
Bc                  ^  UR                   nUR                  nU(       a$  [        S U 5       5      n[        S U 5       5      nUR                  R	                  5        S3nSR                  U 4S jU 5       5      nSR                  U 4S jU 5       5      n[        R                  T UR                  5      nSnU SUR                   S	3nU SU SU SU 3n	U S
U	 S3n
U
$ )Nc              3  t   #    U  H.  n[         R                  R                  R                  U5      v   M0     g 7frJ   r5   rK   rQ   atomically_apply_size_hintr{   r  s     rV   r}   RPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>  s*     VQUA))DDQGGQU   68c              3  t   #    U  H.  n[         R                  R                  R                  U5      v   M0     g 7frJ   r  r  s     rV   r}   r    s+      HR1  ;;A>>
r  z.data_ptr()r  c              3  P   >#    U  H  n[         R                  TU5      v   M     g 7frJ   r2  val_to_arg_strr{   dimr  s     rV   r}   r    s$     XSWC-<<T3GGSW   #&c              3  P   >#    U  H  n[         R                  TU5      v   M     g 7frJ   r  r  s     rV   r}   r    s%      
FPs //c::jr  z$triton.tools.experimental_descriptorz.create_d_tma_descriptorr  r   )
dims
block_dimsr   tensorr  r   r2  r  element_sizerank)r  descapply_size_hintsr  r  ptrr   r  r   r  r!  s   `          rV   *_generate_tma_descriptor_call_experimental?PythonWrapperCodegen._generate_tma_descriptor_call_experimental  s    yy__
VQUVVD HR J ..01=yyXSWXXYY 
FP
 

 ,::4ARARS7xx		{*:;bbB|n=QtfArX   c                    UR                   nU(       a  [        S U 5       5      nSnU S3nUR                  R                  5        SU 3nU SU S3nU$ )Nc              3  t   #    U  H.  n[         R                  R                  R                  U5      v   M0     g 7frJ   r  r  s     rV   r}   LPythonWrapperCodegen._generate_tma_descriptor_call_stable.<locals>.<genexpr>  s+       HS1  ;;A>>r  z/triton.tools.tensor_descriptor.TensorDescriptorz.from_tensorr  r  r   )block_shaper   r  r  )r  r  r  r
  r  r   r  r!  s           rV   $_generate_tma_descriptor_call_stable9PythonWrapperCodegen._generate_tma_descriptor_call_stable  so    &&  HS  K Cx|$++//12"[MBQtfArX   c                    [        U[        R                  5      (       a  U R                  X5      $ [        U[        R                  5      (       d   eU R                  X5      $ rJ   )rb   r#   TMADescriptorExperimentalr  TMADescriptorStabler  )r  r  r  s      rV   _generate_tma_descriptor_call2PythonWrapperCodegen._generate_tma_descriptor_call  sW    dB8899BB  dB$:$:;;;;<<TTTrX   c                    U R                  U5      nUR                   SU U R                   3nU R                  U5        g Nr   )r  r   r  r   )r  r  r!  r   s       rV   generate_tma_descriptor,PythonWrapperCodegen.generate_tma_descriptor  s:    11$7))Cvdkk]3trX   c                8    U R                  [        X5      5        g rJ   )r   r  r  s     rV   generate_scatter_fallback.PythonWrapperCodegen.generate_scatter_fallback  s    *467rX   c	                   U SSR                  [        [        U5      5       3n	UR                  S5      (       a  U	SR                  S/U-   5      -  n	OU(       a  U	S[	        U5       3-  n	U	S-  n	U R                  U	5        g )Nr  r  zaten.scatter_reducer  r  z	, reduce=r   )r   mapr   r   r_  r   )
r  r   r  r  r  r  r%  r   r  r   s
             rV   r  /PythonWrapperCodegen._generate_scatter_fallback  s~     %%QsxxC0@'A&BC(()>??DIIrdVm,,D)DL>22trX   c                n   / nUR                   SS  n[        U5      n[        UR                  5       He  u  pVUR                  U   b?  [	        U5      n[        U[        R                  5      (       d   eUR                  U5        MT  UR                  S 5        Mg     U R                  [        XU5      5        g )Nr    )r  iter	enumerater  r  rb   r#   r%   rj   r   r  )r  rS   r  valid_indicesiter_valid_indicesr  _r  s           rV   generate_index_put_fallback0PythonWrapperCodegen.generate_index_put_fallback  s    -/AB!-0dll+DA||A*/0!%3333u%t$ , 	+D@ArX   c                v    SSR                  U5       S3nX&XE/nU R                  U R                  X5      5        g )Nr  r  r  )r   r   wrap_kernel_call)r  r   r   r  r  
accumulateindices_strr  s           rV   r  1PythonWrapperCodegen._generate_index_put_fallback  s;    $))G,-Q/3t,,V:;rX   c           
     `    U R                  U SU SSR                  U" 5       5       S35        g )Nr   r  r  r   )r   r   )r  buf_namer  get_argsop_overloadr  r  s          rV   ,generate_fallback_kernel_with_runtime_lookupAPythonWrapperCodegen.generate_fallback_kernel_with_runtime_lookup  s2     	(3'9&:!DIIhj<Q;RRSTUrX   c                p    [        S5         U R                  U5      sS S S 5        $ ! , (       d  f       g = f)NzPythonWrapperCodegen.generate)r   	_generater  is_inferences     rV   generatePythonWrapperCodegen.generate  s#    9:>>,/ ;::s   '
5c                0    [         R                  (       a  gg)Nr    r6   )r"   r  r  s    rV   get_wrapper_call_indent,PythonWrapperCodegen.get_wrapper_call_indent  s    !!rX   c              #  \   #    U R                   n Xl         Uv   X l         g ! X l         f = f7frJ   r   )r  newolds      rV   set_writeline"PythonWrapperCodegen.set_writeline  s'     nn	! NI NSNs   ,
! ,),c                    U R                   R                  n[        R                  R                  (       a  U R
                  R                  U5        g U R                  R                  U5        g rJ   )r  kernel_defsr"   r   r   r  r   r  )r  r?  s     rV   _write_multi_kernel_defs-PythonWrapperCodegen._write_multi_kernel_defs  sF    --99==11%%,,[9KK{+rX   c                
   [         R                  (       a  U R                  5         [        R                  " 5        nUR                  U R                  R                  5       5        [         R                  (       a  U R                  U5        [         R                  (       a  U R                  5         U R                  U5        [         R                  R                  (       a/  [         R                  R                  (       d  U R                  5         U R!                  U R                  R"                  5         U R$                   HP  n['        U[(        5      (       a  UR+                  U R                  5        M5  U R                  R#                  U5        MR     S S S 5        U R-                  5         U R/                  5       nU R1                  5         [         R                  R2                  (       aA  U R                  R#                  [4        R6                  R8                  R;                  5       5        [         R                  (       a  U R=                  5         [         R                  R>                  (       a  U RA                  5         [         R                  R                  (       a/  [         R                  R                  (       d  U RC                  5         [         R                  R                  (       a  U RE                  5         [         RF                  (       a0  [         RH                  (       d  U R                  R#                  S5        U RK                  U5        S S S 5        [M        5       nURO                  U RP                  5        UR#                  S5        URO                  U RR                  5        [4        R6                  RT                  (       aH  [4        R6                  RH                  (       a)  [4        R6                  RV                  (       a
  [M        5       nURO                  U RX                  5        U R[                  5         URO                  U R\                  5        U R_                  5       nUR                  U5         URO                  U R                  5        S S S 5        U Ra                  U5        URO                  U Rb                  5        U Re                  U5        U Rg                  U5        U Ri                  U5        URk                  5       U Rl                  Rk                  5       4$ ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       N= f)Nz+nvtx._device_range_end(training_annotation)r  )7r"   profile_bandwidthrS  r   	ExitStackenter_contextr  r   profiler_mark_wrapper_call#generate_profiler_mark_wrapper_callgenerate_start_graphrun_wrapper_ir_passesr   store_cubinr   !generate_reset_kernel_saved_flagsr<  r   r  rb   rH   r=  r@  rf  ri  r  r5   rK   ru  r  generate_end_graphr5  generate_proton_finalize generate_save_uncompiled_kernelsgenerate_and_run_autotune_blockr4  rs  r  r-   r   r  r  rt  is_const_graphr  finalize_prefixr  r6  r  r  r  r  add_benchmark_harnessgetvaluewithlinemapr  )r  r2  stackr   r  r  wrapper_call_indents          rV   r0  PythonWrapperCodegen._generate  si   ##))+!!#u 1 1 8 8 :;0088?''))+&&|4}}((1W1W668 ##D$5$5$?$?@ JJD!$44T%6%67))33D9 ' A ))+..0K!!#}}--!!++AGG,>,>,J,J,LM'''')}}----/}}((1W1W557}}55446 ''0B0B!!++A   -] $b  !dll#dkk" 77 3 38N8N#%F 	d//0dkk""::<]]./MM$++, 0 	##F+dkk"""6*&!""6* &&($$88:
 	
} A@ $#B 0/s2   C9T:4A!T(F?T:;U(
T7	2T::
U	
Uc                |  ^ U R                   R                  S5        0 n[        R                  R                  (       af  [
        R                  R                  (       aG  [        [
        R                  R                  5       VVs0 s H  u  p#U R                  U5      U_M     nnnU R                   R                  5       S-   U R                  R                  5       -   m[        R                  [        R                  :X  aj  [         R"                  " [%        5       SSS9 nUR'                  TR)                  S5      5        UR*                  nSSS5        [        R,                  " SW5        [/        S	S
 U4S jS9   [1        TU5        gs  snnf ! , (       d  f       NI= f! [2         a  n[5        SU 35      UeSnAff = f)z
Compose self.kernel_autotune_defs and self.kernel_autotune_calls into a single block of
code and execute it to trigger Triton kernel compilation and auto-tuning
zQ
            async_compile.wait(globals())
            del async_compile
        rp  z.pyF)dirr  deletezutf-8NzAuto-tuning code written to %sartifactc                     SSS.$ )N&inductor_autotune_at_compile_time_codestring)r   encodingrz   rz   rX   rV   r   FPythonWrapperCodegen.generate_and_run_autotune_block.<locals>.<lambda>{  s    @$!rX   c                    > T $ rJ   rz   )tuning_codes   rV   r   r_    s    {rX   )metadata_fn
payload_fnz%Failed to run autotuning code block: )r  r   r"   r   r   r5   rK   autotuning_inputsr  get_autotuning_input_namer   r   r$   levelloggingDEBUGtempfileNamedTemporaryFiler   writeencoder   debugr   exec	ExceptionRuntimeError)r  scoper  vf	file_pathera  s          @rV   rO  4PythonWrapperCodegen.generate_and_run_autotune_blockW  s   
 	!!((	
 ==11agg6O6O ((A(ABBFC ..s3Q6B  
 %%..0((1134 	
   GMM1 ,,Ke**734FF		
 !!0 	 +	
	Se$? (  	S!FqcJKQRR	Ss*   F-F9F 
F
F;'F66F;c                \    SSK Jn  U" U 5      R                  U R                  5      U l        g )Nr6   )MemoryPlanner)r  rx  r  r  )r  rx  s     rV   memory_plan PythonWrapperCodegen.memory_plan  s     2"4(--djj9
rX   c                D   U R                  5       n[        R                  R                  U5      nU R                  (       a  [        U R                  S   [        5      (       a  U R                  S   R                  R                  U;  av  U R                  R                  5         U R                  (       aK  [        U R                  S   [        5      (       a)  U R                  S   R                  R                  U;  a  Mv  [        5       /n/ n[        [        U R                  5      5       H  nU R                  U   n[        U[        5      (       a#  UR                  US   5      U R                  U'   MJ  [        U[        5      (       a  UR                  [        5       5        Mz  [        U[         5      (       d  M  UR                  UR                  5       5        M     UR                  UR                  5       5        [        U5      S:X  d   e[#        S U 5       5      ng )Nr  r   c              3  8   #    U  H  oR                   v   M     g 7frJ   )r  )r{   ss     rV   r}   9PythonWrapperCodegen.memory_plan_reuse.<locals>.<genexpr>  s      +
3Ga))3Gs   )rd  r5   rK   _get_output_namesr  rb   r  rS   r   r  r
  ranger   r  r1  rj   r_  sum)r  r  	out_namesplanning_statespast_planning_statesr  r   _total_allocated_buffer_sizes           rV   memory_plan_reuse&PythonWrapperCodegen.memory_plan_reuse  s   ((*GG--g6	 JJ4::b>+=>>

2##((	9 JJNN JJ4::b>+=>>

2##((	9 /01!s4::'A::a=D$ 233 $		/"*= >

1D"344&&':'<=D"233$++O,?,?,AB ( 	##O$7$7$9:?#q(((
 (+ +
3G+
 (
$rX   c                    U(       a&  [         R                  (       a  U R                  5         g [         R                  (       a  [	        5       U l        U R                  5         g rJ   )r"   r  ry  r!  r  r  r  r1  s     rV   rI  *PythonWrapperCodegen.run_wrapper_ir_passes  s9    F22((%:%<"""$rX   c           	       ^	 U R                   m	[        R                  U	4S j5       n[        R                  U	4S j5       n[        U[        R
                  5      (       aM  [        U[        R                  5      (       a  X#;   a  g T	R                  U SU 35        UR                  U5        g [        U[        R                  5      (       a  [        UR                  5       5       H^  u  pg[        U[        R                  5      (       d  M&  Xs;  d  M-  T	R                  U SU" U5       SU S35        UR                  U5        M`     [        UR                  5       5       H^  u  ph[        U[        R                  5      (       d  M&  X;  d  M-  T	R                  U SU" U5       SU S35        UR                  U5        M`     g [        U[        R                  5      (       a  g [        U[        R                  5      (       a  g [         R"                  R$                  R&                  (       a  g [)        S[+        U5       35      e)Nc                <   > TR                  U  SU  S35        U  S3$ )Nz_size = z.size()_sizer9  r   r<  s    rV   sizeofDPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.sizeof  s(    NNdV8D69:V5>!rX   c                <   > TR                  U  SU  S35        U  S3$ )Nz
_stride = z	.stride()_strider9  r  s    rV   strideofFPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.strideof  s)    NNdV:dV9=>V7##rX   r   r  r  zUnknown value type: )r  r$  r  rb   rs   r   Symbolr   r   r#   rh   r  rs  rt  rr  rt  r)  r.  r"   r  ra  r   )
r  r   r   
bound_varsr  r  r  re   rf   r<  s
            @rV   codegen_input_symbol_assignment4PythonWrapperCodegen.codegen_input_symbol_assignment  s    {{		" 
	" 
	$ 
	$ eUZZ((eU\\22e6INNeWCv./NN5!r||,,&u~~'78	dELL11d6LNNdV3vd|nAcU!#DENN4( 9  ))9)9);<fell338PNNfXS$0@#a#HINN6*  = r1122r0011%%55$';DK=%IJJrX   c           	     F   [         [        R                     " 5       nU R                  5       nUR	                  5        VVs/ s H)  u  p4[        U[        R                  5      (       d  M&  X44PM+     snnUR	                  5        VVs/ s H)  u  p4[        U[        R                  5      (       a  M&  X44PM+     snn-   nU H  u  pgU R                  XgU5        M         SS jnU H.  u  p[        U[        R                  5      (       d  M&  U" Xq5        M0     gs  snnf s  snnf )z$Assign all symbolic shapes to localsc                x   [         R                  " U R                  5       U R                  5       /5       H}  n[	        U[
        5      (       a  [	        U[        R                  5      (       a  M9  UR                   Vs/ s H  o3U;  d  M
  UPM     nn[        U5      S:  d  Mm  [        SU SU S35      e   g s  snf )Nr   zFor z, expected z to have been codegen-ed.)r   from_iterablers  rt  rb   r   rs   r  free_symbolsr   ra  )r   r  exprsymundefined_symbolss        rV   _verify_input_symbol_assignmentLPythonWrapperCodegen.codegen_inputs.<locals>._verify_input_symbol_assignment  s     ++U^^-=u?O?O?Q,RS!$--D%,,1O1O $(#4#4%#4C:8MC#4 " % ()A-(tfK0A/BB[\  T%s   <	B7	B7N)r   ir.TensorBoxr  OrderedSet[sympy.Symbol])	r   rs   r  rm  r  rb   r  r#   rh   )
r  r  r  krr  r  r   r   r  r!  s
             rV   r  #PythonWrapperCodegen.codegen_inputs  s    -/
 ,,.+113
3tqz!U\\7RFQF3
 , 2 2 4X 4Jq%,,<WVaV 4XY "KD00jI "		0	& HAeR\\22+E> 3
Xs    %D)D%D/Dc                   [        U[        R                  5      (       a  [        U[        R
                  5      (       a  XR                  ;   a  g U R                  R                  U5        [        R                  R                  R                  U   n[        X5      nU R                  [        X[        R                  5      5        g g g rJ   )rb   rs   r  r   r   PRECOMPUTED_SIZEr5  r   r5   rK   rQ   inv_precomputed_replacementsr   r   r  )r  r  r  r  s       rV   ensure_size_computed)PythonWrapperCodegen.ensure_size_computed
  s    c5<<((^CAVAV-W-W)))##C(77##@@ED!#,CNN.t!''BC .X(rX   c                    g rJ   rz   r  s    rV   rQ  $PythonWrapperCodegen.finalize_prefix  rI  rX   rR   c                   [        S5      e)Nz8codegen_cpp_sizevar is only implemented for cpp_wrapper!)rp  r  r   rR   s      rV   codegen_cpp_sizevar(PythonWrapperCodegen.codegen_cpp_sizevar  s    UVVrX   c                   [        XS9$ )Nr  )pexprr  s      rV   codegen_python_sizevar+PythonWrapperCodegen.codegen_python_sizevar  s    Q**rX   c                $    U R                  U5      $ rJ   )r  re  s     rV   codegen_sizevar$PythonWrapperCodegen.codegen_sizevar  s    **1--rX   c                    U SU S3$ )Nr  r  rz   )r  r  r   r  s       rV   r  )PythonWrapperCodegen.codegen_tuple_access  s    1UG1%%rX   c                    / [        U R                  U5      Qn[        U5      S:X  a  g[        U5      S:X  a	  SUS    S3$ SSR                  U5       S3$ )Nr   ()r6   r  r  r  r   )r  r  r   r   )r  rv  partss      rV   r   /PythonWrapperCodegen.codegen_python_shape_tuple"  s^    :#d1159:u:?u:?uQxj$$499U#$A&&rX   c                $    U R                  U5      $ rJ   )r   )r  rv  s     rV   r{  (PythonWrapperCodegen.codegen_shape_tuple*  s    ..u55rX   c                    SR                  SR                  U[        U5      [        U5      U R	                  U5      U R	                  U5      /5      5      / 4$ )Nzalloc_from_pool({})r  )formatr   r  r   r   )r  r   rg   r   rv  rf   s         rV   codegen_alloc_from_pool,PythonWrapperCodegen.codegen_alloc_from_pool-  s_     %++II&MJ33E:33F;

 
 
	rX   c                  ^  [        U5      u  pxpnU 4S jnUR                  5       nU=(       a    XI:H  nU(       a  X':H  =(       a    X8:H  nU
nObX!R                  R                  :H  =(       a7    X1R                  R                  :H  =(       a    XAR                  R
                  :H  nUR                  nU(       a  Ub  UU:w  a	  SU SU S3$ U $ U" XX4UU5      $ )Nc           	        > T
R                  U5      nT
R                  U5      nT
R                  U5      nSU  SU SU SU S3	n	Ub  XE:w  a	  SU	 SU S3$ U	$ )Nzreinterpret_tensor(r  r   aten.view.dtype()r   r  )r   tgt_size
tgt_stride
tgt_offset
cast_dtype
base_dtyper}  stoffr  r  s             rV   apply_reinterpretHPythonWrapperCodegen.codegen_reinterpret_view.<locals>.apply_reinterpret[  sz     //9A00<B&&z2C(b2bTC5BD%**B)$r*Q??KrX   r  r  r   )rp   rM   r6  re   rf   rg   r   )r  rk   re   rf   rg   r   r   d_sized_strided_offsetd_dtypecollapsibler  r   	collapsedsame_layoutr  s   `                rV   codegen_reinterpret_view-PythonWrapperCodegen.codegen_reinterpret_view<  s    8 ,D1 	9([		 }}6F$6	.?V-?K J ((( 1kk0001kk000 
 J Uj%8)$r%::V VUJOOrX   c                8    U R                  U SU SU S35        g )Nz.copy_(r  r   r9  )r  r   dstnon_blockings       rV   codegen_device_copy(PythonWrapperCodegen.codegen_device_copyz  s!    #gcU"\N!<=rX   c                    UR                  5       nUR                  S5      nU R                  [        XX1R                  5      5        g r  )rM   
input_namer   r  r  )r  rS   r  r  s       rV   codegen_multi_output)PythonWrapperCodegen.codegen_multi_output}  s2    mmo??1%t(LLQRrX   c           
     t   UR                    SUR                   SUR                    SUR                    3nU(       a  SUR                   SU S3nU R                  UR                   SUR                   SUR
                   SU S	35        U R                  R                  [        UR                  5      5        g )
N +  if z
 < 0 else zmax(0, min(r  z))r   z * (r   )	r  re   r   unbacked_offset_symbolbase_offsetbase_dim_strider  r   r   )r  rS   clamp	index_strs       rV   codegen_dynamic_select_index1PythonWrapperCodegen.codegen_dynamic_select_index  s    zzl#dii[TZZL
4::,W	%dii[9+R@I**+3t/?/?.@DDXDXCYY]^g]hhij	
 	""&&s4+F+F'GHrX   c                  ^ ^ UU 4S jnU 4S jnTR                   nU" TR                  5      nU" TR                  5      nT R                  U SU 35        T R                  U SU 35        U" U S3U S3TR                  5      nT R                  U SU S35        T R
                  R                  [        TR                   5      5        g )	Nc           	     h  > TR                  [        R                  " S[        R                  " U TR                  5      5      5      nTR                  [        R                  " S[        R                  " U TR                  -   TR                  5      5      5      nTR                  U 5      nU SU SU 3$ )Nr   r  z >= 0 else )r  rs   MaxMinre   )r   posnegx_condrS   r  s       rV   clamp_indexDPythonWrapperCodegen.codegen_dynamic_slice_size.<locals>.clamp_index  s    &&uyyEIIa4K'LMC&&		!UYYq499}dii@AC ))!,FU$vhk#77rX   c                \   > US:X  a  U SU  3$ TR                  U5      nSU SU  SU SU 3$ )Nr6   z - r  r  z	 - 1) // )r  )	start_varend_varstepstep_r  s       rV   codegen_with_stepJPythonWrapperCodegen.codegen_dynamic_slice_size.<locals>.codegen_with_step  sJ    qy!#i[11((.Ewis9+SyHHrX   z	_start = z_end = _start_endz
 = max(0, r   )unbacked_size_symbolstartendr   r  r  r   r   )r  rS   r  r  r  r  r   	with_steps   ``      rV   codegen_dynamic_slice_size/PythonWrapperCodegen.codegen_dynamic_slice_size  s    	8	I ''DJJ'$((##iw/0#gcU+,%VnTlDIIN	#j156""&&s4+D+D'EFrX   c                8    U R                  [        X5      5        g rJ   )r   rW  r  s     rV   codegen_dynamic_scalar+PythonWrapperCodegen.codegen_dynamic_scalar  s    (45rX   c                   S UR                    5       u  n[        UR                  5      S:X  a#  U R                  UR                   SU S35        GOw[        UR                  5      S:X  aE  [        UR                  S   [        5      (       a#  U R                  UR                   SU S35        GO[        UR                  5      S:X  a  [        UR                  S   [        5      (       a  U R                  UR                   SU S35        U R                  S	UR                   S
UR                  S   R                   SUR                   SUR                  S   R                   S3	5        U R                  UR                   SUR                   SUR                  S   R                   35        O[        SUR                   35      eU R                  UR                  5        S35        g )Nc              3  @   #    U  H  oR                  5       v   M     g 7frJ   r  r  s     rV   r}   ?PythonWrapperCodegen._codegen_dynamic_scalar.<locals>.<genexpr>  s     >+Q&&((+r  r   r   .item()r6   z = 1 if z.item() else 0z_undivided = zassert z_undivided % z
 == 0, f'{z_undivided} not divisible by 'z_undivided // unrecognized keypath  = None)r  r   keypathr   r  rb   r   r   divisorra  rM   )r  rS   rk   s      rV   rY  ,PythonWrapperCodegen._codegen_dynamic_scalar  s   >$++>t||!NNdhhZs4&89!#
4<<?M(R(RNNdhhZxv^DE!#
4<<?K(P(PNNdhhZ}TF'BCNN$((=a1H1H0I Jxxj >t||A?V?V>WWXZ NN88*CzQ8O8O7PQ !#8!GHH 	$--/*'23rX   c           
     r  ^ ^ UU 4S jnU4S jnU4S jnTR                  / SQ5        TR                  5          TR                  SSS9  [        R                  R
                  R                  5        HT  u  pVTR                  SU 35        U" XVR                  5       UR                  5       UR                  UR                  5        MV     [        [        R                  R                  5      S	:  a^  TR                  S
5        [        R                  R                  R                  5        H!  u  pWTR                  SU 35        U" XW5        M#     [        R                  R                  R                  5        GH9  u  pV[        U[         R"                  5      (       aI  [        [        R                  R$                  R&                  R)                  US 5      [*        5      (       a  Mn  [        U[,        R.                  5      (       ad  [        [        R                  R                  5      S	:X  a  TR                  S
5        TR                  SU 35        U" XVR1                  5       5        M  [        U[         R2                  5      (       a2  U" U[        R                  R$                  R5                  USS95        GMB  [        U[,        R6                  5      (       a$  U" USUR                  R8                   S35        GM  UR;                  5        Vs/ s H+  n[        R                  R$                  R5                  USS9PM-     n	nUR=                  5        Vs/ s H+  n[        R                  R$                  R5                  USS9PM-     n
nU" UU	U
UR?                  5       URA                  5       5        GM<     SSRC                  [        R                  R                  RE                  5       5       S3nTR                  SU 35        TR                  S5        S S S 5        g s  snf s  snf ! , (       d  f       g = f)Nc                   > TR                  U  STR                  U5       STR                  U5       SU SU S3
5        g )Nz = rand_strided(r  
, device='	', dtype=r   )r   r   )r   rv  rf   r  r   r   r  s        rV   add_fake_inputFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_fake_input  sT    &(2259:"226:; <!()E7!5rX   c                2   > TR                  U  SU 35        g r  r9  )r   r  r   s     rV   add_expr_inputFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_expr_input  s    vS./rX   c                   > Uc  TR                  U  S35        g SS Kn[        U[        R                  5      (       d   eTR                  U  SUR                  U5      < S35        g )Nr  r   z = pickle.loads(r   )r   picklerb   r)  ScriptObjectdumps)r   r   r  r   s      rV   add_torchbind_inputKPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_torchbind_input  sb    }  D6!12eU%7%78888v%5fll56I5LANOrX   )r  r  z3def benchmark_compiled_module(times=10, repeat=10):z
                from torch._dynamo.testing import rand_strided
                from torch._inductor.utils import print_performance
                Tr   zglobal r   zimport pickle*   r  ztorch.cuda.default_generators[z].graphsafe_get_state()zcall([r  ])zfn = lambda: z8return print_performance(fn, times=times, repeat=repeat))#
writelinesr   r   r5   rK   	constantsr  r   re   rf   r  r   r   torchbind_constantsr  rb   rs   r  rQ   
var_to_valr   r   r#   rr  get_real_objr   r  rt  r  rs  rt  r  rP   r   r  )r  r   r  r  r  r   r   torchbind_objr   rv  rf   call_strs   ``          rV   benchmark_compiled_module.PythonWrapperCodegen.benchmark_compiled_module  sa   		0		P 	K	
 ]]_MM     !ww00668   74&!12**,ekk	  9 177../!3  1+,77+F+F+L+L+N'D $$wtf%56'<	 ,O  !ww3399;eU\\22zGG$$//33E4@,8 8 eR%7%7881776671<((9$$wtf%56'.@.@.BCuzz22
 #4)9)9)C)CETV)C)WXr'8'899"89K9K8LLcd "'!1!1A ((221r2B!1   "'!1!1!3!3A ((221r2B!3   #((*)E  <T  		!''*>*>*C*C*E FGrJH}XJ78WXK _fo _s+   KP(2P P(2P#BP(
P((
P6c                
   [         R                  (       d  gU R                  U5        UR                  / SQ5        UR	                  5          UR                  SS[        5        S3/5        SSS5        g! , (       d  f       g= f)z<
Append a benchmark harness to generated code for debugging
N)r  r  zif __name__ == "__main__":zBfrom torch._inductor.wrapper_benchmark import compiled_module_mainzcompiled_module_main('z', benchmark_compiled_module))r"   benchmark_harnessr)  r"  r   r+   r  r   s     rV   rR  *PythonWrapperCodegen.add_benchmark_harness	  sh     ''&&v.@A]]_X,-?-A,BB_` __s    A44
Bc                >    U R                  [        U UUUUUS95        g r  )r   r  )r  r  r  r  r  r  s         rV   define_kernel"PythonWrapperCodegen.define_kernel0	  s*     	 !-		
rX   c                    [         R                  R                  (       a,  U(       a%  [        R                  " SSU[        R
                  S9nU(       a  U S3OSnSU U  SU 3nU$ )Nz^// z# )flagsrp  r  z

r   )r"   r   r   resub	MULTILINE)r  r  r  metadata_commentbodys        rV   _format_kernel_definition.PythonWrapperCodegen._format_kernel_definitionC	  sZ     ==11h vvgtXR\\JH.6hZr?B&'}C}ErX   c                <   [         R                  R                  (       aR  U(       aK  U R                  XUS9nU R                  R                  U5        [        R                  R                  (       a  g U R                  XUS9nU R                  R                  U5        g )N)r  )
r"   r   r   r9  r  r   r5   rK   rs  r  )r  r  r  r  r  r  r8  s          rV   r  *PythonWrapperCodegen._define_kernel_helperO	  s     ==11c118 2 D %%,,T2ww""--x . 
 	4 rX   c                N    U R                   R                  UR                  5        g rJ   )r  r   r   )r  r   subgraph_codes      rV   define_subgraph_launcher_fn0PythonWrapperCodegen.define_subgraph_launcher_fne	  s    !!(()<)<=rX   c                (  ^^/^0^1^2^3^4 SSK JnJnJn	  SSKJm/Jn
JnJnJ	n  SSK
JnJn  UR                  n/ m40 m2/ m1/ nU1U44S jm0S.U/U0U2U4S jjnUR                   Vs/ s H  nUR                  PM     nnUR                   Vs/ s H"  nUR                   (       d  M  UR"                  PM$     nn[%        U5       GH  u  nnUU;   a  U" UT/" US9S	S
9  M  UT;  a  M#  TU   nTU   c  U" UT/" US9S	S9  M<  ['        U[(        R*                  5      (       a^  ['        U[(        R,                  5      (       a'  SUR.                  UR0                  R3                  5       4OSu  nnnU" UU" UUUUS95        M  ['        U[(        R4                  5      (       a.  U" UU" UUR7                  5       UR3                  5       S95        GM  ['        U[(        R8                  5      (       aM  U" UU" UUR:                  R7                  5       UR3                  5       UR<                  R>                  S95        GMr  ['        U[@        [B        RD                  45      =(       a*    [F        RH                  RJ                  RM                  US5      nU" UU" UU5      US9  GM     [O        T4S T1URP                   Vs/ s H  n[S        U5      PM     snS9nU[T        RV                  " [F        RH                  RY                  5       5      0 T2E[Z        R]                  US5      E[_        T4T1S9/S.nU(       a  [a        U5      US'   U(       a  [a        U5      US'   [c        U5      S:X  a0  URe                  5       n / [g        [B        Rh                  US   5      Qn!OS/U34S jjn"0 m3U V#s/ s H  n#/ [g        U"U#5      QPM     nn#U(       a  [c        U5      [c        U5      :X  d   e/ n$[k        [m        Xb5      S S	S9 HR  u  n#n%U$Ro                  U" U%5      / [g        [p        U#5      Q/ [g        [r        U#5      Q/ [g        [p        U#5      QS.5        MT     U	R                  U$/ [g        [t        T3Rw                  5       5      QS.n / T3Ry                  5       Qn![{        UR|                  5      /n&[c        U5      S:  aY  TRw                  5        HE  n['        U[(        R4                  [(        R8                  45      (       a  M4  U&Ro                  U5        MG     U&Ro                  [u        U5      5        U&R                  [u        U 5      5        [a        U&5      n&U&U R                  ;   a  / U R                  U&   QU!P7$ U S[c        U R                  5       3n'[        5       n([        R                  R                  (       a  U(R                  SU'< S35        OU(R                  SU< S35        U'U S '   U R                  UR                  5       5        U(R                  U" 5       5        [        R                  R                  (       a  U(R                  S!5        U(R                  S"/ [g        Xr5      Q< S#U < S$U< S%35        [        U5      n)[        R                  R                  (       a  U)R                  S&U S'3S&U' S'35      n)U)R                  S(S)5      n)U(R                  U)5        [F        RH                  RY                  5       n*U(R                  S*U*R                   S+35        [        R                  " UR|                  5      u  n+n,[        R                  " UR|                  5      n-S,U- S-U, 3n.U R                  U'U(R                  5       U.5        U'U4U R                  U&'   U'UU!4$ s  snf s  snf s  snf s  sn#f )0Nr    )config_to_dict	FixedGridPrecomputedGridr6   )ConstexprArgKernelArgTypeSizeArg	TensorArgTMADescriptorArg)gen_common_triton_importsTritonKernelc                J   > TR                  U5        TR                  U 5        g rJ   )rj   )r  r  arg_indices	signatures     rV   add_to_signaturePPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_to_signature	  s    S!s#rX   c                  > U(       aE  [        5       (       a  T" X5        UR                  T;   a  TUR                     TUR                  '   g g UR                  T;   d   eU(       a?  [        5       (       a  T" U T" UR                  S95        OT" X5        STUR                  '   g U(       a6  [        5       (       a  T" U T" UR                  S95        S TUR                  '   g T" X5        g )Nr   r6   )r4   r   )	r  r  is_constexprequals_1equals_nonerE  rO  r#  r   s	        rV   add_argGPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_arg	  s    133 %S.88v% +1*:Ichh' & xx6)))577
 )l.IJ(2*+Ichh' 577 )l.IJ*.Ichh'$S.rX   rR  T)rS  )rU  stable)experimentalNN)r   api_typer
  r   )r   bufferr   )r   r[  r   rg   )rT  )
size_dtyper  argdefs)r  )rN  r  r#  r   restore_valuereset_to_zeror   c                t  > [        U [        R                  5      (       al  / U R                  QnU(       d  U $ UR	                  [
        S9  U H0  nUT;   a  M  [        R                  " S[        T5       35      TU'   M2     [        U T5      $ [        U [        5      (       d   e[        R                  " U 5      $ )N)r   _launcher_s)rb   rs   r   r  sortr   r  r   r3   r   rt   )r  symbolsr  extra_launcher_argss      rV   rename_sizes_for_launcherYPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.rename_sizes_for_launcher
  s    dEJJ//2 1 12G"#LLSL)&"55$38<<)#.A*B)CD4+C0  ' &d,?@@!$,,,,}}T**rX   c                2    [        U S   R                  5      $ r   r   r   s    rV   r   HPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<lambda>(
  s    3qt{{3CrX   r   )r"   r  cpppython_slow)	grid_typeprecomputed_gridsrd  r!  zasync_compile.triton(z, '''r  r(  zG
            @triton_heuristics.user_autotune(
                configs=z ,
                inductor_meta=z,
                triton_meta=z{,
                filename=__file__,
                custom_kernel=True,
            )
            @triton.jit
            r   r  z'''z\'\'\'z''', device_str='r  z# Original path: rr  )FFF)r  r   r   r   )Qruntime.triton_heuristicsrB  rC  rD  commonrE  rF  rG  rH  rI  r   rJ  rK  r   paramsr   rS  numr  rb   r#   TMADescriptorr  r
  r  rP   rc   rM   r&   rk   r6  rg   r   rs   rt   r5   rK   rQ   statically_known_equalsrA   	arg_namesr7   r(   r  get_current_device_or_throwr  fromkeysr?   r   r   setup_grid_as_argsr  sympifyr   r   rj   r  r=   r   r  r  idr   extendr  r-   r"   unique_user_kernel_namesr   updateinductor_meta_commonr   r5  r   replacer   inspectgetsourcelinesgetsourcefiler0  r   )5r  r   r   r   restore_value_argsreset_to_zero_argsr   rB  rC  rD  rF  rG  rH  rI  rJ  rK  original_nameequal_to_1_argsrV  prs  
constexprsr  r   r  rZ  r
  r   rT  r   triton_signaturer  inductor_metaextra_launcher_call_argsre  r   rl  cfg	cache_keyr   r   
kernel_srccurrent_devicer!  linenosrcfiler  rE  rO  rM  r#  rd  rN  s5      `                                           @@@@@@rV   !define_user_defined_triton_kernel6PythonWrapperCodegen.define_user_defined_triton_kernelh	  s   	
 	

	
 	
 	D)+	$&	!#%'	$"	/ "	/H &,]]3]QVV]	3%+]]E]anneaee]
E!),HCj \s3$G& +Cc{"\s3Fc2#3#344 &c2+A+ABB "3??CJJ4H4H4JK9 1Hk5
 (!$%-(3"'	  RYY//!!$#&<<>"%--/  R%7%788 !!$#&88#4#4#6"%--/#&::#4#4	  *c5==1   ''**BB  Cc!2XFs -v -)/)9)9:)9AWQZ)9:	
 *&--agg.Q.Q.ST--3
 ''
, +01C+DK(+01C+DK(u:?,5,H,H,JM'FU]]E!H)E'F$+  EGINO<s4d;<EOSZ3w<777 "#E#)CT	c "(("0"5"5Ct$4"52UD!12':UD)9':	 -55%6'PS2E2L2L2N)O'PM
 (E)<)A)A)C'D$ VYY-	w<!}}!#		23E3E'FGG$$S) ' 	[)*]+,)$	666//	:( 
  #d&D&D"E!FG(*==11%%(=dXU&KL%%(=m=Ne&TU'+m$\>>@A8:;==))%%&DE83~78; <,/ 0(O ,			
 OvV
==11#++d=/,CtD6QR^TJ''{;
z*<<>!!$5n6I6I5J""MN**6995	6''		2&wiq9$$&	
 6:;4G&&y1[":::S 4E@ ;j Ps   ^ ^^%^
^c                    U SUR                    S3nUb  USU 3-  n[        R                  " USSS9n[        XRR                  5      nUS:H  nU(       d)  U R                  [        X[        R                  5      5        U$ )Nr!  rc  T)
is_integeris_positiver  )	r  rs   r  r   rc  r   r  r5   rK   )r  r  treer  sym_namer  r  is_benchmark_kernels           rV   generate_numel_expr(PythonWrapperCodegen.generate_numel_exprx
  s{    !]!DKK=6!F8$Hll8$G c::.)R/"NN.t!''BC
rX   c                j    U R                  UR                   S[        UR                  5       35        g r  )r   r   r  r   )r  r  rK   s      rV   r  7PythonWrapperCodegen._generate_symbolic_call_arg_helper
  s)     	#))Ccnn(='>?@rX   c                   UR                  5       n[        X5      nUR                  [        R                  :X  a  U R                  U5        GOBUR                  [        R                  :X  a2  U R                  U5        U R                  U R                  U5      5        OUR                  [        R                  :X  a  U R                  R                  U5      nU(       a]  [        U[        5      (       a  [        UR                  [        5      (       d   e[        R                  " UR                  U5      Ul        OUU R                  U5        U R                  U R                  U5      5        X0R                  U'   O[        UR                  5      e[         R"                  R$                  (       a  U R&                  R                  [(        R+                  U UUR,                  UR.                  [0        R2                  R4                  R7                  UR8                  5      4SS95        UR                  [        R                  :w  a/  U R&                  R                  [(        R                  X5      5        g g g )N)r6   )rv  rf   )rM   r  	zero_moder<   UNINITIALIZEDr   ZERO_ON_CALLmake_zero_bufferZERO_PER_GRAPHr  r   rb   rS   r;   maximumra  r"   r   r   r   r2  make_allocationr  r   r5   rK   rQ   r  r   )r  wsr   r   priors        rV   generate_workspace_allocation2PythonWrapperCodegen.generate_workspace_allocation
  s   {{}D%<<,:::NN4 \\.;;;NN4 NN40067\\.===--11$7E!%66:JJ< <   *11%**bA
t$t44T:;26))$/ ..==11&&00$44IIHH77++55bhh?A 5 	 ||0>>>**44(99$E ? 2rX   c                v    UR                   [        R                  :w  a  U R                  [	        X5      5        g g rJ   )r  r<   r  r   r"  )r  r  s     rV   generate_workspace_deallocation4PythonWrapperCodegen.generate_workspace_deallocation
  s,    <<,;;;NN.t89 <rX   c                $    U SU R                    3$ )Nz.zero_())r  )r  r   s     rV   r  %PythonWrapperCodegen.make_zero_buffer
  s    x}--rX   c                H    U SSR                  U5       SU R                   3$ )Nr  r  r   )r   r  )r  r   r  s      rV   r%  %PythonWrapperCodegen.wrap_kernel_call
  s'    q9-.a}==rX   c                    U R                   R                  S5        U R                   R                  S[        R                  R                   S35        UR                  U R                   R                  5       5        g )Nz*from torch.profiler import record_functionzwith record_function('graph_z_inductor_wrapper_call'):)r  r   r5   rK   graph_idrE  r   )r  rT  s     rV   rG  8PythonWrapperCodegen.generate_profiler_mark_wrapper_call
  sb    ##$PQ##*177+;+;*<<UV	
 	D--4467rX   c                :    U R                   R                  S5        g )Nzstart_graph())r  r   r  s    rV   rH  )PythonWrapperCodegen.generate_start_graph
  s    ##O4rX   c                `    U R                   R                  S[        R                  < S35        g )Nz
end_graph(r   )r  r   r"   profile_bandwidth_outputr  s    rV   rL  'PythonWrapperCodegen.generate_end_graph
  s'    ##j1P1P0SST$UVrX   c                    U R                   R                  [        R                  R                  R                  5       5        g)z<Synchronize GPU to ensure proton captures all kernel events.N)r  r   r5   rK   ru  r  r  s    rV   rM  -PythonWrapperCodegen.generate_proton_finalize
  s)    ##AGG$6$6$B$B$DErX   c                ^    U R                   R                  S[        R                   S35        g )NU
            for kernel in globals().values():
                if isinstance(kernel, zU.CachingAutotuner):
                    kernel.cuda_kernel_saved = False
            r  r   r'   r   r  s    rV   rK  6PythonWrapperCodegen.generate_reset_kernel_saved_flags
  s2      ''8'A'A&B C	
rX   c                ^    U R                   R                  S[        R                   S35        g)a  
Precompile and save the CUBINs of the Triton kernels that haven't
been precompiled and saved as a side effect of running the generated
JIT model (Python wrapper). This can happen when the model contains
control flow: only one pass through the control flow operators covers
the kernels that are saved, the remaining kernels are not launched,
hence not saved. The main purpose of this codegen is to compile and
save the Triton kernels outside the active control flow path for
subsequent AOTInductor code generation and compilation.
r  a  .CachingAutotuner):
                    if not kernel.cuda_kernel_saved:
                        if len(kernel.launchers) == 0:
                            kernel.precompile()
                        kernel.save_gpu_kernel(
                            stream="stream",  # use dummy stream
                            launcher=kernel.launchers[0],
                        )
            Nr  r  s    rV   rN  5PythonWrapperCodegen.generate_save_uncompiled_kernels
  s4     	  ''8'A'A&B C
	
rX   c                B    S nU Vs/ s H
  o2" U5      PM     sn$ s  snf )Nc                   [        U [        5      (       a  [        U 5      (       a  U S-   $ U $ [        U [        [        [
        [        45      (       a  [        U 5      $ [        [        R                  R                  R                  U 5      5      $ )Nr
  )rb   r   r@   r   floatr   r   r  r5   rK   rQ   rR   )r  s    rV   wrap_argAPythonWrapperCodegen.prepare_triton_kernel_call.<locals>.wrap_arg
  sg    #s##*B3*G*GsYPSPC#udO!DEE3xQWW--66s;<<rX   rz   )r  r  r  r  s       rV   prepare_triton_kernel_call/PythonWrapperCodegen.prepare_triton_kernel_call
  s%    	= *33#333s   c                  ^  [        U[        5      (       Ga!  [        U[        R                  5      (       a.  UR	                  5       R                  5       nT R                  U   nObT R                  R                  U5      (       a  UnT R                  U   nO0Uc   S5       eST R                   3nUnT =R                  S-  sl        Uc
   SU 35       e[        S UR                  5        5       5      n[        S [        R                  R                  U5       5       5      n[        S UR                  5        5       5      nUR                  5       n	UR!                  5       n
[        R                  R"                  R%                  UR'                  5       R(                  [*        R,                  S9nS	U S
U SU	 SU
 S
U S
U S3nT R.                  R1                  U SU 35        [        U[        R                  5      (       a2  T R3                  USS9nUnT R.                  R1                  U SU 35        U$ [5        U[6        R8                  5      (       d  [        U[:        5      (       a  [        U[<        5      (       a  UT R>                  ;   a  U$ Uc  gUn[        U[:        5      (       a  UR@                  nU[        R                  R"                  RB                  ;   a'  [        R                  R"                  RB                  U   n[=        [        R                  R"                  RE                  U[*        R,                  S95      $ [        U[<        [F        [H        [J        45      (       a  [=        U5      $ [        U[L        5      (       a  SS
RO                  U 4S jU 5       5       S3$ [Q        S[S        U5       35      e)NzBV.graph.get_buffer(arg) and raw_arg can't be None at the same timetmp_arg_r6   z Failed to find a buffer for arg c              3     #    U  H;  n[         R                  R                  R                  U[        R
                  S 9v   M=     g7fr  Nr5   rK   rQ   r  r"   unbacked_symint_fallbackr{   ru  s     rV   r}   BPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>  sA      
 (A	   ;;#<< <  (   AAc              3     #    U  H;  n[         R                  R                  R                  U[        R
                  S 9v   M=     g7fr  r  r  s     rV   r}   r    sA      $
 :A	   ;;#<< <  :r  c              3     #    U  H;  n[         R                  R                  R                  U[        R
                  S 9v   M=     g7fr  r  r  s     rV   r}   r  "  sA      
 *A	   ;;#<< <  *r  r  zgenerate_example_value(r  z, 'z', r   r   T)r  r  r#  r  c              3  Z   >#    U  H   nTR                  U[        U5      5      v   M"     g 7frJ   r   )r{   ar  s     rV   r}   r  S  s(      ZVYQR!@!@DG!L!LVYr   r  zUnsupported type )*rb   torch_dtyper#   rq  
get_tensorrM   r  r   r  r   rs  r5   rK   get_allocation_sizert  r  rP   rQ   r  rd   rg   r"   r  r   r   r  r  rs   Basicr   r   r  r   r  r  r   r  r   r  r   r(  r   )r  r  arg_typeraw_argr*  ru  re   allocation_sizerf   r  r   rg   r   s   `            rV   r   /PythonWrapperCodegen.generate_example_arg_value  sd   h,,'2#3#344"--/88:**3/%%))#..**3/* X* &d&F&F%GH00A50?L&Fse$LL? 
  D $ $
 44S9$ O  
 ) F ^^%FMMOEWW%%// ''88 0 F .dV2fXSE7RTU[T\\^_n^oopqE&&00H:S1HI'2#3#344 :: %) ;  **44zUG5LMO%++..*S/2R2R#s##$//)J?!#//nnagg&&CCCgg&&CCCH  ;;&"A"A <   c3t455s8OT""tyy ZVY ZZ[[\]]%(9$s)&EFFrX   c                   ^  [        U[        5      (       a!  SSR                  U 4S jU 5       5      -   S-   $ [        U5      $ )Nr  r  c              3  F   >#    U  H  nTR                  U5      v   M     g 7frJ   )_grid_dim_str)r{   ru   r  s     rV   r}   5PythonWrapperCodegen._grid_dim_str.<locals>.<genexpr>Z  s     R\T 2 24 8 8\s   !r  )rb   r  r   r  )r  grid_per_dims   ` rV   r  "PythonWrapperCodegen._grid_dim_strW  s?    lD))diiR\RRRUXX &&rX   )r  r   r  r  r  r  r   c                  U R                   R                  U V
s0 s H:  n
[        U
[        5      (       d  M  U
[        R
                  R                  U
5      _M<     sn
5        U=(       d    [        R
                  R                  5       nU R                  [        U UUUUUUUU[        R
                  R                  U	S95        gs  sn
f )z
Generates kernel call code.

triton: Defines whether the backend uses Triton for codegen. Otherwise it uses the CUDA language when gpu=True,
        and C++ when gpu=False.
)
r  r  r  r  r  r   r  r  r  r   N)r  r{  rb   r   r5   rK   try_get_bufferrt  r   r  r   )r  r  r  r  r   r  r  r  r  r   r  s              rV   generate_kernel_call)PythonWrapperCodegen.generate_kernel_call_  s    , 	## %$Cc3' 1QWW++C00$	
 @177>>@'#!!#'77<<%9!	
s
   C$Cr  )r  r   r  r  r  r  r  r   c          
     
  ^ ^^ U=(       d    [         R                  R                  5       nU(       d  UR                  S:w  a  UR                  S:X  a"  T R	                  T R                  TU5      5        g UR                  S:X  a%  T R	                  T R                  T S3U5      5        g [        SUR                   S35      eT R                  U5      nSR                  U5      n[        R                  T UR                  U	5      nU(       d$  SU S	3nT R	                  T S
T SU SU S	35        g T R                  5         [        R                  R                  (       Ga  TT R                   ;  Ga  Ub  [#        U5      [#        U5      :X  d   S5       eS mU
(       aI  [         R                  R$                  (       a*  [         R                  R$                  R'                  U
S 5      mSUU 4S jjnUU 4S jn/ nUc)  Ub   S5       eS /[#        U5      -  nS /[#        U5      -  nO[#        U5      [#        U5      :X  d   S5       e0 n[)        [+        X%Xg5      5       GH  u  nu  nnnnS n[-        U[.        5      (       a#  S[/        U5      ;   a  UR1                  S5      u  nnS nT(       a  UT;   a  T R3                  TU   5      nU(       aQ  Un[-        U[4        5      (       d9  [7        U[8        R:                  5      (       d  [-        U[<        5      (       a  UUU'   OUS:X  a  U" XgUU5      (       a  UU   nO[-        U[4        5      (       ag  [>        R@                  " SU5      (       a  UnO6UT RB                  ;  a  T RE                  UUU5      nOT RB                  U   S   nUT4T RB                  U'   OT RE                  UUU5      nURG                  Uc  UOU SU 35        GM     T RH                  R	                  S[         R                  RJ                  RM                  UR                  5       S35        T RH                  RO                  5         T RH                  R	                  T SSR                  U5       SU S	35        T RH                  RQ                  5         T RH                  R	                  [S        SUS5      5        T R                   RU                  T5        [         R                  RV                  (       a  g [         R                  RX                  RZ                  nUR]                  UTUS 5        U   T R	                  T SU SU S	35        S S S 5        T R                  5         g ! , (       d  f       N= f)Ncudar  mpsz.generated_kernelzdevice z nyir  z	c_void_p(r   r   r  z$call_args and arg_types do not matchc                    > TR                   R                  5        V Vs/ s H  u  pUT:X  d  M  U PM     nn nU(       a  SSR                  U5       S3$ gs  snn f )zAfter all the autotune kernel calls have been written (i.e.
self.kernel_autotune_example_args is complete), returns a deletion call
for all autotune example tensors that are unnecessary after kernel_name
is called.del r  rp  r  )r  r  r   )r  kntensors_to_deleter  r  s      rV   get_autotune_deletion_callUPythonWrapperCodegen._generate_kernel_call_helper.<locals>.get_autotune_deletion_call  se     '+&G&G&N&N&P%&P
[( &P " %
 %!$)),=">!?rBB%s
   AAc                  > X   nXC;   a  g[        [        X5      5       H  u  nu  pgXR:X  d  [        U[        5      (       d  M$  SnT(       a  UT;   a  TR	                  TU   5      nUS:X  a  MO   UR                  5       n	[        U	R                  5       H  u  pX:X  d  M  U SU
 S3X4'       g   M     g! [         a     M  f = f)zWe try to infer raw_arg (i.e. raw_args[idx]) from remaining raw_args.
This is particularly useful for jagged cases, where the dimension is often
being passed in as an input.Tr  z.shape[r  F)r  r   rb   r%   re  rd   re   r(  )r  r  r  reused_args
target_argr  raw_keyr  triton_inputr6  r  r}  autotune_argsr  s               rV   infer_arg_by_inputsNPythonWrapperCodegen._generate_kernel_call_helper.<locals>.infer_arg_by_inputs  s    
 &]
,-6s87N-O)A)xz'6'B'B #%L$M)A'+'E'E)'2( $r) 	!!(!3!3!5&/&<FC =IN'RUQVVW:X 7'+ '= .P, 	 / ! !!s   0-B5!B5.B55
CCzkeys are not None but args arez#call_args and raw_args do not matchr  r  z^(workspace|semaphore)r   rq  rr  z.run(z	, stream=z
<del_call>r  )/r5   rK   rt  r   r   r%  rp  r  r   r2  r
  r  rS  r"   r   r   r   r   autotuning_mappingr   r  r   rb   r   splitre  r  r  rs   r  r   r4  matchr  r   rj   r   ru  rx  r:  re  r*   r   rs  r  r  r  )r  r  r  r  r   r  r  r  r  r  r   call_args_strstream_name
stream_ptrr  r  all_argsr  r  r  r  r  r  r   r  arg_strr  r  s   ``                         @rV   r  1PythonWrapperCodegen._generate_kernel_call_helper  s    @177>>@&++/{{e#t44[)LM  %))[M9J*KYW
  #WV[[M#>?? 77	B		-0*??&,,

 $[M3JNN-qQ}oR
|1M %%' MM2224#=#== (S^s9~-M 6M !M#(B(B ! : : > >($! B H'I)II' 6C	N2 6C	N28}I6 96 K8AI(=944C7G c3''C3s8O"yy~HC.2 W%=#'#A#A%g.$L  *G%h<<"8U[[99%c?;;/6G,]':;( (
 *'2G+66 xx 93??"%D$E$EE"&"A"A7# #'"C"CC"H"K>E{=SD55c:"==c8WUG3;se1WI<NOU9Z &&00**77EFaH &&002&&00-uTYYx%8$9;-qQ &&224&&00 /I<X &&**;7ww"" !" 4 4 B B..y+yRVW"NNk]%i}TUVW #%%' #"s   6U**
U8c                :    U R                   R                  U5        g rJ   )r  rj   r  r   s     rV   r   PythonWrapperCodegen.writelineK  s    

$rX   c                8    U H  nU R                  U5        M     g rJ   r9  )r  r  r   s      rV   r"  PythonWrapperCodegen.writelinesN  s    DNN4  rX   c                L    U R                   R                  [        U5      5        g rJ   )r  rj   r0   )r  ctxs     rV   rE  "PythonWrapperCodegen.enter_contextR  s    

+c*+rX   c                @  ^ ^	 SSK Jn  U" 5       (       a  SS Kn[        U[        5      (       a  [        UR                  R                  5      $ [        U[        R                  5      (       a  [        U5      $ [        U[        [        45      (       aB  [        R                   " S S5      5       m	[        [        U5      " U	U 4S jU 5       5      5      $ [        U[         R"                  R$                  5      (       a  ['        U5      $ [        U[(        R*                  [(        R,                  [.        45      (       a  UR1                  5       $ U" 5       (       a0  [        UWR2                  R4                  5      (       a  [        U5      $ [        U[(        R6                  5      (       a  UR1                  5       $ [9        [        U5      5      (       aE  [;        U5      u  pVUR=                  5        H"  u  pxU[>        R@                  RB                  U'   M$     U$ [        U5      $ )Nr   )has_triton_packagec                  &    \ rS rSr% S\S'   S rSrg)1PythonWrapperCodegen.val_to_arg_str.<locals>.Shimia  r   refc                    U R                   $ rJ   )r  r  s    rV   __repr__:PythonWrapperCodegen.val_to_arg_str.<locals>.Shim.__repr__e  s    88OrX   rz   N)r   r   r  r  r   r  r  rz   rX   rV   Shimr
  a  s    $rX   r  c              3  \   >#    U  H!  nT" [         R                  TU5      5      v   M#     g 7frJ   r  )r{   r  r  r  s     rV   r}   6PythonWrapperCodegen.val_to_arg_str.<locals>.<genexpr>j  s)     VTUq1@@qIJJTUs   ),)"torch.utils._tritonr  r   rb   r   r  rS   r  rs   r   r   r  r  	dataclassr_  r   r)  _ops
OpOverloadr   r#   rc   
MutableBoxr&   r  languager   rt  r   r   r  r5   rK   opaque_value_type_classes)
r  r}  type_r  r   obj_repropaque_typesnr  r  s
   `        @rV   r  #PythonWrapperCodegen.val_to_arg_strU  s   :a""%%5::&&8OE4=))""$ $ #$ QVTUVV  5::0011&q))BIIr}}oFGG&&((!!jFOO4I4I&J&J7N2,,--&&((!$q'**%8%;"H$**,7811!4 -O7NrX   c           	     Z   UR                  5       nUR                  5       n[        UR                  5       5      n[        [        R
                  R                  U5      5      n[        UR                  5       5      nUR                  5       nU R                  UR                  5       X#XFXW5      $ rJ   )r  rP   r   rs  r5   rK   r  rt  get_is_pinnedr  rM   )r  r[  r  r   rv  allocation_shaperf   	is_pinneds           rV   r*  +PythonWrapperCodegen.make_buffer_allocation}  s    ""$  "foo'( !<!<V!DEv((*+((*	##OOve=M
 	
rX   c                |    Sn[         R                  R                  (       d  U R                  R	                  USS9  g g )Nzi
            from torch._inductor.runtime.debug_utils import check_memory_step, track_tensor
            Tr   )r5   rK   rs  r  r   rQ  s     rV   "write_memory_track_allocation_once7PythonWrapperCodegen.write_memory_track_allocation_once  s4    
 ww""LL
$7 #rX   c                   Uc  UnU R                  U5      nU R                  U5      n	U R                  U5      n
[        R                  R                  R                  R
                  (       a  U SU	 SU
 SU SUR                   SU S3nOmUR                  S:X  a  U(       a  U SU	 SU
 SU S	3nOGUR                  S
;   a  U SUR                   SU	 SU
 SU S	3
nOU SU	 SU
 SUR                   SU S	3
nX:w  a  USU SU
 S	3-   nU$ )Nz = tracked_empty_strided(r  z, dtype=r  z	', name='r  r  z = empty_strided_cpu_pinned(r   )r  r  xpumtiaz = empty_strided_r  z = empty_strided(r  z.as_strided()r   r)  r.  r"   r/  r0  r   )r  r   r  r   rv  rf   r   r!  r{  codegen_allocation_shape_tuplecodegen_stride_tupler  s               rV   r  $PythonWrapperCodegen.make_allocation  s    #$"==eD)-)H)H*
&  $>>vF??!!..EE&112"'( )  !;;- (b"  [[E!i&412"'('  [[:: &)&++a12"'('  &)12"'( )!;;-yq:  @,':&;2>R=SSTUUC
rX   c                8    U R                  [        U5      5        g rJ   )r   rP  r   s     rV   make_comment!PythonWrapperCodegen.make_comment  s    {4()rX   c           	     `    U R                    U SU U R                   SU R                   SU 3	$ )Nr      )r  r  r  )r  new_nameold_namer  s       rV   make_tensor_alias&PythonWrapperCodegen.make_tensor_alias  s6    ,,zXJt{{m2dll^STU\T]^^rX   c                (    SUR                  5        3$ )Nr  )rM   )r  r[  s     rV   r  %PythonWrapperCodegen.make_buffer_free  s    foo'())rX   c                8    SSR                  S U 5       5       3$ )Nr  r  c              3  $   #    U  H  ov   M     g 7frJ   rz   )r{   r   s     rV   r}   :PythonWrapperCodegen.make_free_by_names.<locals>.<genexpr>  s     >s   )r   )r  names_to_dels     rV   make_free_by_names'PythonWrapperCodegen.make_free_by_names  s    dii>>>?@@rX   c           	     `    U R                    U SU U U R                   SU R                   S3	$ )Nr   r0   reuse)r  r  r  )r  r3  r2  del_lines       rV   codegen_exact_buffer_reuse/PythonWrapperCodegen.codegen_exact_buffer_reuse  s@    ../zXJxjQUQ\Q\P]]_`d`l`l_mmsttrX   c                R    Ub$  U R                  U R                   SU SU 35        g g )Nz [Provenance debug handles] rr  )r   r  )r  r  debug_handles      rV   write_provenance_debug_handle2PythonWrapperCodegen.write_provenance_debug_handle  s4    
 #NN<<. <[M<.Y $rX   c                   UR                  5       UR                  5       :X  d   eUR                  5       nUR                  5       nSnU[        R                  R	                  5       ;  a  U(       a  SU R                  U5       3nUR                  5       UR                  5       :X  a4  UR                  5       UR                  5       :X  a  U R                  XEU5      $ U R                  XR                  5       UR                  5       SU R                  R                  5      nU R                   U SU U SU R                   S3$ )N;z; r   r   r0  r?  )rP   rM   r5   rK   r  r  rs  rt  rA  r  r  r   r  r  )r  r;  r:  rM  r3  r2  r@  reinterpret_views           rV   rQ  &PythonWrapperCodegen.make_buffer_reuse  s   }}#--/111<<><<>1773355*D11#678H<<>S\\^+0@CNNDT0T228xPP88!11d6G6G6Q6Q
 ,,z-=,>xj4<<.X^__rX   c                    U R                  [        UU R                   U SUR                  5        U R                   SU R
                   S35      5        g )Nr   r0  z alias)r   r9   r  r  r  r  )r  r   rF  s      rV   rG  0PythonWrapperCodegen.codegen_deferred_allocation  sS    <<.c$*@*@*B)CDKK=PRSWS_S_R``fg	
rX   c                  ^ UR                  5       nU[        R                  R                  ;   dM  X R                  ;   d>  [        U[        R                  [        R                  [        R                  45      (       a  g U R                  R                  U5        [        UR                  5       [        R                  [        R                  45      (       a  UR                  5       (       d  g UR                  5       n[        U[        R                   5      (       a  g [        U[        R"                  5      (       a  g [        U[        R$                  5      (       Ga?  [        UR&                  [        R(                  5      (       d*   S[+        UR&                  5       SUR&                   35       eUR&                  R,                  n[        U[        R.                  5      (       d   [+        U5      5       eUR,                  n[        U[        R0                  [        R(                  45      (       d   [+        U5      5       e[        U[        R(                  5      (       a  SU4S jjmT" U5      nU R3                  U5        U R5                  [7        XX5      5        g [        U[        R8                  5      (       a  U R5                  [;        X5      5        g U R5                  [=        X5      5        g )Nzunexpected r   c                2  > [        U [        R                  5      (       a  T" U R                  5       5      $ [        U [        R                  5      (       a  T" U R
                  5      $ [        U [        R                  5      (       d   [        U 5      5       eU $ rJ   )rb   r#   BaseViewunwrap_viewr  rk   rc   r   )targetunwrap_viewss    rV   rR  =PythonWrapperCodegen.codegen_allocation.<locals>.unwrap_views  sm    !&"++66+F,>,>,@AA!&"--88+FKK88%fbii88F$v,F8!MrX   )r   r\  )rM   r5   rK   r  r  rb   r#   DonatedBufferSubgraphBufferInputBufferr   get_defining_opExternKernelAllocMultiOutputshould_allocaterf  MutationLayoutSHOULDREMOVEr  rE  rF  r&   r   rk   ri   rc   codegen_allocationr   r@  rg  rq  r  )r  r[  r   r6  boxinput_bufferrR  s         @rV   r\  'PythonWrapperCodegen.codegen_allocation  s#     AGG+++~~%&2#3#3R5F5F"WXX4 &&(%%r~~6  **,,'')fb;;<<fbmm,,fb0011fkk2+=+=>> d6;;/06;;-@> ++""Cc2==11<49<188LlRYY8J8J,KLL dO L ,(:(:;;"  ,L9##L1NN?4vNOfb1122NN1$?@|D12rX   c                   UR                  5       n[        U[        R                  [        R                  45      (       a  U R                  [        X5      5        g [        UR                  5       [        R                  5      (       a  U R                  [        X5      5        g U R                  U5      (       d  g U R                  R                  U5        U R                  [        X5      5        g rJ   )rM   rb   r#   rV  rr  r   r  rf  rg  r  	can_reuser  r   r"  )r  r[  r   s      rV   codegen_free!PythonWrapperCodegen.codegen_free(  s      fr~~r/A/ABCCNN8D12f,,.0C0CDD NN-d;<~~f%%

t*489rX   c                2   UR                  5       nU[        R                  R                  ;   =(       d    U[        R                  R                  ;   =(       a:    [        [        R                  R                  U   [        R                  5      (       + =(       dz    U[        R                  R                  ;   =(       dV    U[        R                  R                  ;   =(       d2    U[        R                  R                  ;   =(       d    X0R                  ;   (       + $ rJ   )rM   r5   rK   r  r  rb   graph_inputs_originalr#   rT  r#  r$  never_reuse_buffersr  )r  r^  output_bufferr   s       rV   ra  PythonWrapperCodegen.can_reuse<  s    $$&AGG+++ 
",,, "GG11$79I9I 
" qww(((
" qww222
" qww222
" zz!
 	
rX   c                    UR                  5       U R                  ;   =(       a.    U R                  UR                  5          UR                  5       :H  $ rJ   )rM   r  )r  r[  reused_buffers      rV   	did_reusePythonWrapperCodegen.did_reuseL  sC     OO, KFOO-.-2H2H2JJ	
rX   c                z   [        X5      (       d   eU R                  U5        U R                  R                  UR	                  5       5        U R
                  R                  UR	                  5       5        UR	                  5       U R                  UR	                  5       '   U R                  [        XU5      5        g rJ   )	r`   r\  r  r   rM   r  r  r   r"  )r  r^  rg  s      rV   codegen_inplace_reuse*PythonWrapperCodegen.codegen_inplace_reuseT  s    $\AAAA-

|,,./=11340<0E0E0GM**,-y]CDrX   c                    [        U5      nX R                  ;   a  U$ U R                  R                  U5        U R                  U-   $ rJ   )r   r  r   r  )r  r   r   s      rV   codegen_unbacked_symbol_decl1PythonWrapperCodegen.codegen_unbacked_symbol_decl\  sA    6{---K &&**40<<$&&rX   c                    [        [        R                  R                  R                  U5      nU R                  [        XX#5      5        g rJ   )r   r5   rK   rQ   	shape_envr   r  )r  r  r  r  s       rV   (codegen_unbacked_symbol_defs_for_outputs=PythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputse  s=     6GG&&(9
 	"4gQ	
rX   c                   ^^^^ U(       d  g UR                  5        HM  u  nmSU4S jjmUUUU4S jnU R                  U R                  U5       SU" 5        U R                   35        MO     g )Nc                  > US:X  a  U $ [        U5      S:  ai  [        US   [        5      (       aQ  [        US   [        R                  5      (       a/  T" U  SUS   R
                   SUS   R                   S3USS  5      $ [        US   [        5      (       a  T" U  SUS   R
                   S3USS  5      $ [        US   [        R                  5      (       a^  [        R                  R                  (       a   T" S	US   R                   S
U  S3USS  5      $ T" U  SUS   R                   S3USS  5      $ [        US   [        5      (       a  T" U  SUS   R                   S3USS  5      $ [        SU 35      e)Nrz   r    r   r6   r   r  r   r  z	std::get<z>(r  r  z.__floordiv__(r  )r   rb   r   pytreeSequenceKeyr   r  r5   rK   rs  r   r  ra  )r  r  gos     rV   r{  JPythonWrapperCodegen._codegen_unbacked_symbol_defs_for_outputs.<locals>.go  s   b=K LA%"71:}=="71:v/A/ABB&'!*//!2!GAJNN3C1Ewqr{   
M::a
'8;WQR[II
F,>,>?? 77.. Ywqz~~&6ba@'!"+N  4&'!*..)9 ;WQR[I
  
K88 nWQZ5G5G4HJGTUTVKXX(+@	)JKKrX   c                   > [         R                  R                  (       a  [        T5      S:X  a`  TS   n T" TS   R	                  5       [        U [        R                  5      (       a"  [        U R                  5      S:w  a	  TSS  5      $ T5      $ [        TS   [        R                  5      (       d   eT" TTS   R                     R	                  5       TSS  5      $ T" TT5      $ )Nr6   r   )r5   rK   rs  r   rM   rb   r#   rY  r  ry  rz  r  )r  r{  r  r  r  s    rV   go_outerPPythonWrapperCodegen._codegen_unbacked_symbol_defs_for_outputs.<locals>.go_outer  s    77&&
 7|q(%aj  "#AJ//1)#r~~>>3s{{CSWXCX $ABK   ")	    *'!*f6H6HIIII!''!*.."9"B"B"DgabkRRk733rX   r   )r  r   r  zpytree.KeyPath)r  r   rq  r  )r  r  r  r  r}  r~  r{  r  s    ``   @@rV   r  >PythonWrapperCodegen._codegen_unbacked_symbol_defs_for_outputsr  sg     ! ,113JAw
L<4 4. NN44Q78HJ<}Uu 4rX   c                  ^ ^^^ UU U4S jnUU U4S jn T R                  TR                  5        T R                  T R                   STR                   35        U" 5         [
        R                  n[
        R                  " TR                  5         TR                  R                  US9  S S S 5        U" 5         T R                  5         g ! , (       d  f       N&= f! T R                  5         f = f)Nc                   > [        TR                  R                  5      [        T5      :X  d   e[        TR                  R                  T5       H3  u  pTR	                  TR
                   U  SU TR                   35        M5     g r  )r   rK   r  r   r   r  r  )inner_inputouter_inputouter_inputsr  subgraphs     rV   _codegen_subgraph_prefixSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_prefix  sr    x~~223s<7HHHH,/++\-( ||n[M[M$++O-rX   c                   > [        TR                  R                  5      [        T5      :X  d   e[        TR                  R                  T5       H5  u  pTR	                  U SU R                  5        TR                   35        M7     g r  )r   rK   r  r   r   r  r  )inner_outputouter_outputouter_outputsr  r  s     rV   _codegen_subgraph_suffixSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_suffix  st    x~~334M8JJJJ.1,,m/* #nC(F(F(H'I$++W/rX    subgraph: )parent_graph)	r9  rK   r   r  r   r5   set_graph_handlercodegen_subgraphrd  )r  r  r  r  r  r  r  s   ````   rV   codegen_subgraph_by_inlining1PythonWrapperCodegen.codegen_subgraph_by_inlining  s    			'%%hnn5NNdll^;x}}oFG$&77L$$X^^4//!- 0  5 %&$$& 54 $$&s$   A<C C,C 
CC C/c           	        UR                   nUR                  n[        UR                  5       5      UR                   Vs/ s H  oUR
                  PM     sn-   nSR                  U5      [        U5      S:X  a  SOS-   nU Vs/ s H  oR                  5       PM     n	nSR                  U	5      [        U5      S:X  a  SOS-   n
U R                  SU SU S35        UR                  5        VVs/ s H  u  pU(       d  M  UPM     nnnU(       a#  U R                  SSR                  U5       35        U R                  S	U
 S
U SU S35        U R                  SU S35        gs  snf s  snf s  snnf )z'Generate code to call a graph partitionr  r6   r  r  	partition	_args = [r  r  r  z) = self.partitions[z](partition_args)zdel partition_argsN)input_deallocationoutput_nodesr  r  symbol_inputsr   r   r   rM   r   r  )r  partition_idr  r  r  symbol_inputr  r  rS   output_namesr  r   
deallocater;  s                 rV   codegen_partition_call+PythonWrapperCodegen.codegen_partition_call  sn    2DD+88-22452F2T2T9
2T,2T9
 
 ;'#k2Ba2G3RP4@ALDLA))L)C4E4JSPRS 	<.	&CD *<)A)A)C
)C%TzD)C 	 
 NNT$))L"9!:;< 	y,\N+l^SYZ	
 	|nE:;-9
 B
s   E"E'-E,>E,c                V    [        U5       Vs/ s H  nSU 3PM
     snU l        g s  snf )N
partition_)r  r  )r  num_partitionsr  s      rV   set_all_partition_names,PythonWrapperCodegen.set_all_partition_names  s*    BGBW#XBW3j$6BW#X #Xs   &c           	     t   SR                  U5      [        U5      S:X  a  SOS-   nSR                  U5      [        U5      S:X  a  SOS-   nU R                  UR                  R                   SU S35        U R                  SU SUR                  R                   SUR                  R                   S	35        g )
Nr  r6   r  r  r  r  r  z) = r  )r   r   r   rK   r   )r  r  r  outer_flattened_outputsouter_output_namesouter_input_namess         rV   ,codegen_subgraph_call_with_flattened_outputsAPythonWrapperCodegen.codegen_subgraph_call_with_flattened_outputs  s     "YY'>?./14C"
 !IIl3|$)Cr
 	(..--.i8I7J!LM 	"#4(;(;'<Ahnn>Q>Q=RRXY	
rX   c                x   SR                  U5      [        U5      S:X  a  SOS-   nU R                  UR                  R                   SU S35        [
        R                  R                  R                  5         U R                  U SUR                  R                   SUR                  R                   S	35        g )
Nr  r6   r  r  r  r  r   r  r  )r   r   r   rK   r   r5   r  free_buffers)r  r  r  outer_buffer_namer  s        rV   codegen_subgraph_call*PythonWrapperCodegen.codegen_subgraph_call!  s     IIl3|$)Cr
 	(..--.i8I7J!LM 	
&&( 	 !X^^%8%8$98>>;N;N:OvV	
rX   c                   U R                  UR                  5        U R                  S5        U R                  U R                   SUR                   35        [
        R                  nUR                  UR                  l        UR                  UR                  l        UR                  R                  U R                  ;  a  [
        R                  " UR                  5         [        R                  " SS5         UR                  R                  5       u  p4S S S 5        S S S 5        UR                  R                  nU R                  R                  U5        U R                  UW5        g g ! , (       d  f       N[= f! , (       d  f       Nd= f)Nr  r  r  F)r9  rK   r-  r  r   r5   rs  
fx_wrapperr  r  r"   patchr=  r   r?  )r  r  r  r>  r!  r  s         rV   codegen_subgraph_common,PythonWrapperCodegen.codegen_subgraph_common2  s   !!(..1"T\\N+hmm_EFww%1%=%="$0$;$;!>>d&F&FF $$X^^4\\"3U;'/~~'='='?$M < 5
 %NN//M,,00?,,]MJ G
 <; 54s$   !E<9E+E<+
E9	5E<<
F
c                J    U R                  U5        U R                  XU5        g rJ   )r  r  )r  r  r  r  s       rV   'codegen_subgraph_with_flattened_outputs<PythonWrapperCodegen.codegen_subgraph_with_flattened_outputsG  s&     	$$X.99$;	
rX   c                J    U R                  U5        U R                  XU5        g rJ   )r  r  )r  r  r  r  s       rV   r  %PythonWrapperCodegen.codegen_subgraphO  s#     	$$X.""8;LMrX   c                   UR                  5       nU R                  U S[        UR                  5       35        UR                   Vs/ s H  o3R                  5       PM     nn[        R                  R                  (       aP  [        [        UR                  5      5       Vs/ s H
  oR SU S3PM     nnU R                  UR                  XF5        g U R                  UR                  XB5        g s  snf s  snf )N = [None] * r  r  )rM   r   r   r  r  r  r5   rK   rt  r  r  r  r  )r  invoke_subgraphr   ru  r  r  r  s          rV   codegen_invoke_subgraph,PythonWrapperCodegen.codegen_invoke_subgraphU  s    '')$|C0G0G,H+IJK;J;Q;QR;QC--/;QR77(-c/2I2I.J(K(K1&!A(K   --((, !!/":":LO Ss   C/"C4c                   UR                  5       nUR                   Vs/ s H  o3R                  5       PM     nnUR                  R                  5       n[	        UR                  [
        R                  5      (       d  U S3nU R                  U S[        UR                  5       35        U R                  SU S35        U R                  [        XR                  R                  5      5        [        R                  R                  (       aP  [        [        UR                  5      5       Vs/ s H
  ob SU S3PM     nnU R!                  UR                  XG5        OU R#                  UR                  XB5        U R                  [%        U 5      5        U R                  S5        U R                  [        XR&                  R                  5      5        [        R                  R                  (       aP  [        [        UR                  5      5       Vs/ s H
  ob SU S3PM     nnU R!                  UR&                  XG5        OU R#                  UR&                  XB5        U R                  [%        U 5      5        g s  snf s  snf s  snf )Nr
  r  r   rr  r  r  zelse:)rM   operandsr  	predicaterb   r#   ShapeAsConstantBufferr   r   r  r1  true_subgraphrK   r5   rt  r  r  r  r_  false_subgraph)r  conditionalr   ru  r  r  r  r  s           rV   codegen_conditional(PythonWrapperCodegen.codegen_conditionale  s   ##%;F;O;OP;OC--/;OP));;=	+//1I1IJJ$+W-I$|C0C0C,D+EFGYKq)*(/H/H/N/NOP775:3{?R?R;S5TU5TvQqc^5TMU--))< !!+";";\P'-.w(/I/I/O/OPQ775:3{?R?R;S5TU5TvQqc^5TMU--**L !!+"<"<lQ'-.9 Q V Vs   I8-I=Jc                ,	  ^  U 4S jnUR                  5       nUR                   Vs/ s H  oUR                  5       PM     nnUR                   Vs/ s H  oUR                  5       PM     nn[	        U5      nT R                  U S[	        U5       35        U(       a   T R                  U S[	        U5       S35        [        U5       H  u  pT R                  U SU	 SU
 35        M      / [        [	        U5      5       V	s/ s H
  o SU	 S3PM     sn	QUQnU S3/n[        U5      nUS	[	        U5       nU" UR                  X5        T R                  S
US    35        T R                  S5        U(       ax  [        U5       Hh  u  pT R                  [        T UR                  R                  5      5        T R                  U SU	 SU S35        T R                  [        T 5      5        Mj     Ow[        U5       Hh  u  pT R                  [        T UR                  R                  5      5        T R                  U SU	 SU S35        T R                  [        T 5      5        Mj     T R                  S5        T R                  [        T UR                  R                  5      5        U" UR                  X5        T R                  [        T 5      5        U(       a  T R                  [        T UR                  R                  5      5        [        [	        U5      5       H"  n	T R                  U SX-    SU SU	 S35        M$     T R                  [        T 5      5        T R                  [        T UR                  R                  5      5        U" UR                  X5        T R                  [        T 5      5        T R                  SUS    35        U(       a  T R                  S5        [        [	        U5      5       H  n	T R                  SU SX-    S35        T R                  [        T UR                  R                  5      5        T R                  U SU	 SU SX-    S35        T R                  [        T 5      5        M     g	g	s  snf s  snf s  sn	f )z1while_loop is codegened as a host side while_loopc                   > [         R                  R                  (       a  TR                  XU5        gTR	                  XU5        g)z3Helper method to deduplicate subgraph codegen logicN)r5   rK   rt  r  r  )r  r  r  r  s      rV   r  APythonWrapperCodegen.codegen_while_loop.<locals>.codegen_subgraph  s3    ww11(-X<<MrX   r  z.extend([[] for _ in range(z)])r  z] = r  _cond_resultNzshould_loop = r   zif not should_loop:z.unsqueeze(0).clone()r  zwhile should_loop:z	].append(r!  z    should_loop = z%# Stack outputs after loop completionzif len(z]) > 0:z] = torch.stack(z	], dim=0))rM   carried_inputsr  additional_inputsr   r   r  r  r  cond_subgraphr1  body_subgraphrK   r_  )r  
while_loopstack_outputr  r   ru  outer_carried_inputsouter_additional_inputs
ckp_offsetr  inpcond_outer_inputscond_outer_outputsbody_outer_inputsbody_outer_outputscarried_inputs   `               rV   codegen_while_loop'PythonWrapperCodegen.codegen_while_loop  s]   	 ""$/9/H/H 
/H!!#/H 	  
 0:/K/K#
/K!!#/K 	  #
 -.
$|C0D,E+FGHNN&3C8L4M3NcR   45FANNdV1QCtC512 6
&+C0D,E&FG&Fas!n&FG
$
 "&l34 
 //J5I1JK$$&7	
 	(:1(='>?@,-$-.B$C 0z7O7O7U7UVW$q4>STU/56 %D
 %..B$C 0z7O7O7U7UVW$q4hGH/56 %D
 	+,(z/G/G/M/MNO$$&7	
 	'-. NN,T:3K3K3Q3QRS3345$q(8	$q2NO 6NN+D12 	(z/G/G/M/MNO$$&7	
 	'-.+,>q,A+BCD NNBC3345a/?wGH0z7O7O7U7UVWfAaS 0a7GyQ /56 6 Q 
#
  Hs   RR?Rc                     [        U SS 5      (       a  g [        U [        5      (       a  U $ [        R                  R
                  R                  U 5      nUc  U$ [        U5      $ ! [         a     g f = f)Nr  )r  rb   r   r5   rK   
_shape_env_maybe_evaluate_staticro  )r   r  s     rV   statically_known_int_or_none1PythonWrapperCodegen.statically_known_int_or_none  sl    	q.$// !S!!''$$;;A>C{
s8O 		s!   A% A% -A% 
A% %
A21A2c                r    / nU  H.  n[         R                  U5      nUc    g UR                  U5        M0     U$ rJ   )r2  r  rj   )lstr  r   rp  s       rV   %statically_known_list_of_ints_or_none:PythonWrapperCodegen.statically_known_list_of_ints_or_none  s<    A&CCAFC{MM#	 
 rX   c                0    [         R                  U 5      S L$ rJ   )r2  r  )r  s    rV    is_statically_known_list_of_ints5PythonWrapperCodegen.is_statically_known_list_of_ints  s     !FFsKSWW	
rX   c                H    [         R                  U R                  5       5      $ rJ   )r2  r  rs  r[  s    rV   r#  4PythonWrapperCodegen.static_shape_for_buffer_or_none  s    #IIOO
 	
rX   c                0    [         R                  U 5      S L$ rJ   )r2  r#  r  s    rV   !can_prove_buffer_has_static_shape6PythonWrapperCodegen.can_prove_buffer_has_static_shape  s    #CCFKSWWWrX   c                    g rJ   rz   )r  r  node_schedules      rV   write_kernel_context_guard/PythonWrapperCodegen.write_kernel_context_guard  s    
 	rX   c                    g)z,
Mark the beginning of kernel context guard
Nrz   r  s    rV    write_kernel_context_guard_begin5PythonWrapperCodegen.write_kernel_context_guard_begin       	rX   c                    g)z&
Mark the end of kernel context guard
Nrz   r  s    rV   write_kernel_context_guard_end3PythonWrapperCodegen.write_kernel_context_guard_end  r  rX   )/r  r  r  r  r  r  r  r  r  r  r  r  r5  r  r  r  r  r  r  r  r  r  r   r  r  r   r  r  r  rn  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r   rJ   )r  r   r  r   r  Optional[PythonWrapperCodegen]r  $Optional[ir.GraphPartitionSignature]rB  )r   r   r  r   r   r#  )r  r   )r^  TritonMetaParamsr   r   r   r  )r   z>dict[str, Union[ir.TensorBox, ir.TorchBindObject, sympy.Expr]]r   zlist[IRNode])r  r  rn  )rl  r   r  r   r   r   r  )rl  r   r   r#  )r  r  r   r#  r  r-   r   r#  )rS   zir.FallbackKernelr   r#  )rS   r  )rS   r  r   r#  )r   r   r  r   r  r   r  r  r  r   r  zOptional[OrderedSet[str]]r   r#  )F)rS   r  )rS   r  r   r#  )r*  r   r  r   r+  zCallable[[], Sequence[str]]r,  z<Union[torch._ops.OpOverload, torch._ops.HigherOrderOperator]r  r  r  zSequence[ir.Buffer]r   r#  )r:  Callable[..., None]r   zIterator[Callable[..., None]])r2  r   )r   r   r   r  r  r  )r  r   )r   r   rR   r   r   r   )r   r   r   r   )r  r   r   r   r  r   r   r   )rv  zSequence[Expr]r   r   )r   ztuple[str, list[str]])r   r  r   r   )r  zUnion[bool, str])rS   zir.MultiOutput)NTN)
r  r   r  r   r  r   r  r   r  r   )r  r   r  r   r  r   )r   r   )r   z"list[list[Union[int, sympy.Expr]]])r  r   r  r   )r  r   rK   rD   r   r#  )r  r;   )r  r   )r[  r  )NF)r  )r[  r  )r;  r  )r3  r   r2  r   r@  r   )rD  rm  )r;  r  r:  r  rM  r   )r   r   rF  zir.ReinterpretViewr   r#  r[  r\  )r^  r\  rg  r\  )r  r   r  r   r  r  r   r#  )r  r   r  zir.GraphPartitionSignature)r  r   )r  r   r  z0Union[Sequence[BaseSchedulerNode], ExternKernel])r   r   r  r  r  supports_cachingr  rN  r  r  r  r  rG  r  r)   rS  rX  r[  ra  rf  ri  rm  rd  rv  ry  r|  r  r  r  r   r  r
  r  r9  rd  r4  ra  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r"  r  r-  r3  r6  r   contextmanagerr<  r@  r0  rO  ry  r  rI  r  r  r  rQ  r  r  r  r  r   r{  r  r  r  r  r  r  r  rY  r)  rR  r0  r9  r  r?  r  r  r  r  r  r  r%  rG  rH  rL  rM  rK  rN  r  r   r  r  r  r   r"  rE  r  r*  r$  r  r-  r4  r  r<  rA  rE  rQ  rG  r\  rb  ra  rk  rn  rq  ru  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r#  r  r  r  r  r  r$  r%  s   @rV   r2  r2  "  s    ]#~ 
 FJ	&&$& 7& C	& &'<oBb8  "	B + +! 
 

$	G$
%S$	(
.8):6-.10J
/+7&5
5$:,:8 8 
8 37;; ;  	;
 ; ; 0; 
; ,U
8(B<
	V	V  	V .		V
 R	V  	V %	V 
	V0 ! !,S
j.S`:
 
D%(K(K (K -	(KT'?RD @D W CG +.&'6	, <P '<P 
<P|>S
IG264*_YB, #'(,

 
  	

 
 &
& FJ		'*	6C	 	 #'(,!! !  	!
 ! &!,>N; 2N;`*A"A+8A	A
%N:.>85WF

2
4SGj' !2
2
r !v(v(p !,%P	
 8 8 TY.`*_*Au '+ $` 
53n:(
 
E'

 
 H	

 

JJ J H	J
 
JX+'Z<< 9<BY
$
"K*
NP /B]7~     
 

 
 

 X X H rX   r2  c                    ^  \ rS rSrSr S     SU 4S jjjrSS jrSS jrS rS r	S r
SS	 jrSS
 jrSS jrSS jr  SS jrSS jrSS jrSU 4S jjr\SS j5       r\SS j5       r\S S j5       rS rSrU =r$ )!r  i'  z
A wrapper codegen that generates code for a subgraph. For most of the
methods, we rely on the implementation in the PythonWrapperCodegen. But we
override a few functions to produce cleaner code (like avoiding writing
imports twice in the output code)
c                   > Xl         X l        X0l        [        TU ]  5         U R                  5       nUR                  U l        UR                  U l        UR                  U l        UR                  U l	        g rJ   )
r  r  r  r  r  get_root_graphr  r   r  r  )r  r  r  r  rootr  s        rV   r  %SubgraphPythonWrapperCodegen.__init__/  sh     +,$8!""$$($=$=!%)%?%?"!//)-)G)G&rX   c                &    U R                   U l        g rJ   )r  r  r  s    rV   r  1SubgraphPythonWrapperCodegen.set_launcher_fn_nameF  s     !% 2 2rX   c                    g rJ   rz   r  s    rV   r  )SubgraphPythonWrapperCodegen.write_headerL  rI  rX   c                    g rJ   rz   r-  s     rV   rR  2SubgraphPythonWrapperCodegen.add_benchmark_harnessO  rI  rX   c                    g rJ   rz   r-  s     rV   r)  6SubgraphPythonWrapperCodegen.benchmark_compiled_moduleR  rI  rX   c                    g rJ   rz   r  s    rV   r|  5SubgraphPythonWrapperCodegen.write_async_compile_waitU  rI  rX   c                6    U R                   R                  5       $ rJ   )r  r  r  s    rV   r  /SubgraphPythonWrapperCodegen.next_kernel_suffixX  s    ""5577rX   c                    g rJ   rz   r  s     rV   r  2SubgraphPythonWrapperCodegen.generate_after_suffix\  rk  rX   c                \    U R                   R                  SU R                   S35        SnU$ )Nz
            def z(args):
            r6   )r  r   r  r  s     rV   r  >SubgraphPythonWrapperCodegen.write_launcher_fn_call_get_indent_  s<    &&' (	

 rX   c                    gr   rz   r  s    rV   r6  4SubgraphPythonWrapperCodegen.get_wrapper_call_indenth  s    rX   c                    U R                   =n(       a6  UR                  UR                   Vs0 s H  n[        U5      U_M     sn-  nU$ [        R
                  R                  nU$ s  snf rJ   )r  input_nodesr  r   r5   rK   r  )r  rN  r}  r  s       rV   rm  -SubgraphPythonWrapperCodegen.get_graph_inputsk  sm     11191**#,#:#:.#:aA	#:. F
  WW))F.s   A&c                   U R                   =n(       aL  [        UR                  R                  5       5      UR                   Vs/ s H  o"R
                  PM     sn-   nU$ [        R                  R                  nU$ s  snf rJ   )	r  r  r  r  r  r   r5   rK   rs  )r  rN  r  namess       rV   r  2SubgraphPythonWrapperCodegen.get_graph_input_namesv  su    11191..33566?6M6M:6Ml!!6M: E
  GG--E:s   A<c                |    U R                   =n(       a  UR                  nU$ [        R                  R                  nU$ rJ   )r  r  r5   rK   r  )r  rN  r  s      rV   rd  .SubgraphPythonWrapperCodegen.get_graph_outputs  s;    11191,,G  gg++GrX   c                   > UR                  5       nU R                  =n(       a  X#R                  ;   a  g [        TU ]  U5        g rJ   )rM   r  r  r  r\  )r  r[  r   rN  r  s       rV   r\  /SubgraphPythonWrapperCodegen.codegen_allocation  s=     222I2@U@U8U "6*rX   c                8    U R                   R                  5         g rJ   )r  rS  r  s    rV   rS  5SubgraphPythonWrapperCodegen.write_triton_header_once  s     	446rX   c                8    U R                   R                  5         g rJ   )r  r[  r  s    rV   r[  =SubgraphPythonWrapperCodegen.write_get_raw_stream_header_once  s     	<<>rX   c                    U n[        U[        5      (       a#  UR                  n[        U[        5      (       a  M#  [        U[        5      (       d   eU$ rJ   )rb   r  r  r2  )r  r
  s     rV   r	  +SubgraphPythonWrapperCodegen.get_root_graph  sK    DH;<<&&D ;<< $ 45555rX   c                    g rJ   rz   r  s    rV   rO  <SubgraphPythonWrapperCodegen.generate_and_run_autotune_block  s    rX   )r   r  r  r  r  r  r  r  rJ   )r  r   r  r2  r  r  rB  r  r  rn  )r   zDdict[str, Union[ir.TensorBox, ir.TorchBindObject, sympy.Expr, None]]r   r  r  )r   r2  )r   r   r  r  r  r  r  r  rR  r)  r|  r  r  r  r6  rm  r  rd  r\  r)   rS  r[  r	  rO  r  r$  r%  s   @rV   r  r  '  s     FJ	HH -H C	H H.38		M	+ 7 7 ? ?   rX   r  )rS   r  r   r!  )r\   r  r]   r  )NN)r   r   r   zlist[triton.Config]r   zlist[TritonGrid]r   r  r   r   r   ztuple[str, str]r  )
__future__r   r  r   r  r   r$  r~  rg  r  r7  r|  r4  ri  collections.abcr   	itertoolsr   r   typingr   r   r	   r
   rs   r   r)  
torch._opstorch.utils._pytreeutils_pytreery  r   r  torch._dynamo.utilsr   r   r   #torch._inductor.codegen.debug_utilsr   $torch._inductor.codegen.multi_kernelr   %torch._inductor.runtime.runtime_utilsr   torch._library.opaque_objectr   r   torch._loggingr   %torch.fx.experimental.symbolic_shapesr   r   r   r   r   torch.fx.noder   torch.utils._ordered_setr    torch.utils._sympy.singleton_intr   torch.utils._sympy.symbolr   r   r  r!   r"   r#   	codecacher$   r%   r&   r   r'   runtime.hintsr(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   virtualizedr5   rn  r7   r8   r9   r:   r;   r<   	cpp_utilsr=   custom_extern_kernel_codegenr>   triton_utilsr?   r@   rA   rB   rC   r   rK   rD   rE   r  rF   wrapper_fxirrG   	getLoggerr   logdoprintr  r   r  r   r   r!  rc   r  r.  rW   r`   rp   r  r   r  r   r   r   r  r   r
  rH   r1  rF  rP  rW  r_  rk  r  r  r  r  r  r  r  r  r  r"  r@  r"  r   r[  rq  r  r  r  r  r  r  r  Liner2  r  rz   rX   rV   <module>rN     s   "    
     	  	  $ " 6 6     $ $ & E E C A ; R +  . / 9 : ( ( ' ( ' ,       F P P 2%!-) ! u{{C56299l*+
]OT12 >QF S> 	%UZZ
 #
%&2B1CU3PS8_1T(UU
 /3*.k&
k& k& k& ,	k&
 (k& k&\U&p   * **Y Y
 2 2 2 	/k 	/ 	/ ++ + + 	2 	2 	2 1{ 1 1 "@K "@ "@J?; ? 
7K 
7 
7 5+ 5 5< 	({ 	( 	( /[ / /> 5; 5 5* ; ; ;2%
 %
P 3,% 3, 3,l 6, 6 6> /( / /& )" ) ).(! (
 ![ ! !: )8^ )8 )8X 4 4 4 #0k #0 #0L 6; 6 6, 4+ 4 48 	5+ 	5 	5 8[ 8 8 
,-B,7 B,JXG#7 GrX   