
    ȅi                   J   % S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKJrJr  S SKJrJr  S SKJr  S SKJrJrJrJrJrJrJrJr  S SKJrJr  S SK r S SK!r!S SK"r!S SK#J$r$  S S	K%J&r'  S S
K(J)r)  S SK*J+r+  S SK,J-r-  S SK.J/r0  S SK1J2r2J3r3J4r4  S SK5J6r6J7r7  SSK8J9r9J:r:  SSK;J<r<  SSK=J>r>J?r?  SSK@JArA  SSKBJCrCJDrDJErEJFrFJGrGJHrHJIrIJJrJJKrKJLrLJMrMJNrN  SSKOJPrPJQrQJRrRJSrSJTrTJUrUJVrV  \(       ak  S SKWJXrXJYrYJZrZJ[r[  S SK"J\r\  SSK]J^r^  SSK_J`r`JaraJbrbJcrc  SSKdJere  SSKfJgrgJhrhJiri  SSK@Jjrj  SSKkJlrl  \" S 5      rm\X\\h   /\g4   rn\o\l   rp\\q\ R                  4   rs\qrt\!R                  R                  \wS!5      rx\R                  " \w5      rzGSS" jr{\R                   " S# S$5      5       r} " S% S&\R$                  5      r~ " S' S(\5      r\H" S)S*9 " S+ S,\5      5       r " S- S.5      r\R                   " S/ S05      5       r\R                   " S1 S25      5       r\R                   " S3 S45      5       r\R                   " S5 S65      5       r\R                   " S7 S85      5       r\\\\\\4   r0 rS9\S:'    " S; S<5      r0 rS=\S>'   0 rS?\S@'   0 rSA\SB'       GS               GSSC jjr " SD SE\5      r    GSSF jr      GSSG jrGSSH jr GS       GSSI jjrGSSJ jrGSSK jr\GR,                  GSSL j5       r        GSSM jr      GSSN jrGSSO jr\!GR6                  \!GR8                  \!GR:                  \!GR8                  0\!GR<                  \!GR>                  \!GR@                  \!GRB                  \!GRD                  \!GRF                  \!GRH                  \!GRJ                  \!GRL                  \!GRN                  \!GRP                  4 V s0 s H  n X _M     sn ErSP\SQ'           GSSR jr        GSST jr        GSSU jrGSSV jr " SW SX5      r " SY SZ\05      r/ " S[ S\5      r\GR`                  " S]\GRb                  S^9rGSS_ jr " S` Sa\>\\R\   5      r\R                   " Sb Sc5      5       r\" GS0 Sd\" \$GRn                  Se SfSg9_Sh\" \$GRn                  Si Sj SkSl9_Sm\" \$GRn                  Sn So SpSl9_Sq\" \$GRn                  Sr Ss StSl9_Su\" \$GRn                  Sv Sw SxSl9_Sy\" \$GRn                  Sz S{ SyS|9_S}\" \$GRn                  S~ S SSl9_S\" \$GRn                  S S S SS9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S S S SS9_S\" \$GRn                  S S SS|9_S\" \$GRn                  S S SSl9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S S SSl9_S\" \$GRn                  S S SSl9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_S\" \$GRn                  S SSg9_6rS\S'   GSS jr " S S\D5      r " S S\G5      r " S S\5      r\R                   " S S5      5       r " S S5      r\" 5       r " S S5      r " S S5      r\" S\qS9r\" SS\\S9r\(       a  \\!GR                  \T\\\\S4   4   4   r " S S\\\4   5      r " GS  GS5      r " GS GS\\\   5      r\R                   " GS GS5      5       r\GR,                  GS GS j5       r " GS GS5      r " GS	 GS
\?5      rgs  sn f (!      )annotationsN)ABCabstractmethod)autoEnum)chain)AnycastClassVarGeneric
NamedTupleOptionalTYPE_CHECKINGUnion)SelfTypeVar)ELEMENTWISE_TYPE_PROMOTION_KIND)_pytree)ConfigModule)
OrderedSet)int_oo)PythonPrinter)free_symbol_is_typesymbol_is_typeSymT)bound_sympyValueRanges   )configmetrics)DtypePropagationOpsHandler)BasicMathOpsMixinDefaultHandler)ShapePropagationOpsHandler)boolean_opsDeferredLineBasegenerate_assertget_current_backendIndentedBufferir_dataclass
ScopedDict	sympy_dotsympy_index_symbol
sympy_substriton_typeunique)NullHandlerops
OpsHandlerOpsValueReductionType	StoreModeV)CallableIteratorMutableMappingSequence)GraphModule)CustomGraphModulePass)BufferChoiceCallerFixedLayoutIRNodeLoopBody)BaseScheduling	SchedulerSchedulerNode)BlockShapeType   PythonWrapperCodegen_Tschedulec                    [         R                  [        R                  5      (       a  [         R	                  SU 5        g g )NzData type propagation: %s)schedule_logisEnabledForloggingDEBUGdebug)msgs    X/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/codegen/common.pydata_type_loggerrU   a   s-      //6< 0    c                  j    \ rS rSr% SrS\S'   S\S'   SS jr\SS j5       rSS	 jr	\SS
 j5       r
Srg)FileBackedGraphModulef   z
Output of FX wrapper codegen. Exposes the same methods as ModuleType, but these
map back to a GraphModule instead of Python source.
r<   gmzCallable[..., Any]compiled_fnc                .   [         R                  " SSSS9U l         [        R                  " [        R
                  U R                   R                  5        U R                    nUR                  U R                  5        S S S 5        g ! , (       d  f       g = f)Nzw+z.pyF)modesuffixdelete)	tempfileNamedTemporaryFileatexitregisterosremovenamewritevalue)selffs     rT   __post_init__#FileBackedGraphModule.__post_init__p   s^     !33eE
 			4==#5#56]]aGGDJJ ]]s   !B
Bc                .    U R                   R                  $ N)r`   rf   ri   s    rT   __file__FileBackedGraphModule.__file__z   s    }}!!!rV   c                     U R                   " U6 $ rn   )r[   ri   argss     rT   callFileBackedGraphModule.call~   s    &&rV   c                .    U R                   R                  $ rn   )rZ   codero   s    rT   rh   FileBackedGraphModule.value   s    ww||rV   )r`   NreturnNoner{   str)rt   	list[Any]r{   r	   )__name__
__module____qualname____firstlineno____doc____annotations__rk   propertyrp   ru   rh   __static_attributes__ rV   rT   rX   rX   f   sF    
 	O##  " "'  rV   rX   c                  H    \ rS rSrSrSrSr\S	S j5       r\S
S j5       r	Sr
g)WorkspaceZeroMode   r   rH   r   c                    X:X  d  U[         R                  :X  a  U $ U [         R                  :X  a  U$ [        SU < SU< S35      e)NzWorkspaceZeroMode.combine(, ))r   UNINITIALIZEDNotImplementedErrorabs     rT   combineWorkspaceZeroMode.combine   sI    6Q+999H!///H!$>qe2aU!"LMMrV   c                P    U (       a  [         R                  $ [         R                  $ rn   )r   ZERO_ON_CALLr   )	zero_fills    rT   	from_boolWorkspaceZeroMode.from_bool   s    $111 ...rV   r   N)r   r   r   r   r{   r   )r   boolr{   r   )r   r   r   r   r   r   ZERO_PER_GRAPHstaticmethodr   r   r   r   rV   rT   r   r      s9    MLNN N / /rV   r   c                  @    \ rS rSrSr\SS j5       r\SS j5       rSrg)	CodegenSymbol   zH
An IR object possibly corresponding to a variable in the wrapper code.
c                    g rn   r   ro   s    rT   get_nameCodegenSymbol.get_name       rV   c                    g rn   r   ro   s    rT   get_exampleCodegenSymbol.get_example   r   rV   r   Nr}   r{   z!Union[torch.Tensor, sympy.Symbol])	r   r   r   r   r   r   r   r   r   r   rV   rT   r   r      s/        rV   r   T)frozenc                  N   \ rS rSr% SrS\S'   S\S'   S\S'   S	\S
'   SrS	\S'   \R                  r	S\S'   \
S S!S jj5       r\
S"S j5       r\
S#S j5       r\
S#S j5       rS$S jr\rS%S jrS&S jrS'S jr\S'S j5       r\r\r\rS(S jrS)S jrS)S jrS*S jrS+S jrS,S jrSrg)-WorkspaceArg   a  A temporary buffer used for a single kernel, then discarded.

Not registered as a traditional buffer since there are no users,
so it would be dead code eliminated.

Args:
    nbytes: The size of the buffer in bytes.
    zero_fill: Whether the buffer should be initialized to zero.


sympy.Exprcountr   	zero_modetorch.devicedevicer~   
outer_namews_ptr
inner_nametorch.dtypedtypec                P    U  [        [        R                  R                  5       3$ rn   )nextr7   graphworkspace_id)prefixs    rT   unique_nameWorkspaceArg.unique_name   s!    $qww334566rV   c                    U R                   UR                   :H  =(       a9    U R                  UR                  :H  =(       a    U R                  UR                  :H  $ rn   )r   r   r   r   s     rT   can_joinWorkspaceArg.can_join   s@     LLALL(XQWW-?XAHHPQPXPXDX	
rV   c           	         [        U R                  UR                  -   [        R                  U R                  UR                  5      U R
                  U R                  U R                  U R                  S9$ N)r   r   r   r   r   r   )	r   r   r   r   r   r   r   r   r   r   s     rT   joinWorkspaceArg.join   sS    ''AGG#'//Q[[I''88||||
 	
rV   c           	        U R                   UR                   :X  a4  U R                  UR                  :X  a  U R                  UR                  :X  d   e[        [        R
                  " U R                  UR                  5      [        R                  U R                  UR                  5      U R                   U R                  U R                  U R                  S9$ r   )r   r   r   r   sympyMaxr   r   r   r   r   r   s     rT   maximumWorkspaceArg.maximum   s     GGqww188qxx#7ALLALL<X	
X))AGGQWW-'//Q[[I''88||||
 	
rV   c                    U R                   $ rn   r   ro   s    rT   
get_deviceWorkspaceArg.get_device   s    {{rV   c                    U R                   $ rn   r   ro   s    rT   	get_dtypeWorkspaceArg.get_dtype   s    zzrV   c                >    U R                  5       R                  5       $ rn   )
get_layoutr   ro   s    rT   r   WorkspaceArg.get_example   s     ,,..rV   c                `    SSK Jn  U" U R                  U R                  U R                  /S/S9$ )Nr   )r@   rH   )r   r   sizestride)irr@   r   r   r   )ri   r@   s     rT   r   WorkspaceArg.get_layout   s.    $;;****3	
 	
rV   c                "    U R                  5       $ rn   )r   ro   s    rT   layoutWorkspaceArg.layout   s      rV   c                6    [         R                  R                  $ rn   )r   SZeroro   s    rT   
get_offsetWorkspaceArg.get_offset   s    ww||rV   c                    U R                   /$ rn   )r   ro   s    rT   get_sizeWorkspaceArg.get_size   s    

|rV   c                8    [         R                  R                  /$ rn   )r   r   Onero   s    rT   
get_strideWorkspaceArg.get_stride  s    }rV   c                    U R                   $ rn   )r   ro   s    rT   r   WorkspaceArg.get_name  s    rV   c                    g)NFr   ro   s    rT   get_is_pinnedWorkspaceArg.get_is_pinned	  s    rV   c                    / $ rn   r   ro   s    rT   get_inputs_that_alias_output)WorkspaceArg.get_inputs_that_alias_output  s    	rV   r   N)
workspace_)r   r~   r{   r~   )r   r   r   r   r{   r   )r   r   r   r   r{   r   )r{   r   )r{   r   r   )r{   r@   )r{   r   )r{   list[sympy.Expr]r}   )r{   r   )r{   	list[str]) r   r   r   r   r   r   r   torchuint8r   r   r   r   r   r   r   get_device_or_errorr   r   r   r   r   get_output_specmaybe_get_output_specmaybe_get_layoutr   r   r   r   r   r   r   r   rV   rT   r   r      s    	   OJE;$7 7 
 

 
 
 
 
 %/
 ! ! !O&!rV   r   c                  (    \ rS rSrSS jrSS jrSrg)TritonScratchWorkspacei  c                    Xl         X l        g rn   )r   _generate_dtype_str)ri   r   generate_dtype_strs      rT   __init__TritonScratchWorkspace.__init__  s    	#5 rV   c                "    U R                  5       $ rn   )r   ro   s    rT   r   )TritonScratchWorkspace.generate_dtype_str  s    ''))rV   )r   r   N)r   intr   Callable[..., str]r}   )r   r   r   r   r   r   r   r   rV   rT   r   r     s    6*rV   r   c                  x    \ rS rSr% S\S'   S\S'   S\S'   \R                  R                  rS\S'   S	r	S
\S'   Sr
g	)	TensorArgi  r~   rf   bufferr   r   r   offsetNOptional[str]alias_ofr   )r   r   r   r   r   r   r   r   r  r
  r   r   rV   rT   r  r    s.    
IKFJ%"Hm"rV   r  c                  >    \ rS rSr% S\S'   S\S'   \S	S j5       rSrg)
SizeArgi"  r~   rf   r   exprc                    g rn   r   ro   s    rT   r
  SizeArg.alias_of'  s    rV   r   Nr{   r	  )r   r   r   r   r   r   r
  r   r   rV   rT   r  r  "  s    
I
 rV   r  c                       \ rS rSr% S\S'   Srg)ConstexprArgi,  r~   rf   r   Nr   r   r   r   r   r   r   rV   rT   r  r  ,  s    
IrV   r  c                  >    \ rS rSr% S\S'   S\S'   S\S'   S\S'   S	rg
)TMADescriptorArgi1  r~   rf   api_typezOptional[list[sympy.Expr]]block_shapeOptional[torch.dtype]r   r   Nr  r   rV   rT   r  r  1  s    
IM++  rV   r  c                  F    \ rS rSr% S\S'   S\S'   SrS\S'   SrS\S	'   S
rg)DeviceCodegeni9  SchedulingConstructor
schedulingWrapperConstructorwrapper_codegenNOptional[WrapperConstructor]cpp_wrapper_codegenfx_wrapper_codegenr   )r   r   r   r   r   r   r!  r   r   rV   rT   r  r  9  s&    %%''8<5<7;4;rV   r  zdict[str, DeviceCodegen]device_codegensc                      \ rS rSrSS jrSS jrSS jrSS jrSS jrSS jr	SS jr
SS	 jrSS
 jrSS jrSS jrSS jrSS jrSS jrSS jrSS jr S       SS jjrSrg)DeviceOpOverridesiF  c                    [         ern   r   ri   rf   s     rT   import_get_raw_stream_as*DeviceOpOverrides.import_get_raw_stream_asG      !!rV   c                    [         ern   r&  ri   
device_idxs     rT   
set_deviceDeviceOpOverrides.set_deviceJ  r*  rV   c                    [         ern   r&  ro   s    rT   synchronizeDeviceOpOverrides.synchronizeM  r*  rV   c                    [         ern   r&  r,  s     rT   device_guardDeviceOpOverrides.device_guardP  r*  rV   c                    [         ern   r&  ro   s    rT   cpp_device_guard"DeviceOpOverrides.cpp_device_guardS  r*  rV   c                    [         ern   r&  ro   s    rT   cpp_aoti_device_guard'DeviceOpOverrides.cpp_aoti_device_guardV  r*  rV   c                    [         ern   r&  ro   s    rT   cpp_stream_guard"DeviceOpOverrides.cpp_stream_guardY  r*  rV   c                    [         ern   r&  ro   s    rT   cpp_aoti_stream_guard'DeviceOpOverrides.cpp_aoti_stream_guard\  r*  rV   c                    [         ern   r&  ro   s    rT   cpp_getStreamFromExternal+DeviceOpOverrides.cpp_getStreamFromExternal_  r*  rV   c                    [         ern   r&  ro   s    rT   kernel_headerDeviceOpOverrides.kernel_headerb  r*  rV   c                    [         ern   r&  ro   s    rT   kernel_driverDeviceOpOverrides.kernel_drivere  r*  rV   c                    [         ern   r&  ro   s    rT   cpp_stream_type!DeviceOpOverrides.cpp_stream_typeh  r*  rV   c                    [         ern   r&  ro   s    rT   aoti_get_stream!DeviceOpOverrides.aoti_get_streamk  r*  rV   c                    [         ern   r&  ro   s    rT   cpp_kernel_type!DeviceOpOverrides.cpp_kernel_typen  r*  rV   c                    [         ern   r&  ro   s    rT   cpp_device_ptr DeviceOpOverrides.cpp_device_ptrq  r*  rV   c                    [         ern   r&  ro   s    rT   tma_descriptor_helpers(DeviceOpOverrides.tma_descriptor_helperst  r*  rV   Nc                    [         ern   r&  )ri   idx	workspacer   s       rT   cpp_scratchDeviceOpOverrides.cpp_scratchw  s
     "!rV   r   rf   r~   r{   r~   )r-  r  r{   r~   r}   rn   )r[  r  r\  r   r   r	  r{   zOptional[tuple[list[str], str]])r   r   r   r   r(  r.  r1  r4  r7  r:  r=  r@  rC  rF  rI  rL  rO  rR  rU  rX  r]  r   r   rV   rT   r$  r$  F  s    """""""""""""""" TX""#9"CP"	(" "rV   r$  zdict[str, DeviceOpOverrides]device_op_overrides_dictz*dict[str, Optional[CustomGraphModulePass]]custom_backend_passesz!dict[str, Optional[ConfigModule]]custom_backend_codegen_configsc                    [        UUUU5      [        U '   U[        U '   U(       a1  [        U[        5      (       a	  U[
        Ld   SU< S[
        < 35       eU[        U '   g )Nzdevice_custom_config=z: cannot be the same as the default inductor config config=)r  r"  ra  
isinstancer   r   rb  )r   device_schedulingdevice_wrapper_codegendevice_cpp_wrapper_codegendevice_fx_wrapper_codegendevice_custom_passdevice_custom_configs          rT   register_backend_for_devicerk    sy     ,"!	OF %7&!+\::$F2	
 %#%%`Y_Xab		
3
 .B"6*rV   c                      \ rS rSr\" 5       r\" 5       r\" 5       r\" 5       r\" 5       r	\" 5       r
\" 5       r\" 5       r\" 5       r\" 5       rSrg)BackendFeaturei  r   N)r   r   r   r   r   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANSORTTUPLE_REDUCTIONPREFER_STORE_LOOP_ORDERTRITON_TEMPLATESREDUCE_TO_SINGLE_ELEMENTr   r   rV   rT   rm  rm    sL    fGIfO $6D6DfO"fv#vrV   rm  c                \   U c
  [        5       $ [        5         [        U [        R                  5      (       a  U R
                  nO=[        U [        5      (       d   [        U 5      5       eU n[        R                  " U5      n [        U5      nU(       d   eU" S 5      nUR                  U 5      $ rn   )	r   init_backend_registrationrd  r   r   typer~   get_scheduling_for_deviceget_backend_features)r   device_typescheduling_ctorr  s       rT   r|  r|    s     ~|&%,,''kk&#&&4V4&k*/<O? &J**622rV   c                L    [        U[        5      (       d   eU[        U 5      ;   $ )zSee also V.graph.has_feature)rd  rm  r|  )r   features     rT   has_backend_featurer    s(     g~....*6222rV   c                @    U [         ;   a  [         U    R                  $ S $ rn   )r"  r  r   s    rT   r{  r{    s     17?1J?6"--TPTTrV   c                    U [         ;   a;  [         U    nU(       a  UR                  $ U(       a  UR                  $ UR                  $ g rn   )r"  r!  r   r  )r   cpp_wrapper
fx_wrapperwrapper_codegen_objs       rT   get_wrapper_codegen_for_devicer    sD      -<V-D&999&:::&666rV   c                ,    [         R                  U 5      $ rn   )ra  getr   s    rT   "get_custom_backend_pass_for_devicer    s     $$V,,rV   c                ,    [         R                  U 5      $ rn   )rb  r  r   s    rT   $get_custom_backend_config_for_devicer    s    )--f55rV   c                 >  ^^ SSK Jn   SSKJn  SSKJn  SSKJn  SSKJ	n  SSK
Jn  SSKJn  SS	KJn  SS
KJn  SSKJn	  SSKJn
  SSKJn  SSKJn  [5        S5      c;  U UU
US.m[7        SU4S jU[8        R:                  R<                  (       a  UOUU5        [5        S5      c  UUUS.m[7        SU4S jUUU5        [5        S5      c  [7        SU
UUU5        [5        S5      c  [7        SUUUU5        [5        S5      c  [7        SU
U	UU5        [>        R@                  RC                  5       nUS:w  a\  [5        U5      cO  SSK"J#n   U" S5      nU" S5      nU" S5      nU" S5      nU(       a   U(       a  U(       a  [7        UUUUU5        gggggg! [H         a     gf = f) z
Register the backend for different devices, including the scheduling
for kernel code generation and the host side wrapper code generation.
rH   )CppScheduling)CppWrapperCpu)CppWrapperCpuArrayRef)CppWrapperGpu)CppWrapperMps)CUDACombinedScheduling)HalideScheduling)MetalScheduling)PallasScheduling)PythonWrapperMtia)TritonSchedulingrI   )WrapperFxCodegencpuN)cpphalidetritonpallasc                6   > T[         R                     " U 5      $ rn   )r   cpu_backend)r  cpu_backendss    rT   <lambda>+init_backend_registration.<locals>.<lambda>  s    |F,>,>?
KrV   cuda)r  r  r  c                6   > T[         R                     " U 5      $ rn   )r   cuda_backend)r  cuda_backendss    rT   r  r  !  s    }V-@-@A*MrV   xpumpsmtiaprivateuseoner   )_get_custom_mod_func
SchedulingrJ   CppWrapperCodegenr  )%r  r  cpp_wrapper_cpur  cpp_wrapper_cpu_array_refr  cpp_wrapper_gpur  cpp_wrapper_mpsr  cuda_combined_schedulingr  r  r  r  r  r  r  python_wrapper_mtiar  r  r  wrapperrJ   wrapper_fxirr  r{  rk  r   aot_inductorallow_stack_allocationr   _C_get_privateuse1_backend_name torch.utils.backend_registrationr  RuntimeError)r  r  r  r  r  r  r  r  r  r  r  rJ   r  private_backendr  re  r  r   r!  r  r  s                      @@rT   ry  ry    s    #.@..@($(6(-. '/ &&&	
 	$K ""99 "	
 !(0 -&&

 	$M 	
 !'/# 	
 !'/# 	
 !(0#	
 hh<<>O?*%o6>I	 4\ B23IJO"67J"K!56H!I _9L+#%#'& :M_  ? 	+$  		s   AF 
FFc                J    SSK Jn  / U Q[        XR                  U5      5      P$ )Nr   )FlexibleLayout)r   r  r,   contiguous_strides)index
index_varssizesr  s       rT   index_prevent_reorderingr  Z  s*    
 $ UUTIj*K*KE*RSTTrV   c                    U[         U '   g rn   )r`  )r   device_op_overridess     rT   register_device_op_overridesr  e  s     (;V$rV   c                    [        U [        5      (       d   [        U 5      5       e[        (       d  SSKJnJn  SSKJn  SSK	Jn  SSK
Jn  [        U    $ )NrH   )cpu_device_op_overridesmps_device_op_overrides)r  )rd  r~   rz  r`   r  r  r  r  r  r  )r   r  r  r  mtia_op_overridesxpu_op_overridess         rT   get_device_op_overridesr  k  s;    fc""0DL0"##F-B@#F++rV   zdict[torch.dtype, torch.dtype]DTYPE_TO_COMPUTATION_DTYPEc                   U [        5       ;   a  [        R                  $ U S;   a  SU;   a  US   $ US   $ U S;   a  [        R                  $ U S;   a  [        R                  $ U S:X  a  SU;   a  US   $ US   $ U S:X  a  SU;   a  US   $ US   $ U S	;   a$  US   n[
        R                  R                  U5      $ U S
:X  a  SU;   a  US   $ US   $ g)zC
Given op name and a list of input dtypes, deduce the output dtype
)to_dtype
index_exprr   )randrandn)	get_index	randint64	load_seed	reductionrH   constant)loadstorestore_reductionto_dtype_bitcastN)r%   r   r   floatint64r7   r   r   )op_namert   kwargsbuf_names       rT   deduce_output_dtype_by_namer    s
    +-zz	  
 #*V"3vgAbA	  
 {{	  

 {{	K	")V"3vg@a@	J	")V"3vgAbA	  

 7ww  **	&	&")V"3vgAbArV   CSEVariableTypec                *   [        5       n[        R                  R                  (       a(  US:X  a"  U R	                  SU S[        U5       S35        g [        R                  R                  (       a  US:X  a  SSKJnJ	n  [        X5      (       d   [        U5      5       eU[        R                  :X  a"  UR                  (       a  SU S	3nO2S
U SU S3nO(SU S3nUR                  (       a  SU S3nSU SXR    S3nU R	                  SU S35        g g g )Nr  tl.static_assert(z
.dtype == r   r  rH   )CppCSEVariableDTYPE_TO_CPPzIsVecMaskType<decltype(z	)>::valuezstd::is_same_v<decltype(z$), bool> || std::is_same_v<decltype(z), int>z	decltype(z	typename z::value_typezstd::is_same_v<r   >zstatic_assert(z);)r(   r   test_configsruntime_triton_dtype_assert	writeliner/   static_cpp_dtype_assert	cpp_utilsr  r  rd  rz  r   r   is_vec)r  varr   backendr  r  
is_same_dt
c_var_types           rT   check_dtyper    s    "#G667h;N,SEK<N;OqQR				4	4E9I;#..9S	9.EJJzz6se9E
  8u<`ad`eelm
$SE+Jzz(LA
*:,b9L8MQOJ>*R89! :J	4rV   c                    [        5       nUc   e[        R                  R                  (       aO  US:X  aH  [	        U5      S:w  a  SR                  S U 5       5      OUS    S3nU R                  SU SU S	35        g g g )
Nr  rH   r   c              3  8   #    U  H  n[        U5      v   M     g 7frn   r~   ).0ds     rT   	<genexpr>check_shape.<locals>.<genexpr>  s     ,ec!ffes   r   ,r  z.shape == ()))r(   r   r  runtime_triton_shape_assertlenr   r  )r  r  shaper  	shape_strs        rT   check_shaper    s     "#G667h;N03E
aDII,e,,azQR^ 	 	,SEYKrJK	 <O6rV   c                l    [        5       nUS:X  a$  SnU R                  SU SU SU SU SU S35        g g )	Nr  zNaN or Inf foundztl.device_assert((z == ) & (z != float('inf')) & (z != float('-inf')), 'z'))r(   r  )r  r  r  rS   s       rT   	check_nanr    sS    !#G(  T#eC58McURghkgllno	
 rV   c                  x    \ rS rSrSS jrSS jrSS jrSS jrSS jrSS jr	\
SS j5       r\
SS	 j5       rS
rg)DataTypePropagationi  c                    Xl         SUR                  R                  0U l        UR                  R                  5        H  u  p#UR                  U R                  U'   M      g Nroot)body
root_blockr   graphs	subblocksitems)ri   r  kvs       rT   r   DataTypePropagation.__init__  sL    	DOO))B
 NN((*DAWWDKKN +rV   c                   UR                   nU Vs/ s HB  n[        U[        R                  R                  5      (       d  M.  UR
                  S:w  d  M@  UPMD     nn[        U5      S:X  a  g [        S U 5       5      nU(       d  g [        R                  " [        R                  U Vs/ s H)  o3R                  [        R                     R                  PM+     sn5      $ s  snf s  snf )Nplaceholderr   c              3     #    U  HR  n[         R                  UR                  ;   =(       a)    UR                  [         R                     R                  S Lv   MT     g 7frn   )OptimizationContextkeymetar   )r  ns     rT   r  BDataTypePropagation.deduce_node_dtype_by_inputs.<locals>.<genexpr>  sR      )
 !  ##qvv- B*../55TAB s   AA)all_input_nodesrd  r   fxNodeopr  all	functoolsreducepromote_typesr  r  r  r   )ri   nodeinputsr  input_nodesall_input_nodes_propagateds         rT   deduce_node_dtype_by_inputs/DataTypePropagation.deduce_node_dtype_by_inputs  s    %%
!Auxx}}!=A!$$-BWAv 	 
 {q %( )
 !)
 &
"
 *<GHKqVV'++,22KH
 	

  Is   -C C C )0C%
c                l    U R                   UR                     nU R                  U5      nU(       d   eU$ rn   )r  targetpropagate_graph)ri   r%  	sub_graphr   s       rT   deduce_node_dtype_by_subgraph1DataTypePropagation.deduce_node_dtype_by_subgraph  s0    KK,	$$Y/urV   c                   UR                   S:X  a  g UR                  S:X  a  [        UR                  5      S:w  a  g UR                  [        R
                  L aY  UR                  S   n[        U[        R                  R                  5      (       d   [        U5      5       eU R                  U5      $ [        UR                  [        5      (       d   [        UR                  5      5       eUR                  R                  S5      (       a  U R                  U5      $ [        UR                  /UR                  Q70 UR                   D6=n b  U$ U R#                  U5      $ )Nr  outputrH   r   masked_subblock)r   r,  r  rt   operatorgetitemrd  r   r  r  rz  deduce_node_dtyper~   
startswithr/  r  r  r)  )ri   r%  node_argoutput_dtypes       rT   r6  %DataTypePropagation.deduce_node_dtype	  s$   77m#;;("s499~':;;(***yy|Hh66FXF6))(33$++s++>T$++->>+;;!!"34455d;; 8 ++ L
   //55rV   c                   UR                   (       d   eS nUR                    H  n[        R                  UR                  ;   a  UR                  [        R                     nO
[        5       nU R	                  U5      Ul        XCR                  [        R                  '   UR                  S:X  d  M  UR
                  nM     U$ )Nr2  )nodesr  r  r  r6  r   r,  )ri   r   graph_dtyper%  opt_ctxs        rT   r-  #DataTypePropagation.propagate_graph&  s    {{{-1 KKD"&&$))3))$7$;$;<-/ 2248GM18II)--.{{h&%mm   rV   c                >    U R                  U R                  S   5      $ r  )r-  r  ro   s    rT   	propagateDataTypePropagation.propagate8  s    ##DKK$788rV   c                .    U " U5      R                  5       $ rn   )rA  )clsr  s     rT   propagate_loopbody&DataTypePropagation.propagate_loopbody;  s    4y""$$rV   c                   SSK Jn  SSKJn  [	        X5      (       d   [        U5      5       e[	        UR                  U5      (       d   [        UR                  5      5       e[        R                  UR                  5      $ )Nr   rB   )rF   )		loop_bodyrC   	schedulerrF   rd  rz  _bodyr	  rE  )rD  r%  rC   rF   s       rT   propagate_scheduler_node,DataTypePropagation.propagate_scheduler_node?  s\    (-$..:T
:.$**h//Adjj1AA/"55djjAArV   )r  r  N)r  rC   r{   r|   )r%  torch.fx.Noder{   r  )r%  rM  r{   r   )r   ztorch.fx.Graphr{   r  )r{   r  )r  rC   r{   r  )r%  rF   r{   r  )r   r   r   r   r   r)  r/  r6  r-  rA  classmethodrE  rK  r   r   rV   rT   r	  r	    sJ    %
*6:$9 % % B BrV   r	  c                  X   ^  \ rS rSrSSS.       SU 4S jjjrSS	U 4S jjjrSrU =r$ )
r   iI  T)simplifypc                  > U(       ag  [        U[        R                  5      (       aH  [        [        R
                  S5      (       a)  [        R
                  R                  R                  U5      n[        TU ]%  U5      $ )Nsizevars)
rd  r   Exprhasattrr7   r   rS  rP  superdoprint)ri   r  rP  rQ  	__class__s       rT   rW  PythonPrinter.doprintJ  sQ     
444*9U9U77##,,T2Dwt$$rV   c                   > [        U[        R                  5      (       a  SU R                  U5       S3$ [        TU ]  XU5      $ N(r   )rd  r   Mod_printrV  parenthesize)ri   itemlevelstrictrX  s       rT   r_  PythonPrinter.parenthesizeR  sA    dEII&& t{{4()++7'V<<rV   r   )r  r   rP  r   rQ  r   r{   r~   )F)r`  r   ra  r  rb  r   r{   r~   )r   r   r   r   rW  r_  r   __classcell__rX  s   @rT   r   r   I  s<    48D%%-1%=A%	% %= =rV   r   c                     \ rS rSrSr\SS j5       r\SS j5       r\SS j5       r\SS j5       r	\SS j5       r
\SS j5       r\SS	 j5       r\SS
 j5       r\SS j5       r\SS j5       r\SS j5       r\SS j5       r\SS j5       r\SS j5       r\SS j5       r\SS j5       r\SS j5       r\SS j5       rSrg)OpDecompositionsi[  z
Decomposes inductor ops
c                    U $ rn   r   )rh   s    rT   identityOpDecompositions.identity`  s	     rV   c                v    [         R                  " [         R                  " S[        R                  5      U 5      $ NrH   )r2   truedivr  r   int32xs    rT   
reciprocalOpDecompositions.reciprocale  s"    {{3<<5;;7;;rV   c                .    [         R                  " X 5      $ rn   )r2   mulro  s    rT   squareOpDecompositions.squarei  s    wwq}rV   c                    [         R                  " [         R                  " S[        R                  5      [         R
                  " U 5      5      $ rl  )r2   subr  r   float32erfro  s    rT   erfcOpDecompositions.erfcm  s*    wws||Au}}5swwqzBBrV   c                    [         R                  " [         R                  " [         R                  " U 5      5      [         R                  " U 5      5      $ rn   )r2   rt  expru  r{  ro  s    rT   erfcxOpDecompositions.erfcxq  s,    wwswwszz!}-sxx{;;rV   c                    [         R                  " [         R                  " U 5      [         R                  " S[        R
                  5      5      $ rl  )r2   rx  r~  r  r   ry  ro  s    rT   expm1OpDecompositions.expm1u  s*    wwswwqz3<<5==#ABBrV   c           	         [         R                  " [         R                  " U 5      [         R                  " S[        R                  " S5      -  [
        R                  5      5      $ )NrH   
   r2   rt  logr  mathr   ry  ro  s    rT   log10OpDecompositions.log10y  s7    wwswwqz3<<DHHRL0@%--#PQQrV   c           	         [         R                  " [         R                  " U 5      [         R                  " S[        R                  " S5      -  [
        R                  5      5      $ )NrH   r   r  ro  s    rT   log2OpDecompositions.log2}  s6    wwswwqz3<<DHHQK#OPPrV   c           
         [         R                  " [         R                  " U [         R                  " [        R
                  " S5      [        R                  5      5      5      $ )Nr   )r2   r~  rt  r  r  r  r   ry  ro  s    rT   exp2OpDecompositions.exp2  s3    wwswwq#,,txx{EMM"JKLLrV   c           	         [         R                  " [         R                  " U [         R                  " S[        R
                  5      5      5      $ rl  )r2   r  addr  r   rn  ro  s    rT   log1pOpDecompositions.log1p  s+    wwswwq#,,q%++">?@@rV   c                    [         R                  " S[        R                  5      n[         R                  " U[         R
                  " U[         R                  " [         R                  " U 5      5      5      5      $ rl  )r2   r  r   rn  rm  r  r~  neg)rp  ones     rT   sigmoidOpDecompositions.sigmoid  sC    ll1ekk*{{3SWWSWWQZ-@ ABBrV   c                v    [         R                  " U [         R                  " S[        R                  5      5      $ Nr   )r2   r   r  r   rn  ro  s    rT   reluOpDecompositions.relu  s"    {{1cll1ekk:;;rV   c                X    [         R                  " [         R                  " X5      U5      $ rn   )r2   r  rt  rp  yzs      rT   fmaOpDecompositions.fma  s     wwswwq}a((rV   c                X    [         R                  " [         R                  " U 5      U5      $ rn   )r2   r  floorr   r   s     rT   floor_to_intOpDecompositions.floor_to_int      ||CIIaL%00rV   c                X    [         R                  " [         R                  " U 5      U5      $ rn   )r2   r  ceilr  s     rT   ceil_to_intOpDecompositions.ceil_to_int  s    ||CHHQK//rV   c                X    [         R                  " [         R                  " U 5      U5      $ rn   )r2   r  truncr  s     rT   trunc_to_intOpDecompositions.trunc_to_int  r  rV   c           	        [         R                  " X5      n[         R                  " [         R                  " U[         R                  " S[
        R                  5      5      [         R                  " [         R                  " U5      [         R                  " U5      5      5      n[         R                  " U[         R                  " X!5      U5      $ r  )
r2   modand_ner  r   rn  signbitwherer  )r   r   rconds       rT   	remainderOpDecompositions.remainder  su    GGAMxxFF1cll1ekk23FF3;;q>3;;q>2
 yyswwq}a00rV   c                X    [         R                  " [         R                  " U 5      U5      $ rn   )r2   r  roundr  s     rT   round_to_intOpDecompositions.round_to_int  r  rV   r   N)rh   OpVarTr{   r  rp  r  r{   r  )rp  r  r  r  r  r  r{   r  )r   r  r   r   r{   r  r   r  r   r  r{   r  )r   r   r   r   r   r   ri  rq  ru  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   rV   rT   rg  rg  [  s}      < <   C C < < C C R R Q Q M M A A C C < < ) ) 1 1 0 0 1 1 1 1 1 1rV   rg  z[a-z0-9_.]+|\([^)]*\)|)flagsc                    U S   S:w  d  [        U 5      S:  a  gSn[        U SS  5       H8  u  p#US:X  a  US-  nOUS:X  a  US-  nUS:X  d  M$  U[        U 5      S-
  :w  d  M8    g   US:X  d   eg)Nr   r\  r   FrH   r   T)r  	enumerate)stringr   ichars       rT   _all_in_parensr    s    ayC3v;?EVABZ(3;QJES[QJEA:!s6{Q. ) A::rV   c                     \ rS rSr\S$S j5       r\S%S j5       r\S&S j5       r\S'S j5       r\S(S j5       r	\S(S j5       r
\S(S j5       r\S(S	 j5       r\S(S
 j5       r\S)S j5       r\S*S j5       r  S+         S,S jjr          S-S jrS.S jr S/         S0S jjrS1S jrS2S jr          S3S jr        S4S jr          S5S jr  S6               S7S jjrS8S jrS(S jrS\R:                  SSS.             S9S jjrS:S jrS;S jr \S<S  j5       r!\"S=S! j5       r#\"S>S" j5       r$S#r%g)?OpOverridesi  c                    [        U [        5      (       d*  [        R                  U 5      (       d  [	        U 5      (       a  U $ SU  S3$ r[  )rd  CSEVariable_RE_PAREN_NOT_NEEDED	fullmatchr  )r  s    rT   parenOpOverrides.paren  sB     v{++#--f55f%% M6(!}rV   c                    [        U 5      $ rn   )repr)rh   r   s     rT   r  OpOverrides.constant  s    E{rV   c                2    S[         R                  U 5       3$ )N~r  r  ro  s    rT   bitwise_notOpOverrides.bitwise_not  s    ;$$Q'())rV   c                2    [         R                  U 5       S3$ )Nz == 0r  )r   s    rT   logical_notOpOverrides.logical_not  s    ##A&'u--rV   c                \    [         R                  U 5       S[         R                  U5       3$ )Nz & r  rp  r  s     rT   bitwise_andOpOverrides.bitwise_and  +    ##A&'s;+<+<Q+?*@AArV   c                \    [         R                  U 5       S[         R                  U5       3$ )Nz | r  r  s     rT   
bitwise_orOpOverrides.bitwise_or  r  rV   c                \    [         R                  U 5       S[         R                  U5       3$ )Nz ^ r  r  s     rT   bitwise_xorOpOverrides.bitwise_xor  r  rV   c                \    [         R                  U 5       S[         R                  U5       3$ )Nz << r  r  s     rT   bitwise_left_shiftOpOverrides.bitwise_left_shift  +    ##A&'tK,=,=a,@+ABBrV   c                \    [         R                  U 5       S[         R                  U5       3$ )Nz >> r  r  s     rT   bitwise_right_shiftOpOverrides.bitwise_right_shift  r  rV   c                .    [         R                  " X5      $ rn   )r2   rm  r   s     rT   int_truedivOpOverrides.int_truediv  s    
 {{1  rV   c                X    [         R                  " U [        R                  " U5      5      $ rn   )r2   r  r   Integer)rf   r  s     rT   r  OpOverrides.load_seed  s    xxemmF344rV   Tc                *    [        [        U5      5      $ rn   )r-   r~   )ri   r  r   checkwrap_negs        rT   indirect_indexingOpOverrides.indirect_indexing  s     "#c(++rV   c                D    [        [        U 5      R                   S35      e)Nz,: check_bounds should be handled by CSEProxyr   rz  r   ri   r  r   loweruppers        rT   check_boundsOpOverrides.check_bounds  s'     "Dz""##OP
 	
rV   c                D    [        [        U 5      R                   S35      e)Nz$: load should be handled by CSEProxyr  ri   rf   r  s      rT   r  OpOverrides.load  s%    !Dz""##GH
 	
rV   Nc                D    [        [        U 5      R                   S35      e)Nz%: store should be handled by CSEProxyr  ri   rf   r  rh   r]   s        rT   r  OpOverrides.store  s'     "Dz""##HI
 	
rV   c                D    [        [        U 5      R                   S35      eNz3: device_assert_async should be handled by CSEProxyr  ri   r  rS   s      rT   device_assert_asyncOpOverrides.device_assert_async  %    !Dz""##VW
 	
rV   c                D    [        [        U 5      R                   S35      e)Nz/: store_reduction should be handled by CSEProxyr  ri   rf   r  rh   s       rT   r  OpOverrides.store_reduction  s%    !Dz""##RS
 	
rV   c                D    [        [        U 5      R                   S35      e)Nz): reduction should be handled by CSEProxyr  ri   r   	src_dtypereduction_typerh   s        rT   r  OpOverrides.reduction"  s'     "Dz""##LM
 	
rV   c                D    [        [        U 5      R                   S35      e)Nz$: scan should be handled by CSEProxyr  ri   dtypes
combine_fnvaluess       rT   scanOpOverrides.scan-  s'     "Dz""##GH
 	
rV   c                D    [        [        U 5      R                   S35      e)Nz$: sort should be handled by CSEProxyr  ri   r  r  stable
descendings        rT   sortOpOverrides.sort:  s'     "Dz""##GH
 	
rV   c                D    [        [        U 5      R                   S35      e)Nz): bucketize should be handled by CSEProxyr  ri   r  
boundariesboundary_indicesindexing_dtyperightsortersorter_indicess           rT   	bucketizeOpOverrides.bucketizeE  s'     "Dz""##LM
 	
rV   c                D    [        [        U 5      R                   S35      e)Nz2: halide_clamp only implemented for Halide backendr  )ri   rh   r   r  s       rT   halide_clampOpOverrides.halide_clampS  s%    !Dz""##UV
 	
rV   c                D    [        [        U 5      R                   S35      e)Nz): dot only implemented for Triton backendr  )ri   rp  r  s      rT   dotOpOverrides.dotX  s%    !Dz""##LM
 	
rV   rH   )constraintsr   is_purepackc               D    [        [        U 5      R                   S35      e)Nz<: inline_asm_elementwise only implemented for Triton backendr  )ri   asmr2  r   r3  r4  r&  s          rT   inline_asm_elementwise"OpOverrides.inline_asm_elementwise]  s'     "Dz""##_`
 	
rV   c                D    [        [        U 5      R                   S35      e)Nz.: ops.output should not appear at codegen timeAssertionErrorrz  r   rs   s     rT   r2  OpOverrides.outputj  s%    Dz""##QR
 	
rV   c                D    [        [        U 5      R                   S35      e)Nz3: ops.placeholder should not appear at codegen timer:  ri   r  s     rT   r  OpOverrides.placeholdero  s%    Dz""##VW
 	
rV   c                4   ^  SU 4S jjnT Ul         SUl        U$ )Nc                J   > [        [        U 5      R                   ST 35      e)Nz does not implement ops.r  )ri   rt   r  rf   s      rT   unimplemented1OpOverrides._unimplemented.<locals>.unimplementedv  s*    %:&&''?vF rV   T)ri   r  rt   r	   r  r	   r{   r  )r   is_unimplemented)rf   rB  s   ` rT   _unimplementedOpOverrides._unimplementedt  s     	
 "&)-&rV   c                    [        XS 5      n[        [        US 5      nU(       + =(       d    X#:H  =(       d    [        USS5      $ )NrD  F)getattrr3   )rD  rf   fn
default_fns       rT   _is_unimplementedOpOverrides._is_unimplemented  s;    S%Zt4
vS)SWR9KU-SSrV   c                `   US;   d   U5       e[         R                  5        H  u  p#[        X15      nUc5  U R                  U5      (       a  [	        XU R                  U5      5        MF  MH  X R                  ;  d   SU SU R                   35       eX$l        [	        X[        U5      5        M     g )N)r  r  cppvecr  r  zmultiple definitions of z on )	pointwise_overrides_datar  rH  rK  setattrrE  __dict__r   r   )rD  r,  funcnamedataimpls        rT   _initialize_pointwise_overrides+OpOverrides._initialize_pointwise_overrides  s    EEMvME6<<>NH4(D|((22C3+=+=h+GH 3  ||3 .xjS\\NK3 !)|D'9: ?rV   r   )r  r  r{   r  )rh   zUnion[bool, float, int]r   r   r{   r  r  )r   r  r{   r  )rp  r  r  r  r{   r  r  )rf   r~   r  r  r{   r  TT)
r  r  r   Union[sympy.Expr, int]r  r   r  r   r{   sympy.Symbol
r  r   r   r   r  r   r  r   r{   r|   )rf   r~   r  r   r{   r  rn   )
rf   r~   r  r   rh   r  r]   r6   r{   r|   r  r  rS   r~   r{   r|   )rf   r~   r  r   rh   r  r{   r|   )
r   r   r  r   r  r5   rh   !Union[OpVarT, tuple[OpVarT, ...]]r{   r\  )r  tuple[torch.dtype, ...]r  zFCallable[[tuple[OpVarT, ...], tuple[OpVarT, ...]], tuple[OpVarT, ...]]r  tuple[OpVarT, ...]r{   r^  )
r  r]  r  r^  r  r   r  r   r{   r^  NN)r  r  r$  .tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]r%  r  r&  r   r'  r   r(   Optional[tuple[str, sympy.Expr]]r)  zOptional[OpVarT]r{   r  )rh   r  r   r   r  r   r{   r  )r&  r  r6  r~   r2  r	  r   r   r3  r   r4  r  r{   r  )rt   r  r{   r|   )r  r  r{   r  )rf   r~   r{   zCallable[..., OpVarT]rf   r~   r{   r   )r,  r~   r{   r|   )&r   r   r   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r  r  r   r*  r-  r0  r   ry  r7  r2  r  rE  rN  rK  rU  r   r   rV   rT   r  r    s1   	 	   * * . . B B B B B B C C C C ! ! 5 5 ,, %, 	,
 , 
,

&0
9=
FJ
	

 NR

 *
39
AJ
	




	
	
 	
 &		

 1	
 
+	

'


 #
 

	
'	
 #	
 		

 	
 
	
$ 48+/

 C
 !	

 $
 
 1
 )
 




 &*"]]

 
 #	

 
 
 
 





   T T
 ; ;rV   r  c                      \ rS rSr% S\S'   S\S'   SrS\S'   SrS\S	'   \R                  r	S
\S'   Sr
S\S'   SrS\S'   Srg)OverridesDatai  r~   rf   r  r  NzOptional[Callable[..., str]]r  rN  r   type_promotion_kindr  r  r   )r   r   r   r   r   r  rN  r   DEFAULTre  r  r  r   r   rV   rT   rd  rd    sQ    
I	+/F(/+/F(/'// 8  ,0F(/(,C	%,rV   rd  airy_aic                    SU  S3$ )Nzairy_ai_forward(r   r   ro  s    rT   r  r    s    (1-rV   special_airy_ai)re  r  rf   	bessel_j0c                    SU  S3$ )Nzbessel_j0_forward(r   r   ro  s    rT   r  r        *1#Q/rV   c                    SU  S3$ )Nzlibdevice.j0(r   r   ro  s    rT   r  r        =1-rV   special_bessel_j0)re  r  r  rf   	bessel_j1c                    SU  S3$ )Nzbessel_j1_forward(r   r   ro  s    rT   r  r    rl  rV   c                    SU  S3$ )Nzlibdevice.j1(r   r   ro  s    rT   r  r    rn  rV   special_bessel_j1	bessel_y0c                    SU  S3$ )Nzbessel_y0_forward(r   r   ro  s    rT   r  r    rl  rV   c                    SU  S3$ )Nzlibdevice.y0(r   r   ro  s    rT   r  r    rn  rV   special_bessel_y0	bessel_y1c                    SU  S3$ )Nzbessel_y1_forward(r   r   ro  s    rT   r  r    rl  rV   c                    SU  S3$ )Nzlibdevice.y1(r   r   ro  s    rT   r  r    rn  rV   special_bessel_y1digammac                    SU  S3$ )Nzcalc_digamma(r   r   ro  s    rT   r  r    s    aS*rV   c                    U  S3$ )Nz
.digamma()r   ro  s    rT   r  r    s    A3j)rV   )re  r  rN  rf   r  c                    SU  S3$ )Nzcalc_erfcx(r   r   ro  s    rT   r  r        A3a(rV   c                    SU  S3$ )Nzlibdevice.erfcx(r   r   ro  s    rT   r  r    s    +A3a0rV   special_erfcxr  c                    SU  SU SU S3$ )Nz	std::fma(r   r   r   r  s      rT   r  r    s    is"QCr!A6rV   c                    SU  SU SU S3$ )Nzfmadd(r   r   r   r  s      rT   r  r    s    s"QCr!A6rV   c                    SU  SU SU S3$ )Nzlibdevice.fma(r   r   r   r  s      rT   r  r    s    s"QCr!A>rV   )re  r  rN  r  rf   igammac                    SU  SU S3$ Nzcalc_igamma(r   r   r   r  s     rT   r  r        <s"QCq1rV   igammacc                    SU  SU S3$ Nzcalc_igammac(r   r   r   r  s     rT   r  r        =2aS2rV   gammaincc                    SU  SU S3$ r  r   r  s     rT   r  r    r  rV   special_gammainc	gammainccc                    SU  SU S3$ r  r   r  s     rT   r  r    r  rV   special_gammaincci0c                    SU  S3$ )Nzcalc_i0(r   r   ro  s    rT   r  r        1orV   c                    SU  S3$ Nzlibdevice.cyl_bessel_i0(r   r   ro  s    rT   r  r        3A3a8rV   c                    U  S3$ )Nz.i0()r   ro  s    rT   r  r    s
    A3erV   )re  r  r  rN  rf   i0ec                    SU  S3$ )Nz	calc_i0e(r   r   ro  s    rT   r  r        	!A&rV   c                    U  S3$ )Nz.i0e()r   ro  s    rT   r  r    s
    A3frV   special_i0ei1c                    SU  S3$ )Nzcalc_i1(r   r   ro  s    rT   r  r    r  rV   c                    SU  S3$ Nzlibdevice.cyl_bessel_i1(r   r   ro  s    rT   r  r    r  rV   
special_i1i1ec                    SU  S3$ )Nz	calc_i1e(r   r   ro  s    rT   r  r    r  rV   special_i1elog_ndtrc                    SU  S3$ )Nzcalc_log_ndtr(r   r   ro  s    rT   r  r  	  s    qc+rV   special_log_ndtrmodified_bessel_i0c                    SU  S3$ )Nzmodified_bessel_i0_forward(r   r   ro  s    rT   r  r        3A3a8rV   c                    SU  S3$ r  r   ro  s    rT   r  r    r  rV   special_modified_bessel_i0modified_bessel_i1c                    SU  S3$ )Nzmodified_bessel_i1_forward(r   r   ro  s    rT   r  r    r  rV   c                    SU  S3$ r  r   ro  s    rT   r  r    r  rV   special_modified_bessel_i1modified_bessel_k0c                    SU  S3$ )Nzmodified_bessel_k0_forward(r   r   ro  s    rT   r  r    r  rV   special_modified_bessel_k0modified_bessel_k1c                    SU  S3$ )Nzmodified_bessel_k1_forward(r   r   ro  s    rT   r  r     r  rV   special_modified_bessel_k1ndtrc                    SU  S3$ )Nz
calc_ndtr(r   r   ro  s    rT   r  r  &  s    
1#Q'rV   special_ndtrndtric                    SU  S3$ )Nzcalc_ndtri(r   r   ro  s    rT   r  r  +  r  rV   special_ndtri	polygammac                *    U  SU SU  SU SU SU  S3$ )Nz == 0 ? calc_digamma(z) : (z == 1 ? trigamma(z) : calc_polygamma(r   r  r   r  s     rT   r  r  0  s3    S%aSaS0A!DWXYWZZ\]^\__abrV   scaled_modified_bessel_k0c                    SU  S3$ )Nz"scaled_modified_bessel_k0_forward(r   r   ro  s    rT   r  r  8      :1#Q?rV   !special_scaled_modified_bessel_k0scaled_modified_bessel_k1c                    SU  S3$ )Nz"scaled_modified_bessel_k1_forward(r   r   ro  s    rT   r  r  =  r  rV   !special_scaled_modified_bessel_k1spherical_bessel_j0c                    SU  S3$ )Nzspherical_bessel_j0_forward(r   r   ro  s    rT   r  r  C  s    4QCq9rV   special_spherical_bessel_j0zetac                    SU  SU S3$ )Nzzeta(r   r   r   r  s     rT   r  r  H  s    52aS*rV   special_zetachebyshev_polynomial_tc                    SU  SU S3$ )Nzchebyshev_polynomial_t_forward(r   r   r   r  s     rT   r  r  M      :1#Rs!DrV   special_chebyshev_polynomial_tchebyshev_polynomial_uc                    SU  SU S3$ )Nzchebyshev_polynomial_u_forward(r   r   r   r  s     rT   r  r  R  r  rV   special_chebyshev_polynomial_uchebyshev_polynomial_vc                    SU  SU S3$ )Nzchebyshev_polynomial_v_forward(r   r   r   r  s     rT   r  r  W  r  rV   special_chebyshev_polynomial_vchebyshev_polynomial_wc                    SU  SU S3$ )Nzchebyshev_polynomial_w_forward(r   r   r   r  s     rT   r  r  \  r  rV   special_chebyshev_polynomial_wlegendre_polynomial_pc                    SU  SU S3$ )Nzlegendre_polynomial_p_forward(r   r   r   r  s     rT   r  r  a      9!BqcCrV   special_legendre_polynomial_pshifted_chebyshev_polynomial_tc                    SU  SU S3$ )Nz'shifted_chebyshev_polynomial_t_forward(r   r   r   r  s     rT   r  r  f      B1#Rs!LrV   &special_shifted_chebyshev_polynomial_tshifted_chebyshev_polynomial_uc                    SU  SU S3$ )Nz'shifted_chebyshev_polynomial_u_forward(r   r   r   r  s     rT   r  r  k  r  rV   &special_shifted_chebyshev_polynomial_ushifted_chebyshev_polynomial_vc                    SU  SU S3$ )Nz'shifted_chebyshev_polynomial_v_forward(r   r   r   r  s     rT   r  r  p  r  rV   &special_shifted_chebyshev_polynomial_vshifted_chebyshev_polynomial_wc                    SU  SU S3$ )Nz'shifted_chebyshev_polynomial_w_forward(r   r   r   r  s     rT   r  r  u  r  rV   &special_shifted_chebyshev_polynomial_whermite_polynomial_hc                    SU  SU S3$ )Nzhermite_polynomial_h_forward(r   r   r   r  s     rT   r  r  z  s    82aSBrV   special_hermite_polynomial_hhermite_polynomial_hec                    SU  SU S3$ )Nzhermite_polynomial_he_forward(r   r   r   r  s     rT   r  r    r  rV   special_hermite_polynomial_helaguerre_polynomial_lc                    SU  SU S3$ )Nzlaguerre_polynomial_l_forward(r   r   r   r  s     rT   r  r    r  rV   special_laguerre_polynomial_lzdict[str, OverridesData]rO  c                   ^  [        U 4S j[        R                  R                  [        R                  R                  [        R                  R
                  [        R                  R
                  4 5       5      $ )Nc              3  .   >#    U  H
  nTU;   v   M     g 7frn   r   )r  rp  rf   s     rT   r  $is_buffer_removed.<locals>.<genexpr>  s       
A 		
s   )anyr7   r   removed_bufferskernelinplaced_to_removerf   s   `rT   is_buffer_removedr    sU      GG##HH$$GG&&HH''	
  rV   c                  D   ^  \ rS rSrSrSU 4S jjrSS jrS	S jrSrU =r	$ )
DeferredLinei  zHA line that can be 'unwritten' by adding name to V.graph.removed_buffersc                ^   > [         TU ]  U5        Xl        [        U[        5      (       a   eg rn   )rV  r   rf   rd  r&   )ri   rf   linerX  s      rT   r   DeferredLine.__init__  s,    	d$455555rV   c                P    [        U R                  5      (       d  U R                  $ g rn   )r  rf   r  ro   s    rT   __call__DeferredLine.__call__  s     ++99rV   c                .    [        U R                  U5      $ rn   )r   rf   )ri   r  s     rT   	_new_lineDeferredLine._new_line  s    DIIt,,rV   r  )rf   r~   r  r~   r  )r  r~   r{   r   )
r   r   r   r   r   r   r  r  r   rd  re  s   @rT   r   r     s    R6

- -rV   r   c                  "    \ rS rSrSSS jjrSrg)BracesBufferi  c                L   ^ ^ [         R                  SUU 4S jj5       nU" 5       $ )Nc               3    >#    [        T5       H)  n TR                  S5        T=R                  S-  sl        M+     [        T* 5       H)  n T=R                  S-  sl        TR                  S5        M+     S v   [        T* 5       H)  n TR                  S5        T=R                  S-  sl        M+     [        T5       H)  n T=R                  S-  sl        TR                  S5        M+     g 7f)N{rH   })ranger  _indent)_r  ri   s    rT   ctx BracesBuffer.indent.<locals>.ctx  s     6]s#! # F7^!s# $ F7^s#! $ 6]!s# #s   C(C+)r{   Iterator[None])
contextlibcontextmanager)ri   r  r  s   `` rT   indentBracesBuffer.indent  s$    		"	"	$ 
#	$ urV   r   N)rH   )r  r  r{   z'contextlib.AbstractContextManager[None])r   r   r   r   r  r   r   rV   rT   r  r    s     rV   r  c                  *    \ rS rSr% S\S'   S\S'   Srg)InplacedBufferi  r~   r   r   other_namesr   Nr  r   rV   rT   r  r    s    OrV   r  c                  8    \ rS rSr% S\S'   SrS\S'   S
S jrSrg	)ArgNamei  r~   rf   Fr   is_constexprc                L    U R                    U R                  (       a  S 3$ S 3$ )Nz : tl.constexprr  )rf   r  ro   s    rT   	full_nameArgName.full_name  s*    ))$2C2C.LMMLMMrV   r   Nr}   )r   r   r   r   r   r  r!  r   r   rV   rT   r  r    s    
IL$NrV   r  c                      \ rS rSrSS jrSrg)
RemovedArgi  c                    g)NREMOVEDr   ro   s    rT   __str__RemovedArg.__str__  s    rV   r   Nr}   )r   r   r   r   r'  r   r   rV   rT   r$  r$    s    rV   r$  c                  8   \ rS rSr\        SS j5       rSS jrSS jr\SS j5       rSS jr	SS jr
SS jr\R                  4       SS	 jjrSS
 jrS S jrS!S jrS"S jrS#S jrS$S jrS%S jr S&   S'S jjr  S(S jrS)S jrS*S jrS+S jrSrg),
KernelArgsi  c                    UR                  U[        5      n[        U[        5      (       a  U  [	        U5       3=X'   nU$ U$ rn   )r  r&  rd  r$  r  )r   odictrf   result
new_results        rT   _lookupKernelArgs._lookup  sE     */4)Afj))*0#e*'>>EK*rV   c                J    0 U l         0 U l        0 U l        0 U l        / U l        g rn   )input_buffersoutput_buffersinplace_buffersrS  workspace_argsro   s    rT   r   KernelArgs.__init__  s)    -/ACMO/124rV   c                    SR                  SR                  [        [        U R                  U R
                  U R                  U R                  /5      5      5      $ )NzKernelArgs({})r   )formatr   mapr  r2  r3  r4  rS  ro   s    rT   __repr__KernelArgs.__repr__  sS    &&II**++,,	

 	
rV   c                "    [        U [        5      $ rn   )rd  r$  r  s    rT   _buffer_is_marked_removed$KernelArgs._buffer_is_marked_removed  s     $
++rV   c                T   [         R                  R                  (       a3  [         R                  R                  R                  R	                  X5      nU[         R                  R
                  ;  d   U5       eXR                  ;   a  [        [        U R                  U   5      $ XR                  ;   a'  [        [        U R                  U   5      R                  $ UR                  S5      (       a  U R                  SU R                  U5      $ U R                  SU R                  U5      $ )Nseedin_ptr)r7   r   rI  mutation_real_namer  r  r3  r
   r~   r4  r  r   r7  r/  r2  r'  s     rT   inputKernelArgs.input  s    7777$$77;;DGD1772228D82&&&T00677'''(<(<T(BCNNN??6""<<(:(:DAA||Hd&8&8$??rV   c                   [         R                  R                  (       a3  [         R                  R                  R                  R	                  X5      nU[         R                  R
                  ;  d   U5       eXR                  ;   a'  [        [        U R                  U   5      R                  $ U R                  SU R                  U5      $ )Nout_ptr)r7   r   rI  rB  r  r  r4  r
   r  r   r/  r3  r'  s     rT   r2  KernelArgs.output  s    7777$$77;;DGD1772228D82'''(<(<T(BCNNN||It':':DAArV   c                ,   U[         R                  R                  ;   a)  [         R                  R                  R                  U5        X R                  ;  d   U5       eXR                  ;   aP  U R                  U   n[        U[        5      (       a   eUR                  R                  U5        X0R                  U'   g U R                  R                  5        Vs/ s H  n[        U[        5      (       a  M  UPM     nnU R                  R                  5        Vs/ s H  n[        U[        5      (       d  M  UPM     nn[        [        U5      5      [        U5      -   n[        SU 3X/5      nX0R                  U'   X0R                  U'   g s  snf s  snf )N
in_out_ptr)r7   r   unaligned_buffersr  r4  rd  r$  r  appendr  r  r0   r  )ri   
input_nameoutput_namebufvalalive_buffersr  inplace_buffer_idxs           rT   make_inplaceKernelArgs.make_inplace  sh   222GG%%))+6"6"66CC6---&&z2C!#z2222OO"";/03  -  //6688C!#z2 8    //6688Cc:. 8  
 "%VM%:!;c/>R!R /01)C 03  ,03  -!
s   F6FF7Fc                ~   [        U[        R                  U5      [        R                  R                  5       [         R                  5       US9n[        U R                  5       H  u  pV[         R                  Xd5      (       aI  UR                  n[         R                  Xd5      U R                  U'   UR                  UR                  U4s  $ UR                  UR                  :w  a  UR                  UR                  :w  a  M   U5       e   U R                  R                  U5        UR                  UR                  S4$ )a  
Allocate or extend a workspace buffer of nelem elements.

This function manages the allocation of a workspace buffer. It either creates
a new WorkspaceArg or extends an existing one.

Note:
- Calling this function will in-place mutate the args by adding or updating
a WorkspaceArg.
- The codegen for generating the Python argdefs and call_defs will check
this field and allocate the buffer accordingly.
- A new argument "ws_ptr" will be present in the generated code.

Args:
    nelem (sympy.Expr): The number of elements to allocate.
    zero_fill (bool): Whether to initialize the buffer to zero.
    dtype (torch.dtype): the dtype of the workspace tensor

Returns:
    Tuple[str, str, int]: A tuple containing:
        - "ws_ptr": A string identifier for the workspace pointer.
        - "workspace_{i}": agraph level unique identifier for
            the workspace tensor.
        - offset: An integer representing the item offset in the workspace.
)r   r   r   r   r   r   )r   r   r   r7   r   get_current_device_or_throwr   r  r5  r   r   r   r   r   rK  )ri   nelemr   r   argr  existing_argr  s           rT   r\  KernelArgs.workspace+  s
   8 '11)<77668#//1
  ))<)<=OA$$\77%++)5):):<)M##A&#..0G0GOO''3>>9 ++s~~= >  > 	""3'~~s~~q00rV   c           
        [         R                  R                  5       n[        U[        R
                  [        R                  SSUR                   SUR                   3US9nU R                   H,  nUR                  UR                  :X  d  M  X4:X  a  M&   X445       e   U R                  R                  U5        UR                  $ )a  
Lazily allocate a graph-wide semaphores buffer with at least min_size.  This is a single buffer shared by
all kernels and zero initialized once at graph start.  Each kernel must leave the buffer zeroed on exit.

Warning: multiple calls to this function will return the same buffer.

Args:
    min_size: the number of int32 semaphores required

Returns:
    name of the semaphores buffer
sem_ptrsemaphores_r  )r   r   r   r   r   r   )r7   r   rU  r   r   r   r   uint32rz  r  r5  r   rK  )ri   min_sizecurrent_devicerW  rX  s        rT   
semaphoresKernelArgs.semaphoresZ  s     <<>'66,, $^%8%8$9>;O;O:PQ!
 !//L&&#..8*?S,??* 0 	""3'~~rV   c                z  ^ [        U[        5      (       d   [        U5      U45       e[        R                  " U5      nX R
                  ;   a  U R
                  U   $ TU R
                  R                  5       ;   a1  T [        U4S jU R
                  R                  5        5       5       3mTU R
                  U'   T$ )Nc              3  V   >#    U  H  oR                  T5      (       d  M  S v   M      g7f)rH   N)r7  )r  r  rf   s     rT   r  )KernelArgs.seed_offset.<locals>.<genexpr>~  s     U(>1,,tBTQQ(>s   )	))rd  r  rz  r   r  rS  r  sum)ri   rf   rh   s    ` rT   seed_offsetKernelArgs.seed_offsetv  s    %%%;UU';;%e$MM!==''4==''))&U(<(<(>UUVW   $erV   c                    [        U[        R                  5      (       d   [        U5      U45       eUR                  S:X  a  SU R
                  U'   gU R                  SU R
                  U5      $ )Nr@  ks)rd  r   Symbolrz  rf   rS  r/  r'  s     rT   r   KernelArgs.size  s[    $--AT
D/AA-99"(DMM$||D$--66rV   c                    [        U R                  R                  5       U R                  R                  5       U R                  R                  5       5      $ rn   )r   r2  keysr3  rS  ro   s    rT   
call_namesKernelArgs.call_names  sA    ##%t':':'?'?'A4==CUCUCW
 	
rV   c                &   U R                   R                  US5      nUb!  [        U[        5      (       d  UR                  $ U R
                  R                  US5      nUb  [        U[        5      (       d  U$ U R                  R                  US5      $ )z+
Returns inner name of a given outer name.
N)r4  r  rd  r$  r   r3  r2  )ri   rf   inplacedrM  s       rT   arg_nameKernelArgs.arg_name  s     ''++D$7
8Z(H(H&&&))--dD9":k:+N+N!!%%dD11rV   c                    U$ rn   r   )ri   rN  r   s      rT   wrap_ptr_argKernelArgs.wrap_ptr_arg  s    
rV   c                    [        U5      $ rn   r  )ri   r   s     rT   wrap_size_argKernelArgs.wrap_size_arg  s    4yrV   Nc                   SSK Jn  Uc  SSK Jn  Un/ n/ n/ n[        U R                  R                  5       5       H  n[        U[        5      (       a  M  UR                  S   nUR                  n	[        R                  R                  U5      n
X   nUR                  U SU	 35        UR                  U R                  X5      5        UR                  U S35        M     U R                  R!                  5        H  u  pXR                  ;   a  M  [        R                  R                  U5      n
X   nUR                  SU SU	 35        UR                  U R                  X5      5        UR                  SU S35        M     U R"                  R!                  5        H  u  pXR                  ;   d  [        U[        5      (       a  M+  [        R                  R                  U5      n
X   nUR                  U SU 35        UR                  U R                  X5      5        UR                  U S35        M     U R$                  R!                  5        H  u  p[        U[&        R(                  5      (       aE  [+        U[,        R.                  5      (       a&  UR                  SU	 35        UR                  S	5        O+UR                  SU S
U	 35        UR                  SU 35        UR                  U R1                  U5      5        [        R                  R2                  (       d  M  [        R                  R2                  R5                  U5        GM     U R6                  (       a   S5       eXTU4$ )NrH   )
INDEX_TYPE)r  r  z* *zconst zconst float zconst float zWorkspace not supported on CPU )r  r{  r  r0   r4  r  rd  r$  r  r   r7   r   r   rK  ru  r2  r  r3  rS  r   rj  r   r   UNBACKED_FLOATrx  wrapper_codeensure_size_computedr5  )ri   dtype_to_cpp_typer{  r  	call_argsarg_defs	arg_typesrq  outerinnerr   	cpp_dtypemaybe_inners                rT   cpp_argdefsKernelArgs.cpp_argdefs  s    	*$/ ,		t33::<=H(J//((,E''EGG%%e,E)0IOOykE734T..u<=	{!_- > !..446LE,,,GG%%e,E)0IOOfYKr%9:T..u<=vi[23 7 #'"5"5";";"=E,,,
;
0S0SGG%%e,E)0IOOykK=9:T..u<=	{!_- #> !MM//1LE%..>++4 4 ,ug 67  /&AeW =>  6*!67T//67ww###$$99%@ 2 &&I(II&I--rV   c                   / n/ n/ n/ n[        U R                  R                  5       5       H  n[        U[        5      (       a  M  UR                  [        UR                  5      5        UR                  UR                  S   5        UR                  [        R                  R                  UR                  S   5      5        UR                  [        UR                  UR                  S   [        R                  R                  UR                  S   5      S95        M     [        U R                  R                  5       U R                   R                  5       5       H  u  pgX`R                  ;   d  [        U[        5      (       a  M+  UR                  [        U5      5        UR                  U5        UR                  [        R                  R                  U5      5        UR                  [        UU[        R                  R                  U5      S95        M     U R"                  R                  5        H  u  pgUR                  [        U5      5        UR                  U5        UR                  [%        U5      5        UR                  ['        Xv5      5        [        R                  R(                  (       d  M  [        R                  R(                  R+                  U5        M     U R,                   Hn  nUR                  [        UR                  5      5        UR                  UR.                  5        UR                  U5        UR                  UR0                  5        Mp     XXC4$ )Nr  )rf   r  r   )r0   r4  r  rd  r$  rK  r  r   r  r7   r   r   r  r   r2  r  r3  rS  rz  r  r  r  r5  r   r   )	ri   r  r  r  precompile_argsrq  r  r  rW  s	            rT   python_argdefsKernelArgs.python_argdefs  s    #%!	!	/1t33::<=H(J//OOGH$7$789X11"56QWW..x/C/CB/GHI""!,,#//3''++H,@,@,DE > "$$&%%'
LE
 ,,,
5*0M0MOOGEN+U#QWW..u56"" ''++E2
" !MM//1LEOOGEN+U#T%[)""75#89ww###$$99%@ 2 &&COOGCNN34S^^,""3'SYY'	 '
 O>>rV   c              #     #    [        U R                  R                  5       5       H  n[        U[        5      (       a  M  UR
                   H  nU[        R                  R                  ;   d  U[        R                  R                  ;   a  MA  X R                  ;   a  U R                  U   UR                  4v   X R                  ;   d  M~  [        [        U R                  U   5      UR                  4v   M     M     g 7frn   )r0   r4  r  rd  r$  r  r7   r   r  r  r2  r   r3  r
   r~   )ri   rq  others      rT   aliasesKernelArgs.aliases
  s     t33::<=H(J//!--QWW777 ; ;;...,,U3X5H5HHH///sD$7$7$>?ATATTT . >s   CC>
4C>c                    [        U R                  R                  U[        5      [        5      =(       a.    [        U R
                  R                  U[        5      [        5      $ rn   )rd  r3  r  r&  r$  r4  r'  s     rT   
is_removedKernelArgs.is_removed  sK    ##D'2J
 N--11$@*M	NrV   c                   [        5       n[        U R                  R                  5       5       H8  n[	        U[
        5      (       a  M  UR                  UR                  S   5        M:     U R                  R                  5        H<  u  p4X0R                  ;   d  [	        U[
        5      (       a  M+  UR                  U5        M>     U$ )Nr  )
r   r0   r4  r  rd  r$  r  r  r3  r  )ri   	live_outsrq  r  r  s        rT   live_output_buffersKernelArgs.live_output_buffers!  s    %/\	t33::<=H(J//MM(..r23 > !//557LE,,,
5*0M0MMM%  8 rV   )r4  r2  r3  rS  r5  )r   r~   r,  z6Union[dict[_T, Union[str, RemovedArg]], dict[_T, str]]rf   rK   r{   r~   rz   r}   )rf   r	   r{   r   r_  )rL  r~   rM  r~   r{   r|   )rV  r   r   r   r   r   r{   ztuple[str, str, int])r^  r   r{   r~   )rf   r~   rh   r  r{   r~   )rf   rY  r{   r~   )r{   zIterator[str])rf   r~   r{   r	  )rN  r~   r   r   r{   r~   )r   
SymbolLiker{   r~   rn   )r  z Optional[dict[torch.dtype, str]]r{   z&tuple[list[str], list[str], list[str]])r{   z?tuple[list[ArgName], list[str], list[KernelArgType], list[Any]])r{   zIterator[tuple[str, str]]rb  )r{   zOrderedSet[str])r   r   r   r   r   r/  r   r:  r=  rC  r2  rR  r   r   r\  r`  rf  r   rn  rr  ru  rx  r  r  r  r  r  r   r   rV   rT   r*  r*    s    		E	 	 
		 	5
 , ,
@B4: HM{{-1-1,0-19D-1	-1^87


2 EI4.!A4.	/4.l1?	H1?fUN
rV   r*  c                  x   ^  \ rS rSrSr  S
       SU 4S jjjrSS jrSS jrSS jrSS jr	SS jr
S	rU =r$ )r  i.  a4  A CSEVariable is just a name for an expression but it is useful to be able to annotate them on a backend dependent basis.
To do so, the backends can simply overload `Kernel.create_cse_var`
The "CSEVariable.update_on_args" method gives you a hook for annotations
See example of TritonCSEVariable in triton.py
c                   > [         TU ]  5         [        U[        5      (       d   [	        U5      5       eXl        X l        SU l        X0l        X@l	        g rl  )
rV  r   rd  r   rz  rf   bounds	use_countr   r  )ri   rf   r  r   r  rX  s        rT   r   CSEVariable.__init__5  sG     	&+..<V<.	

rV   c                    U R                   $ rn   r  ro   s    rT   r'  CSEVariable.__str__D  s    yyrV   c                ,    [        U R                  5      $ rn   )hashrf   ro   s    rT   __hash__CSEVariable.__hash__G  s    DIIrV   c                b    [        U[        5      =(       a    UR                  U R                  :H  $ rn   )rd  r  rf   )ri   r  s     rT   __eq__CSEVariable.__eq__J  s!    %-I%**		2IIrV   c                    g rn   r   )ri   rf   rt   r  s       rT   update_on_argsCSEVariable.update_on_argsM  s    rV   c                P    U R                   R                   SU R                  < S3$ r[  )rX  r   rf   ro   s    rT   r:  CSEVariable.__repr__P  s$    ..))*!DII=::rV   )r  r   rf   r  r  r_  )rf   r~   r  ValueRanges[Any]r   r  r  rG   r}   )r{   r  )r  objectr{   r   )rf   r~   rt   r	   r  r	   r{   r|   )r   r   r   r   r   r   r'  r  r  r  r:  r   rd  re  s   @rT   r  r  .  s_     (, $ ! %	
  J; ;rV   r  AugmentedKeyT)default)boundr  .c                  x   \ rS rSrSr       S             SS jjrSS jrSS jrSS jrSS jr	SS	 jr
SS
 jrSS jrSS jr\R                  " 5       SSSSS.               SS jjr\R                  " 5       SS4       SS jjr\R                  " 5       SS4         SS jjrSrg)CSEi_  z Common subexpression eliminationNc                    Xl         X l        0 U l        X0l        U=(       d    0 U l        U=(       d    0 U l        U=(       d    [        R                  " 5       U l        [        5       U l
        U=(       d    0 U l        g rn   )r   r^   _cachename_prefixstore_cachereduction_cache	itertoolsr   iter_buffer_idsr   invalidated_storesvarname_map)ri   r   r^   r  iter_buffersr  r  r  s           rT   r   CSE.__init__b  sg     FH&ALARPR!r 	 6B5VY__EV3=<7B7HbrV   c                L   / U R                   R                  5       Q H4  u  p#X1;  d  M  U R                   U	 U R                  R                  U5        M6     U(       a<  U R                  R                  5        VVs0 s H  u  pEXQ;   d  M  XE_M     snnU l        g 0 U l        g s  snnf rn   )r  r  r  r  r  )ri   	keep_varsrf   tmpr  r  s         rT   
invalidateCSE.invalidatez  s    44++1134ID#$$T*''++D1 5 ,0KK,=,=,?R,?DA1>414,?RDKDK Ss   9B B c           
         [        U 5      " U R                  U R                  U R                  U R                  U R
                  U R                  U R                  S9$ )N)r   r^   r  r  r  r  r  )rz  r   r^   r  r  r  r  r  ro   s    rT   clone	CSE.clone  sN    Dz;;;;((--(((( 00
 	
rV   c                    U R                  5       n[        U R                  5      Ul        [        U R                  5      Ul        [        U R                  5      Ul        U$ )zNReturn a copy of using ScopedDict so changes to *_cache aren't visible in self)r  r+   r  r  r  )ri   new_cses     rT   scoped_copyCSE.scoped_copy  sH    **,#DKK0",T-A-A"B()9)9:rV   c                "    [        [        U5      $ )z@Override this method to augment cache key with backend specifics)r
   r  ri   	cache_keys     rT   augment_keyCSE.augment_key  s    M9--rV   c                >    X R                   U R                  U5      '   g rn   r  r  )ri   r  rO  s      rT   putCSE.put  s    36D$$Y/0rV   c                >    U R                  U5      U R                  ;   $ rn   )r  r  r  s     rT   containsCSE.contains  s    	*dkk99rV   c                X    U R                   R                  U R                  U5      S 5      $ rn   )r  r  r  r  s     rT   try_getCSE.try_get  s"    {{t//	:DAArV   c                >    U R                   U R                  U5         $ rn   r  r  s     rT   r  CSE.get  s    {{4++I677rV   T)r  rg   
assignmentr   r  c          	     p   [        U[        5      (       a  UR                  nU(       d	  U(       d   e[        U[        5      (       aE  UR                  R                  U5      Ul        U=R                  S-  sl        [        [        U5      $ [        U[        5      (       a  UR                  5       nO;[        U[        5      (       a  UR                  nO[        U[        5      (       d   eUnU R                  U5      n	Uc	  U(       d  SnU	(       Gd  U R                  X6U5      n	U R!                  X5        U(       Ga  ["        R$                  R&                  (       a(  ["        R$                  R&                  R)                  USS9  [        U[        5      (       aU  U(       a   UR+                  U R,                   U	 S35        UR/                  U5        UR+                  U R0                  5        U	$ [        U[        5      (       aR  U(       d   eUR+                  UR3                  U R,                   U	 SUR                   U R0                   35      5        U	$ U(       a   U R,                   U	 SU U R0                   3n
OU U R0                   3n
UR+                  U
5        U(       a[  [4        R6                  R8                  (       d  [4        R6                  R:                  (       a  Ub  [=        5       S:w  a  [?        XU5        U	$ U	R                  R                  U5      U	l        U	=R                  S-  sl        U	$ )NrH   r   T)	only_oncez =z = r  ) rd  r4   rh   r  r  tightenr  r
   r  r)   getvaluer&   r  r~   r  newvarr  r7   r  current_nodecodegen_originating_infor  r   splicer^   r  r   r  r  r  r(   r  )ri   r  r  r  rg   r  r   r  r  r  r  s              rT   generateCSE.generate  s{    dH%%::D
""dK(( ++--f5DKNNaN..n--I.//		IdC((((Ill9%= E++fU3CHHY$88((HH))BB$ C  dN33!((DKK=R)@AMM$'$$T[[1: 
9  &677%%:$$$++se3tyyk$++'WX4 
- ""&++se3tfT[[MJ"&}5$$T* #"//KK%22JJ!-/1U:#F7 
 ++F3CJMMQM
rV   c                    U R                    [        U R                  5       3n[        R                  R                  XAX#5      nXPR                  U'   U$ rn   )r  r   r  r7   r  create_cse_varr  )ri   r  r   r  var_namer  s         rT   r  
CSE.newvar  sN     &&'T-A-A(B'CDhh%%hE%("
rV   c                   ^ [         R                  " TU R                  ;  U4S j5        [        R                  R                  TX#U5      nXPR                  T'   U$ )Nc                    > ST  3$ )Nzduplicate name: r   r  s   rT   r  CSE.namedvar.<locals>.<lambda>  s    4DTF2KrV   )r   _check_valuer  r7   r  r  )ri   rf   r  r   r  r  s    `    rT   namedvarCSE.namedvar  sQ     	(((*K	
 hh%%dF5A!$
rV   )	r  r  r  r  r   r  r  r^   r  )r  r  r  NNNN)r   r~   r^   r~   r  r~   r  zOptional[itertools.count[int]]r  z.Optional[MutableMapping[str, CSEVariableType]]r  z<Optional[MutableMapping[ReductionCacheKey, CSEVariableType]]r  z$Optional[dict[str, CSEVariableType]])r  zOrderedSet[CSEVariable]r{   r|   r{   r   )r  r~   r{   r  )r  r~   rO  r  r{   r|   )r  r~   r{   r   )r  r~   r{   zOptional[CSEVariableType])r  r~   r{   r  )r  r)   r  zCUnion[str, CSEVariable, OpsValue, IndentedBuffer, DeferredLineBase]r  r  rg   r   r  r   r   r  r  rG   r{   r  )r  r  r   r  r  rG   r{   r  )
rf   r~   r  r  r   r  r  rG   r{   r  )r   r   r   r   r   r   r  r  r  r  r  r  r  r  r   unknownr  r  r  r   r   rV   rT   r  r  _  s   *  7;FJ <@II I 	I
 5I DI
I :I0	
.7:B8 $/#6#6#8'+ $KK RK
 !K K K %K K 
K^ $/#6#6#8'+ $		 	 %	 		
 
	 $/#6#6#8'+ $ ! %	
  
 rV   r  c                  @   ^  \ rS rSrSU 4S jjrSS jrSS jrSrU =r$ )	CodeGeni  c                V   > [         TU ]  5         [        R                  " 5       U l        g rn   )rV  r   r  	ExitStack
exit_stackri   rX  s    rT   r   CodeGen.__init__  s    $..0rV   c                :    U R                   R                  5         U $ rn   )r  	__enter__ro   s    rT   r  CodeGen.__enter__  s    !!#rV   c                <    U R                   R                  XU5        g rn   )r  __exit__)ri   exc_typeexc_valexc_tbs       rT   r  CodeGen.__exit__  s      F;rV   )r  rz   r  r   r	   r  r	   r  r	   r{   r|   )	r   r   r   r   r   r  r  r   rd  re  s   @rT   r  r    s    1< <rV   r  c                    ^  \ rS rSr% SrS\S'   SrS\S'   SrS\S'    S$     S%U 4S	 jjjr\	R                  S&S
 j5       r\	R                    S'       S(S jj5       rS)S jrS)S jrS*S jr S+         S,S jjrS-S jr          S.S jr          S/S jr        S0S jr          S1S jrS2S jr  S'               S3S jjr\S4S j5       r S+         S5S jjr          S6S jrS7S jrS8U 4S jjrS9U 4S jjrS:S jrS;S jr S;S jr!    S<S  jr"S=S! jr#S>S" jr$S#r%U =r&$ )?Kerneli  r  r~   newvar_prefixr^   Nz'Optional[Callable[[], OpsHandler[Any]]]	overridesc                H  > [         TU ]  5         U(       a  [        =R                  S-  sl        U=(       d
    [	        5       U l        [        5       U l        [        5       U l        [        5       U l	        SU l
        SU l        SU l        SU l        [        U R                  U R                   5      U l        [%        5       U l        [%        5       U l        S U l        S U l        S U l        S U l        [%        5       U l        [%        5       U l        0 U l        SU l        S U l        g )NrH   Fr   )rV  r   r    generated_kernel_countr*  rt   r)   loadscomputestoresatomic_add_foundnum_load	num_storenum_reductionr  r  r^   cser   must_keep_buffersstore_buffer_names
_load_mask_load_otherr  node_to_boundsr  r  inplace_update_buffersmin_elem_per_threadkernel_name)ri   rt   increase_kernel_countrX  s      rT   r   Kernel.__init__   s     	 **a/*(JL	#%
%'$& %.1$2D2Ddkk.R2<,3=<)-4859OS0:3=<
 79##$ *.rV   c              #     #    U R                   nXl         UR                  R                  5       R                  5       U l         S v   X l         g ! X l         f = f7frn   )r  rJ  r  
get_boundsr  )ri   r%  priors      rT   set_current_nodeKernel.set_current_nodeE  sH     !! "jj//1<<>	& %s   A AA AAAc              #    #    Uc  UnUS L =n(       a
  [        5       nU R                  nU R                  nU R                  nU R                  nXl        X l        X0l        UR                  5       U l         S v   XPl        X`l        Xpl        Xl        U(       a  U(       a   S5       eg g ! XPl        X`l        Xpl        Xl        U(       a  U(       a   S5       ef f = f7f)Nz$unexpected store inside swap_buffers)r)   r  r  r  r  r  )	ri   lbcbsbdisallow_storesr  r  r  r  s	            rT   swap_buffersKernel.swap_buffersO  s      :B Dj(?(!B

,,hh
??$		FJ"L KHEEEv2  J"L KHEEEv2 s   A2C5B( 9/C(0CCc                    [         ern   r&  r  s      rT   r  Kernel.loadm  r*  rV   c                    U R                   n U R                  U l         U R                  X5      X0l         $ ! X0l         f = f)z+A load the depends on an index we have read)r  r  r  )ri   rf   r  r  s       rT   indirect_loadKernel.indirect_loadp  s2    

	DJ99T)JJs   !6 >c                    [         ern   r&  r  s       rT   r  Kernel.store_reductionz  r*  rV   c                    [         ern   r&  r  s        rT   r  Kernel.store}  
     "!rV   c                D    [        [        U 5      R                   S35      er  r  r	  s      rT   r
  Kernel.device_assert_async  r  rV   c                    [         ern   r&  r  s        rT   r  Kernel.reduction  
     "!rV   c                    [         ern   r&  )ri   rf   r  rh   
extra_metas        rT   partial_accumulateKernel.partial_accumulate  r7  rV   c                    [         ern   r&  r  s       rT   r  Kernel.scan  s
     "!rV   c                    [         ern   r&  r  s        rT   r   Kernel.sort  r7  rV   c                    [         ern   r&  ro   s    rT   
var_rangesKernel.var_ranges  r*  rV   c                    [         e)z#
See [Note: Inductor bucketize op]
r&  r#  s           rT   r*  Kernel.bucketize  s
     "!rV   c                    [         ern   r&  ro   s    rT   assert_functionKernel.assert_function  s    !!rV   c           	        [        U[        5      (       a  [        U5      n[        U[        5      (       d   [        U5      5       eUb  [        U[        5      (       d   eUb  [        U[        5      (       d   eU(       a!  U(       a  SU SU SU SU S3	nU SU SU 3nO#U(       a
  U SU 3nUnOU(       d   eU SU 3nUnU(       a	  SU SU S3nU R                   SU SU S3$ )	Nr\  z <= r  z < r   z) | ~(z, "index out of bounds: z"))rd  r  r~   rz  rF  )ri   r  r  r  maskr  
cond_prints          rT   indirect_assertKernel.indirect_assert  s    c;''c(C#s##.T#Y.#}
5# 6 666}
5# 6 666U ugT#eC5E7!<D!7$se3ug6JWD&DJL5U#eW%DJtfF4&*D&&'q.FzlRTUUrV   c                    [         ern   r&  r  s        rT   r  Kernel.check_bounds  r2  rV   c                    [         ern   r&  r>  s     rT   index_to_strKernel.index_to_str  r*  rV   c           	     4  > [         TU ]  5         U R                  (       d   eU R                  R	                  [
        R                  " [        X R                  5       5      5      5        U R                  R	                  [
        R                  " U 5      5        U $ rn   )	rV  r  r  r  enter_contextr7   set_ops_handlerCSEProxyset_kernel_handlerr  s    rT   r  Kernel.__enter__  sj    ~~~%%ht^^-=>?	
 	%%a&:&:4&@ArV   c                F   > U R                  5         [        TU ]	  XU5        g rn   )remove_kernel_local_buffersrV  r  )ri   r   r  r  rX  s       rT   r  Kernel.__exit__  s    ((*F3rV   c                  ^^ [         R                  R                  mT(       d  g[        U4S jU R                   5       5      n[        5       mU R                   Hm  nX R
                  ;  d  M  X R                  R                  ;  d  M/  TR                  X!5      (       d  MG  U =R                  S-  sl	        TR                  U5        Mo     T H  nX R                  R                  ;   a  U R                  R                  U   n[        U[        5      (       a  ML  [        U4S jUR                   5       5      nU(       a  U R!                  U5        U R"                  R                  U5        M  U R%                  U5        M     g)z
Any buffers that are both created and have a last use in the
same kernel can be removed.

Note that V.graph.scheduler can be None when codegening triton template
kernels.
Nc              3     >#    U  H4  nUTR                   ;   d  M  TR                   U   R                  5       v   M6     g 7frn   )name_to_bufdefining_op_name)r  rN  rI  s     rT   r  5Kernel.remove_kernel_local_buffers.<locals>.<genexpr> 	  s?      &
.i+++ :I!!#&7799.s   ?$?rH   c              3  ,   >#    U  H	  oT;   v   M     g 7frn   r   )r  r  names_to_removes     rT   r  r_  	  s     K?a/1?s   )r7   r   rI  r   r  r  rt   r2  $can_buffer_be_removed_through_fusionr  r  r4  rd  r$  r!  r  remove_inplace_bufferr  remove_buffer)ri   fused_node_namesrf   rN  re   ra  rI  s        @@rT   rY  "Kernel.remove_kernel_local_buffers  s(    GG%%	% &
..&
 

 ,6<++D222		 7 77BB  !###D) , $Dyy000ii//5c:..K3??KK..t4''++D1""4( $rV   c                    [         R                  SU5        [        U R                  R                  U'   U R
                  R                  U5        g )Nzremove_buffer(%r))r  rR   r&  rt   r3  r  r  r'  s     rT   rd  Kernel.remove_buffer	  s;     			%t,)0		  &  &rV   c                    [         R                  SU5        [        U R                  R                  U'   U R
                  R                  U5        g )Nzremoving_inplace_buffer(%r))r  rR   r&  rt   r4  r  r  r'  s     rT   rc  Kernel.remove_inplace_buffer%	  s9    		/6*1		!!$'  &rV   c           
        [        U[        [        45      (       a!  U Vs/ s H  o R                  U5      PM     sn$ [        R
                  R                  R                  U5      n[        UR                  S S9nU Vs0 s Hm  n[        U[        R                  [        R                  [        R                  [        R                  45      (       d  MR  X R                   R#                  U5      _Mo     nn[%        X5      $ s  snf s  snf )Nc                    U R                   $ rn   r  )ss    rT   r  (Kernel.rename_indexing.<locals>.<lambda>2	  s    !&&rV   )r  )rd  listtuplerename_indexingr7   r   rS  rP  sortedfree_symbolsr   r   UNBACKED_INTSIZEPRECOMPUTED_SIZEr~  rt   r   r.   )ri   r  rp  sorted_symbolsreplacementss        rT   rq  Kernel.rename_indexing*	  s    
 edE]++5:;U((+U;;  ))%0 2 28HI $
#%%II))''	 !Ayy~~a  # 	 
 %..! <
s   D ?ADDc                    [        U0 UD6$ rn   )r  )ri   rt   r  s      rT   r  Kernel.create_cse_varB	  s    D+F++rV   c                \    Uc  gU R                   R                  UR                  5       5      $ )z3
Returns arg name of a given input or output node.
N)rt   rr  r   )ri   r%  s     rT   rr  Kernel.arg_nameE	  s'     <yy!!$--/22rV   )r  r  rt   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )NT)rt   zOptional[KernelArgs]r  r   r{   r|   )r%  rF   r{   r  r_  )r#  r)   r$  Optional[IndentedBuffer]r%  r~  r{   r  rf   r~   r  r   r{   r  rf   r~   r  r   rh   r  r{   r|   rn   
rf   r~   r  r   rh   r  r]   r6   r{   r|   r[  
r   r   r  r   r  r5   rh   +Union[CSEVariable, tuple[CSEVariable, ...]]r{   r  )
rf   r~   r  r5   rh   r  r9  dict[str, Any]r{   r|   r  r]  r  zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]r  tuple[CSEVariable, ...]r{   r  
r  r]  r  r  r  r   r  r   r{   r  )r{   zdict[sympy.Symbol, sympy.Expr]r  r  r$  r`  r%  r  r&  r   r'  r   r(  ra  r)  zOptional[CSEVariable]r{   r  r}   )
r  zUnion[CSEVariable, str]r  r	  r  r	  rI  z!Optional[Union[CSEVariable, str]]r{   r~   rZ  )r  r   r{   r~   r  r  rz   )rf   r~   r{   r|   )r  z;Union[list[sympy.Expr], tuple[sympy.Expr, ...], sympy.Expr]r{   r   )rt   r	   r  r	   r{   r  )r%  rA   r{   r	  )'r   r   r   r   r  r   r^   r  r   r  r  r   r'  r  r,  r  r  r
  r  r:  r  r   rA  r*  r   rF  rK  r  rP  r  r  rY  rd  rc  rq  r  rr  r   rd  re  s   @rT   r  r    s   M3FC9=I6= PT#/(#/HL#/	#/ #/J & &  (,'+	FF %F %	F
 
F F:"" SW"" *"3>"FO"	"


"" " &	"
 ;" 
5""" &" 	"
 #" 
""'"
" (" 
!""'" (" 	"
 " 
!"" 4804"" C" &	"
 $" " 1" ." 
" " " 37V$V V 	V
 0V 
V<""&0"9="FJ"	"
"4&)P''
/P/	/0,3 3rV   r  c                  @    \ rS rSr% SrS\S'   SrS\S'   SrS	\S
'   Srg)r  iN	  r>  zClassVar[str]r  Nr  r   r  r~   ops_namer   )	r   r   r   r   r  r   r   r  r   r   rV   rT   r  r  N	  s!    "C"#'E 'HcrV   r  c                 ^     SS K n U R                  U R                  S9$ ! [         a     g f = f)Nr   )	undefined)jinja2EnvironmentStrictUndefinedImportError)r  s    rT   
jinja2_envr  V	  s?    !!,, " 
 	
  s    
,,c                      \ rS rSrSr\ S       SS jj5       r\SS j5       r\    SS j5       rSSS jjr	\
SS j5       r\
SS	 j5       rSS
 jr      SS jrSS jrSrg)KernelTemplateib	  z[
Base class for defining kernel templates.

Children classes: TritonTemplate, CUDATemplate
c                    U R                  S5      n[        U5      S:  a"  USS   Vs/ s H  nSU-  U-  U-   PM     snUSS & SR                  U5      $ s  snf )NTrH   r}  r  )
splitlinesr  r   )sourcenum_indentsindents_spacinglinesr  s        rT   indent_except_first"KernelTemplate.indent_except_firsti	  sh     !!$'u:>INqrIR&4<E!"I wwu~s   Ac                    [        5       nUc  g [        R                  UR                  S'   SSKJn   UR                  U 5      $ ! U a  n " S SU5      nU" U5      UeS nAff = f)Nr  r   )TemplateSyntaxErrorc                  6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )IKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxErrori	  c                   > [         TU ]  UR                  UR                  UR                  UR
                  5        Xl        g rn   )rV  r   messagelinenorf   filenameoriginal_error)ri   r  rX  s     rT   r   RKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxError.__init__	  s<    G$&..&--&++&//	 +9'rV   c                X   SU R                    S3nUSU R                   S3-  n[        U R                  S5      (       a  U R                  R                  R                  S5      nUS-  n[        SU R                   S-
  5      n[        [        U5      U R                   S-   5      n[        X45       Hw  nXPR                   S-
  :X  aS  XS-    S	X%    S3-  n[        U R                  S
5      (       a'  USSU R                  R                  S-
  -  -   S-   -  nMf  Mh  XS-    SX%    S3-  nMy     U$ )NzError in template at line 
zError message: r  z	Context:
r   r   rH   z: --> columnz     r}  z^
z:     )r  r  rU  r  r  splitmaxminr  r  r  )ri   
error_infor  startendr  s         rT   r'  QKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxError.__str__	  s:   #=dkk]"!MJODLL>"DDJt22H== $ 3 3 : : @ @ F"l2
 #At{{Q 7!#e*dkkAo>!&u!2A KK!O3 *QvehZr.J J
#*4+>+>#I#I$.(/*-1D1D1K1Ka1O*P)Q*/)0%&J $J !+QvehZr.J J
 "3 &%rV   )r  )r  r  r{   r|   r}   )r   r   r   r   r   r'  r   rd  re  s   @rT   DetailedTemplateSyntaxErrorr  	  s    9& &rV   r  )r  r  r  filtersr  r  from_string)r  envr  er  s        rT   _template_from_string$KernelTemplate._template_from_stringt	  sj    l;-;-O-O)*.#	8??6**" !	8&.A &> .a0a7C!	8s   A A#
AA#c                0  ^^ [         R                  R                  m[        U [        [
        45      (       a0  U  Vs0 s H!  oR                  5       UR                  5       _M#     snmO U R                  5       U R                  5       0mSUU4S jjnU$ s  snf )Nc                @   > TR                  U 5      nUb  U$ T" U 5      $ rn   )r  )rf   r-  _get_dtype_reallookups     rT   r   1KernelTemplate._fake_get_dtype.<locals>.get_dtype	  s'    ZZ%F!"4((rV   )rf   r~   r{   r   )r7   r   r   rd  ro  rp  r   )	fake_outsrN  r   r  r  s      @@rT   _fake_get_dtypeKernelTemplate._fake_get_dtype	  s|     ''++i$//AJK#llncmmo5KF((*I,?,?,ABF	) 	)  Ls   (BNc                    Xl         X l        g rn   )rf   _hash)ri   rf   r  s      rT   r   KernelTemplate.__init__	  s    	
rV   c                    U R                   $ )z
entry point to override for templates to ensure a uid e.g. through a prefix

the purpose of this is that every KernelTemplate/ExternKernelChoice is unique
in the system, but reproducible e.g. restarting pytorch should yield the same id
r  ro   s    rT   uidKernelTemplate.uid	  s     yyrV   c                    U R                   $ )z
source hash for a Template.

Templates can optionally provide a src hash to make it easier to cache/validate that
a template has not changed from one version to another. Override this if that detection
is different for your specific Template
)r  ro   s    rT   src_hashKernelTemplate.src_hash	  s     zzrV   c                \    / nU R                   " U40 UD6nUc  [        U5      S:X  a  US   $ g)z
Maybe generates a new ChoiceCaller and returns it, or None if generation fails.

kwargs: Additional kwargs to be passed to self.generate() to generate a new ChoiceCaller.
NrH   r   )maybe_append_choicer  )ri   r  temp_choicesr-  s       rT   choice_or_noneKernelTemplate.choice_or_none	  s<     #%)),A&A>c,/14?"rV   c                     UR                  U R                  " S0 UD65        g! [         aN  n[        R	                  SU[        U 5      [        R                  5       [        R                  :  S9  Us SnA$ SnAff = f)z
Maybe generates a new ChoiceCaller and appends it into existing choices.
Returns None if success, otherwise returns the error.

choices: A list of ChoiceCallers.
kwargs: Additional kwargs to be passed to self.generate() to generate a new ChoiceCaller.
Nz3Cannot Append Choice: %s. KernelTemplate type is %s)
stack_infor   )	rK  r  r   r  inforz  getEffectiveLevelrP   INFO)ri   choicesr  r  s       rT   r  "KernelTemplate.maybe_append_choice	  sl    
	NN4==2623" 	HHET
002W\\A	   H	s   !$ 
A<AA71A<7A<c                    [         e)z=
Generates a ChoiceCaller instance from the given arguments.
r&  )ri   r  s     rT   r  KernelTemplate.generate	  s
    
 "!rV   )r  rf   )   )r  r~   r  r  r  r  r{   r~   )r  r~   r{   r	   )r  zUnion[list[Buffer], Buffer]r{   zCallable[[str], torch.dtype]rn   )rf   r~   r  r	  r{   r|   r}   )r{   zUnion[str, None])r  r	   r{   zOptional[ChoiceCaller])r  r   r  r	   r{   zOptional[NotImplementedError])r  r	   r{   r?   )r   r   r   r   r   r   r  r  r  r   r   r  r  r  r  r  r   r   rV   rT   r  r  b	  s     >?"%8;	  *8 *8X .	% "    
 ,/	&."rV   r  c                  d  ^  \ rS rSrSrS rSU 4S jjrSS jrSS jr  S         SS jjr	          SS jr
SS jrSS	 jr S         SS
 jjrSS jrSS jrSS jr          S S jr        S!S jr          S"S jr  S#               S$S jjrSrU =r$ )%rU  i	  zvA ops handler that proxies calls to `kernel` and its
handler and returns `CSEVariable`s with correct shape and dtype.
c                ^   > [         TU ]  5         SSKJn  U" 5       U l        Xl        X l        g )Nr   ValueRangeAnalysis)rV  r   r  r  vr_analysisr  parent_handler)ri   r  r  r  rX  s       rT   r   CSEProxy.__init__	  s'    /-/,rV   c           	     z  ^^^^
^^^^ U R                   " T/TQ70 TD6m[        U R                  T5      " T0 TD6n[        5       n[	        5       n[        5       m
[        UT5      nS mS mTS:X  a  T
S:X  a  UR                  mUR                  mO}TS:X  aU  T
S:X  aO  [        R                  R                  R                  R                  [        R                  S 5      R                  mS mO"T
S;   a  [        UT5      nU" T0 TD6mU" T0 TD6mT
S;   a  Tc   eSmSUU
UUUUUU4S jjn	[        R                   " X5      $ )	Nmaskedr  r  )r  r  r  )r  r  r   c                  > [        T	[        [        45      (       a  T	T
   OT	n[        T[        [        45      (       a2  [        T5      S:  a#  [        TS   [        [        45      (       a  TT
   OTnT
S-  m
[        U [        5      (       a,  TS:X  a  U R
                  c  Xl        U R                  c  X l        [        R                  R                  R                  [        R                  R                  U TT	TS9nUR                  TTT5        [        R                  R                  (       d  [        R                  R                   (       a)  Uc   e[#        [        R                  R                  X15        [        R                  R$                  (       a*  Tc   e['        [        R                  R                  UT5        [        R(                  (       a$  [+        [        R                  R                  U5        U$ )Nr   rH   r  r  r   r  )rd  ro  rp  r  r  r   r  r7   r  r  r  r  r  r   r  r  r  r  r   r  runtime_triton_nan_assertsr  )r  	var_dtype	var_shapecsevarrt   r  r  r  rf   r9  
output_idxoutput_shapes       rT   do_cse!CSEProxy._default.<locals>.do_cse(
  s   
 lT5M:: Z(!  lT5M::%)|Au>> Z( "  !OJ ![))e#'G77?'GXX\\**  "" + F !!$f5 ##??&&>> ,,,AHH,,f@"">>#///AHH,,flC00!((**F3MrV   )r  zUnion[str, CSEVariable]r{   r  )_bound_variablerH  r  r!   r$   r(   r   r  r7   interpreterr  r  r  r  r  pytreetree_map)ri   rf   rt   r  rh   dtype_handlershape_handlershape_opdtype_opr  r  r  r9  r  r  s    ```      @@@@@rT   _defaultCSEProxy._default
  s:   %%d<T<V<++T2DCFC2424%'=$/88 3 ;;L ;;LX'U"2==55::>>#''e   L00}d3H#T4V4L#T4V4L''+++
0	 0	d v--rV   c                >  ^	 SSK Jn  SSKJn  SSKJn  [        [        R                  U5      (       a  [        R                  " 5       $ [        [        R                  U5      (       a  [        R                  " 5       $ [        [        R                  [        5      (       a  [        R                  " 5       $ [        R                  R                  m	T	R                  U:X  a  U R                  R                  b  [        U R                  R                  [         5      (       d$   [#        U R                  R                  5      5       eU R                  R                  R%                  T	[        R                  " 5       5      $ [&        R(                  (       az  [+        XA5      (       aj  [-        U	4S jS 5       5      (       a  [        R                  " 5       $ U(       a   eS	S jn[/        [1        Xr5      5      n[3        U R4                  U5      " U6 $ [        R                  " 5       $ )
z
If the variable comes from an FX node, we forward the bound we have already computed
Else, if the variable when codegen'ing another op, we try to compute its bounds
r   r  )TritonTemplateKernelrH   )CUDATemplateKernelc              3  @   >#    U  H  oTR                   ;   v   M     g 7frn   )r,  )r  rm  fx_nodes     rT   r  +CSEProxy._bound_variable.<locals>.<genexpr>w
  s     V0U1&0Us   )set_indirectr  r  c                    [        U [        5      (       a  U R                  $ [        U [        R                  5      (       a  [        U 5      $ U $ rn   )rd  r  r  r   rT  r   ro  s    rT   arg_to_bound.CSEProxy._bound_variable.<locals>.arg_to_bound
  s8    a--88O5::..&q>)HrV   )rp  r	   r{   r	   )r  r  select_algorithmr  cuda.cuda_kernelr  rd  r7   r  r   r  r  r1   r  r,  r  dictrz  r  r   compute_all_boundsrU  r  ro  r9  rH  r  )
ri   rf   rt   r  r  r  r  r  
arg_boundsr  s
            @rT   r  CSEProxy._bound_variable\
  sy   
 	0;8ahh 455&&((ahh 233&&((amm[11&&((--,,>>T!dkk&@&@&Ldkk88$?? **B ? ;;--11';;N;N;PQQ&&73E+L+L V0UVVV"**,, : c,56J4++T2J??""$$rV   c                   [        U[        5      (       a  [        R                  " U5      n[        U[        R                  5      (       d   [        U5      U45       eUR                  R                  S:  Ga  U(       a  [        R                  " U[        R                  " U[        R                  5      5      nUR                  R                  S:  a.  [        R                  " US5      n[        R                  " XeU5      nOUn[         R"                  " 5       nUR                  [         R"                  " 5       :w  a  [        U[        R$                  5      (       a  UR                  [!        [&        * S5      -  n[!        UR                  U-   UR                  U-   5      nUR                  R                  S:  a!  UR                  [!        S[&        5      -  n	Xy-  nU R(                  R*                  R-                  U R(                  R.                  UUUR0                  UR2                  S9nU R4                  R7                  XU5      n
[9        U5      (       av  UR                  R                  S:  + n[        U[        R$                  5      (       + =(       d    UR                  R                  U:  + nU R(                  R;                  XX5        U
$ )Nr   r  r  )rd  r  r   r  rT  rz  r  r  r2   r  r  r   longr  ltr  r   r  Numberr   r  r  r  r  r   r  r  r  r'   r  )ri   r  r   r  r  stmr  
new_bounds
neg_boundspos	sympy_varassert_lowerassert_uppers                rT   r  CSEProxy.indirect_indexing
  s    dC  ==&D$

++?d4j$-??+ ::aggc3>>$

#CD::##q(QB))BS1C %,,.Jzz[0022z$7U7U !ZZ+vgr*BB
($$t+Z-=-=-D
 ::##q(**{1f'==C!+!1J++//**##!iiii + C ''99#UK	5!! #

 0 0A6L)$== 

  4BL KK$$YlQrV   c                :    U R                   R                  XX45      $ rn   )r  r  r  s        rT   r  CSEProxy.check_bounds
  s     {{''EAArV   c                   XR                   R                  R                  ;   a)  [        R                   R                  R                  U5        [        U[        R                  5      (       a  U R                   R                  X5      $ U R                   R                  R                  nX;   a  X1   $ U R                   R                  X5      nUR                  S:X  a  U R                   =R                  S-  sl        U$ rl  )r  r  r  r7   r  r  r   r   TMPr,  r  r  r  r  )ri   rf   r  r  outs        rT   r  CSEProxy.load
  s    ;;??555 HH&&**40udhh//;;,,T99kkoo11$$kkt+ ==AKK  A% 
rV   c                z   X R                   R                  R                  U'   U R                   R                  (       a~  U[        R
                  R                  ;   a_  U R                   R                  R                  U5      nUR                  5        H%  nX R                   R                  R                  U'   M'     g g g rn   )	r  r  r  r  r7   r   name_to_buffer
get_outputget_mutations)ri   rf   rh   rN  
other_names        rT   _update_store_cacheCSEProxy._update_store_cache
  s    ,1##D);;##0F0F(F++**55d;C!//1
:?++J7 2 )G#rV   c                &   U R                   R                  R                  U5        Uc  U R                  X5        U[        R
                  R                  ;  a:  U R                   R                  XX4S9  U R                   =R                  S-  sl        g g )N)r]   rH   )	r  r  r  r  r7   r   r  r  r  r  s        rT   r  CSEProxy.store
  sr     	&&**40<$$T1qww...KKd5<KK!!Q&! /rV   c                :    U R                   R                  X5        g rn   )r  r
  r	  s      rT   r
  CSEProxy.device_assert_async
  s    ''2rV   c                6    U R                   R                  " U6   g rn   )r  r:  rs   s     rT   r:  CSEProxy.partial_accumulate
  s    &&-rV   c                "   U R                   R                  R                  U5        U R                  X5        U[        R
                  R                  ;  a;  U R                   =R                  S-  sl        U R                   R                  XU5      $ g rl  )	r  r  r  r  r7   r   r  r  r  r  s       rT   r  CSEProxy.store_reduction
  sk    &&**40  -qww...KK!!Q&!;;..tEBB /rV   c                x    U R                   =R                  S-  sl        U R                   R                  XX45      $ rl  )r  r  r  r  s        rT   r  CSEProxy.reduction
  s0     	!!Q&!{{$$U~MMrV   c                :    U R                   R                  XU5      $ rn   )r  r  r  s       rT   r  CSEProxy.scan  s     {{F;;rV   c                :    U R                   R                  XX45      $ rn   )r  r   r  s        rT   r   CSEProxy.sort  s     {{CCrV   c           	     D    U R                   R                  UUUUUUU5      $ )a
  
[Note: Inductor bucketize op]

Inputs:
-------
values: the values to be bucketized.
boundaries: a tuple containing
  (a) the name of the boundaries tensor (which must be sorted, unless
  the sorting tensor is present),
  (b) the length of the tensor in the last dimension (i.e. the length of
  one set of boundaries),
  (c) the number of elements in the underlying storage (i.e. the length
  of the flattened tensor, ignoring striding), and
  (d) the stride of the tensor in the last dimension.
boundary_indices: indices into a flattened version of the boundaries
tensor, of the same size and shape as "values".  Each index points to
the first element in the set of boundaries to be used for the
corresponding value.
indexing_dtype: the dtype to use when indexing into the boundaries
tensor.  This must be int64 or int32.  This additionally specifies the
dtype of the return value.
right: see "Details" below.
sorter: an optional tuple containing
  (a) the name of an optional sorting tensor, used to access unsorted
  boundaries without reordering the boundaries tensor, and
  (b) the stride of the tensor in the last dimension.
The values in the sorting tensor are used as indices into the *last*
dimension of the boundaries tensor, with all other indices matching.
The size of the sorting and boundaries tensors must be equivalent.
sorter_indices: must be present if the sorting array is present; see
"boundary_indices" for the equivalent definition for the boundaries
tensor.

Output:
-------
The buckets each value belongs in, within a given set of boundaries.  0
indicates a position before the first boundary, and len(boundaries_set)
represents a position after the last boundary.

Details:
--------
Given a value and a set of boundaries, calculate the bucket that each
value belongs to.  This works differently in 1-D and N-D cases.

for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [0, 4, 4, 8], right=True
return =   [[ 0, 1, 1, 1], [1, 3, 3, 4]].

for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [[0, 4], [4, 8]], right=True
return =   [[ 0, 1, 1, 1], [0, 1, 1, 2]]

Note that in the N-D boundaries case, the shape of "values" and
"boundaries" must match in every dimension _except_ the last.

When right == False, bucket i refers to range (boundaries[i], boundaries[i+1]].
When right == True,  bucket i refers to range [boundaries[i], boundaries[i+1]).

Boundaries must be non-decreasing, or a sorter must be provided which
would re-index offsets in a non-decreasing order (e.g. the second output
of torch.sort(offsets)).  Otherwise, the result is undefined.
)r  r*  r#  s           rT   r*  CSEProxy.bucketize  s1    L {{$$
 	
rV   )r  r  r  )r  zKernel[Any]r  zOpsHandler[Any])rf   r~   rt   ztuple[Any, ...]r  r  r{   r	   )rf   r~   rt   r	   r  r	   r{   r  rW  )
r  r  r   rX  r  r   r  r   r{   rY  rZ  r  )rf   r~   rh   r  r{   r|   rn   r  r[  )rt   r	   r{   r|   r  r  r  r  r_  r  )r   r   r   r   r   rf   r   r  r  r  r  r  r  r  r
  r:  r  r  r  r   r*  r   rd  re  s   @rT   rU  rU  	  s    D-S.j.%h 55 %5 	5
 5 
5nBB&0B9=BFJB	B
"@ SW'' *'3>'FO'	'3.CNN N &	N
 ;N 
5N	<'	<
	< (	< 
!	<D'D (D 	D
 D 
!D  4804N
N
 CN
 &	N

 $N
 N
 1N
 .N
 
N
 N
rV   rU  )rS   r~   r{   r|   )NNNN)r   r~   re  r  rf  r  rg  r  rh  r  ri  Optional[CustomGraphModulePass]rj  Optional[ConfigModule]r{   r|   )r   Union[torch.device, str, None]r{   zOrderedSet[BackendFeature])r   r/  r  rm  r{   r   )r   r~   r{   zOptional[SchedulingConstructor])FF)r   r~   r  r   r  r   r{   r  )r   r~   r{   r-  )r   r~   r{   r.  rz   )r  Sequence[sympy.Expr]r  r0  r  r0  r{   r   )r   r~   r  r$  r{   r|   )r   r~   r{   r$  )r  r~   rt   r	   r  r	   r{   r  )r  r)   r  r  r   r   r{   r|   )r  r)   r  r  r  rG   r{   r|   )r  r)   r  r  r{   r|   )r  r~   r{   r   r   rb  )r{   r	   )
__future__r   rb   r  dataclassesenumr"  r  rP   r  r4  rd   rer`   abcr   r   r   r   r   typingr	   r
   r   r   r   r   r   r   typing_extensionsr   r   r   r   torch.fxtorch._prims_commonr   torch.utilsr   r  torch.utils._config_moduler   torch.utils._ordered_setr   torch.utils._sympy.numbersr   torch.utils._sympy.printersr   _PythonPrintertorch.utils._sympy.symbolr   r   r   torch.utils._sympy.value_rangesr   r   r  r   r    dtype_propagationr!   ops_handlerr"   r#   shape_propagationr$   utilsr%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   virtualizedr1   r2   r3   r4   r5   r6   r7   collections.abcr8   r9   r:   r;   r<   custom_graph_passr=   r   r>   r?   r@   rA   rH  rC   rI  rD   rE   rF   rG   r  rJ   rK   r  rz  r  r~   rj  r  r  _logginggetArtifactLoggerr   rN   	getLoggerr  rU   	dataclassrX   r   r   r   r   r  r  r  r  r  KernelArgTyper"  r   r$  r`  ra  rb  rk  rm  r|  r  r{  r  r  r  cachery  r  r  r  bfloat16r  float16r   ry  float64int8int16rn  r  r   uint16r]  uint64r  r  r  r  r  r	  rg  compile
IGNORECASEr  r  r  rd  r   INT_TO_FLOATrO  r  r   r  r  r  r$  r&  r*  r  r  r  rp  r   ReductionCacheKeyr  r  r  r  r  r  rU  r   s   0rT   <module>rZ     s1   "          	 	  #  	 	 	 ,    ? ) 3 / - G O O D  : ; :      LL$9>>$DD2-	B$hy&9%:N%JK23sELL()J F~~//*E!=
   >/		 /(C  Td= d dN* * # # #       ! ! ! < < < lIw8H,VW,.) .5" 5"p :< 6 ;DF A FDF  A F8 @D>B:>37BB,B /B !=	B
  <B 8B 1B 
B4
&T 
&3*33$3*35C3	3U
 @E"8<!-6 c cLUU$U  U 	U;;&7;	;	, 
NNEKK	MM5;;> JJMMMMJJKKKKKKKKLLLLLL

E 	
> : ,''' ' 	'T::!0:9D:	:2	L	L!0	L9G	L		L
aB aBH=N =$S1 S1l zz";2==Q  O;#%5z# O;d - - -  6: `6;HH-`6 ;HH/- 	`6 ;HH/- 	`6$ ;HH/- 	%`60 ;HH/- 	1`6< ;HH*)	=`6L ;HH(0	M`6X 	;HH66>	Y`6h ;HH1i`6r ;HH2s`6| ;HH1}`6F ;HH2 G`6P ;HH%8$Q`6^ 	;HH&%		_`6j ;HH%8	k`6v 	;HH&	w`6@ ;HH+A`6L %;HH88)	M`6X %;HH88)	Y`6d %;HH8)e`6n %;HH8)o`6z 
;HH'
{`6D ;HH(E`6N ;HHc	O`6^ ,;HH?0_`6h ,;HH?0i`6t &;HH9*u`6~ 
;HH*
`6H );HHD-I`6R );HHD-S`6\ );HHD-]`6f );HHD-g`6p (;HHC,q`6z $1;HHL5${`6D $1;HHL5$E`6N $1;HHL5$O`6X $1;HHL5$Y`6b ';HHB+c`6l (;HHC,m`6v (;HHC,w`6 2 `F	-# -"> *Z 
 N N N 
 ,X Xv
#; #;L 5+;Tk5c!1223	5l'/=0
1 l^
< 
<p3Wgo. p3f	     S" S"lm
~ m
};s   /d 