
    ȅi                      S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKJr  S SKJrJrJr  S SKJr  S SKJrJrJrJrJrJr  S SKrS SKJr  S SKrS SKrS SKJ s  J!r"  S SK#J$r$  S S	K%J&r&J'r'  S S
K(J)r)  S SK*J+r+  S SK,J-r-J.r.J/r/  S SK0J1r1J2r2J3r3  SSK4J5r5J6r6J7r7J8r8  SSK9J:r:  SSK;J<r<J=r=J>r>J r   SSK?J@r@  SSKAJBrBJCrCJDrDJErE  SSKFJGrG  SSKHJIrI  SSKJJKrK  SSKLJMrM  SSKNJOrOJPrPJQrQJRrRJSrS  SSKTJUrUJVrV  SSKWJXrXJYrYJZrZJ[r[  SSK\J]r]  SSK J^r^J_r_J`r`JaraJbrbJcrcJdrdJereJfrfJgrgJhrhJiriJjrjJkrk  SSKlJmrnJoroJprpJqrq  SSKrJsrs  S S!KtJuru  S S"KvJwrwJxrxJyryJzrzJ{r{J|r|J}r}J~r~JrJrJrJrJrJrJrJr  S S#KJrJrJrJrJrJrJr  S S$KJrJrJrJrJr  S S%KJr  \(       a,  S S&KJr  S S'KJr  S S(KJr  SS)K=Jr  S S*KvJr  S S+KJr  \" S,5      r\GRD                  " \5      r\GRJ                  GRM                  \S-5      r\GRJ                  GRM                  \S.5      r\GRJ                  GRM                  \S/5      r\@" 5       r?S0rS1 rSdS2 jr " S3 S45      r\" S5      SeS5 j5       r\" S5      SeS6 j5       r " S7 S85      r\GRb                   " S9 S:5      5       r\GRb                   " S; S<5      5       r\GRb                   " S= S>\5      5       r\GRb                   " S? S@\5      5       r        SfSA jr " SB SC\5      r\" 5       GRp                  rSgSD jrSgSE jrShSF jrSgSG jrSiSH jrSjSI jr " SJ SK\{5      rSkSL jrSlSmSM jjr " SN SO\5      r\GR                  SP5         " SQ SR\5      r " SS ST5      r\GRb                   " SU SV5      5       r " SW SX5      r\GRb                   " SY SZ5      5       r " S[ S\\z\\\\\\4   4   4   5      r\GRb                   " S] S^5      5       r " S_ S`\\   5      r " Sa Sb\5      rSnSc jrg)o    )annotationsN)abstractmethod)CallableIterableSequence)	lru_cache)AnycastOptionalTYPE_CHECKINGTypeVarUnion)
PRECEDENCE)get_interface_for_device)identitypreserve_rng_state)is_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)get_triton_versionhas_triton_packagehas_triton_stable_tma_api   )free_symbol_is_type
prefix_strsymbol_is_typeSymT)ValueRanges   )configirmetricsutils)AsyncCompile)	code_hashget_pathPyCodeCachewrite_atomic)'set_kernel_post_grad_provenance_tracing)DefaultHandler)triton_heuristics)benchmarker)AutotuneHintDevicePropertiesReductionHintTRITON_MAX_BLOCKTRITON_MAX_RSPLIT)get_max_y_gridnext_power_of_2)BaseSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)get_broadcasted_shape)cache_on_selfDelayReplaceLineget_bounds_index_exprget_fused_kernel_nameget_kernel_metadatais_welford_reductionPlaceholderprefix_is_reduction	sympy_dotsympy_product
sympy_substriton_typetriton_version_uses_attrs_dictupcast_compute_type)_opsReductionType	StoreModeV)"get_kernel_category_by_source_code   )BlockPatternMatcher)ArgNameBackendFeatureConstexprArgCSECSEVariableDeferredLineIndentedBufferInplacedBufferis_buffer_removedOpOverridesPythonPrinter
RemovedArgSizeArg	TensorArgWorkspaceArgWorkspaceZeroMode)constant_reprIterationRangesIterationRangesEntryIterationRangesRootPartialAccumulate
SIMDKernelSIMDScheduling)	config_ofequal_1_arg_indicesnon_constexpr_signatureshould_unwrap_unspec_argsignature_to_meta)SymbolicCallArg)
ModuleTypeDtypePropagationOpsHandler)ShapeEnv)IRNode)BlockShapeType)SIMDKernelFeatures_T
perf_hintsschedulefusion   c                L    U S;   nU(       a  SOSnU S;   a  U SU  S3$ U SU  3$ )N)anymaxminprodtriton_helperstl)r{   r|   .2 )reduction_type
use_helpermodules      X/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/codegen/triton.pyget_triton_reduction_functionr      sE    #@@J!+F'>*!,,>*++    c                    [        U [        R                  5      (       d  g[        U [        R                  5      =(       d+    U R                  =(       a    [        U R                  5      S:H  $ )z"
Is this expression a Sympy Integer or is it an integer sympy Expr
containing no free symbols. The latter case can happen with Identity expr.
Fr   )
isinstancesympyExprInteger
is_integerlenfree_symbolsexprs    r   is_sympy_integer_liker      sL    
 dEJJ''dEMM* 7C 1 12a7r   c                  J    \ rS rSr% Sr0 rS\S'   0 rS\S'   \S
S j5       r	Sr
g	)OpDtypeSupport   z
Some Triton ops such as libdevice and tl.math only support float32 and float64.
This class records which dtypes are supported by specific IR ops.
z"dict[str, OrderedSet[torch.dtype]]supported_dtypeszdict[str, bool]convert_outputsc                    UR                   n[        [        R                  [        R                  /5      U R
                  U'   X R                  U'   g N)__name__r   torchfloat32float64r   r   )clsfuncconvert_outputop_names       r   register_upcastOpDtypeSupport.register_upcast   s;    --(2EMM5==3Q(RW%'5G$r   r   N)r   zCallable[..., str]r   boolreturnNone)r   
__module____qualname____firstlineno____doc__r   __annotations__r   classmethodr   __static_attributes__r   r   r   r   r      s1    
 <>8=')O_)6 6r   r   c                 x    [        5       (       d  gSSKn [        U R                  R                  S5      (       a  gg)zX
import AttrsDescriptor if the triton version is new enough to have this
class defined.
 r   NAttrsDescriptorz4from triton.compiler.compiler import AttrsDescriptor)r   triton.compiler.compilerhasattrcompiler)tritons    r   gen_attr_descriptor_importr      s3     # v''):;;Er   c                 j   [        5       n U R                  S5         SS KnU R                  S5        [	        5       =n(       a  U R                  U5        U R                  S5        [        R                  R                  (       a  U R                  S5        U R                  5       $ ! [         a     Nf = f)NzD
        import triton
        import triton.language as tl
        r   zM
           import triton.language.extra.tlx as tlx  # noqa: F401
           a  
        from torch._inductor.runtime import triton_helpers, triton_heuristics
        from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
        from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
        z
            import triton.profiler as proton
            import triton.profiler.language as pl
            pl.enable_semantic('triton')
            )
rV   splicetriton.language.extra.tlxImportErrorr   	writeliner"   r   proton_profilinggetvalue)importsr   	attr_descs      r   gen_common_triton_importsr      s    GNN		(	
 /00y0)$NN	 }}%%	
 +  s   B% %
B21B2c                     \ rS rSrSr\" \R                  \R                  /5      r	\" \R                  \R                  \R                  /\	Q5      r\ V VVVs0 s H$  nU[        R                  " [         U    S3SSS9_M&     snnnn r\ V VVVs0 s H2  nU[        R                  " [         U   R%                  5        S3SSS9_M4     snnnn r\SS j5       r\SS	 j5       r\SS
 j5       rSrgs  snnnn f s  snnnn f )TritonSymbols   zM
Stores sympy.Symbol instances and constants associated with triton codegen.
offsetTintegernonnegativeBLOCKr   positivec           
     "   SnUR                   nU GH  n[        U[        R                  5      (       a?  [        R
                  R                  R                  UR                     nUR                  nGO}[        U[        R                  [        R                  [        R                  [        R                  [        R                  [        R                  45      (       a  SnGOU R                    Vs/ s H  n[        XG5      (       d  M  UPM     nn[#        U5      S:X  d   SUR                   35       eUS   n	[        R
                  R%                  5       n
S/U
-  n[        R
                  R'                  5        Vs/ s H  n[(        U	   UR*                  :X  d  M  UPM      nn[#        U5      S:X  d   S5       e[-        U R/                  US   5      5      XS   R0                  '   [3        U5      n[5        X&5      nGM     Uc   eU$ s  snf s  snf )Nr   rN   Ambiguous type: r   1z# of Match expected to 1)r   r   r   TMPrL   kernelcsevarname_mapnameshapeUNBACKED_INTSIZEPRECOMPUTED_SIZEINDEXFLOATUNBACKED_FLOATblock_typesr   triton_tensor_ndimactive_range_treesr   prefixstrget_block_size
tensor_dimtupler:   )r   r   
expr_shape	expr_varsvarcse_var	var_shapesymtsymbol_matchessymndimr   tree
tree_matchs                 r   get_block_shapeTritonSymbols.get_block_shape   s    &(
%%	Cc488,,((,,22388<#MM	%%II))JJJJ''
 
 	 &)__"%4Ts8QD_  " >*a/N3CCHH:1NN/$Q'xx224 !" ; ; = =!#$++5  =  
 :!+G-GG+25c6H6HTU6W2Xm../!%L	 /zEJK N %%%1"s   0HH<HHc                4    U R                   UR                     $ r   )block_sizesr   r   r   s     r   r   TritonSymbols.get_block_size2  s    tyy))r   c                4    U R                   UR                     $ r   )block_offsetsr   r   s     r   get_block_offsetTritonSymbols.get_block_offset6  s      ++r   r   N)r   
sympy.Exprr   rr   )r   ra   r   zsympy.Symbol)r   r   r   r   r   r   r   R0_INDEXR1_INDEXreduction_typesXBLOCKYBLOCKZBLOCKr   r   Symbolr   r   upperr   r   r   r   r   r   ).0r   r   r   s   0000r   r   r      s    !$--!?@Odkk4;;VoVWK  D 	ellj./v6RVWWM  	  D 	ell$%%'(.t
 	
  	K 4 4l * * , ,Q
s   +C1
9C9
r   c                      \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S
\S'   S\S'   SS jrSS jrSS jrSS jrSS jr	\
SS j5       rSrg)IndexingOptionsi;  r   	index_strOrderedSet[str]	mask_varszOptional[str]
expand_strr   _has_rindexr   indexz#Optional[Sequence[Union[int, str]]]expand_shapec                ,    [        U R                  5      $ r   )r   r  selfs    r   has_maskIndexingOptions.has_maskD  s    DNN##r   c                J    [        U R                  [        R                  5      $ r   )r   r  r   r   r  s    r   has_indirectIndexingOptions.has_indirectG  s    "4::txx88r   c                    U R                   $ r   )r  r  s    r   
has_rindexIndexingOptions.has_rindexJ  s    r   c                :    [        S U R                   5       5      $ )Nc              3  V   #    U  H  n[        U5      R                  S 5      v   M!     g7f)tmpNr   
startswithr   masks     r   	<genexpr>.IndexingOptions.has_tmpmask.<locals>.<genexpr>N  s"     J>43t9''..>   ')rz   r  r  s    r   has_tmpmaskIndexingOptions.has_tmpmaskM  s    J4>>JJJr   c                :    [        S U R                   5       5      $ )Nc              3  V   #    U  H  n[        U5      R                  S 5      v   M!     g7f)rNr  r  s     r   r  ,IndexingOptions.has_rmask.<locals>.<genexpr>Q  s"     H3t9'',,r  r  r  s    r   	has_rmaskIndexingOptions.has_rmaskP  s    HHHHr   c                    U R                   (       a2  SR                  [        [        [        U R                   5      5      5      $ S$ )N & r   )r  joinsortedmapr   r  s    r   mask_strIndexingOptions.mask_strS  s4     =ANNEJJvc#t~~678	
PV	
r   r   Nr   r   r   r   )r   r   r   r   r   r  r  r  r   r&  propertyr-  r   r   r   r   r  r  ;  sT    N55$9 KI 
 
r   r  c                     \ rS rSr% SrS\S'   S\S'   S\S'   S	\S
'   S\S'   S\S'   S\S'   S\S'   SrS\S'   SrS\S'   \S)S j5       r	\S)S j5       r
\S)S j5       r\S)S j5       r\SS.               S*S jj5       r        S+S jrS,S jr      S-S  jrS.S! jrS/S" jrS/S# jrS/S$ jrS/S% jrS/S& jr            S0S' jrS(rg)1BlockDescriptorOptionsi]  z
This is a base class that describes a block descriptor used in Triton kernels.
It can be used to create either a tensor descriptor (with TensorDescriptorOptions)
or a block pointer (with BlockPtrOptions).
BlockParametersparamsr   constant_offset	list[int]orderr  r  Sequence[sympy.Expr]broadcast_shape
list[bool]broadcasting_dimsfinal_shapeBlockParameters.StrideSorterstride_sorterNzOptional[list[int]]_boundary_checkFr   can_liftc                .    U R                   R                  $ r   )r5  r   r  s    r   r   BlockDescriptorOptions.shapeu  s    {{   r   c                .    U R                   R                  $ r   )r5  block_shaper  s    r   rE  "BlockDescriptorOptions.block_shapey  s    {{&&&r   c                .    U R                   R                  $ r   )r5  stridesr  s    r   rH  BlockDescriptorOptions.strides}      {{"""r   c                .    U R                   R                  $ r   )r5  offsetsr  s    r   rL  BlockDescriptorOptions.offsets  rJ  r   )rA  c               ,  ^ [         R                  R                  mS	U4S jjnU" UR                  5      Ul        U" UR                  5      Ul        UR
                   V	s/ s H  n	TR                  U	S5      PM     n
n	[        U
5      (       a  SU
S'   UR                  U
5      nUR                  U[         R                  R                  S9u  pUR                   Vs/ s H  nTR                  US5      PM     nnUR
                  nUR                  U5      nU Vs/ s H  n[        R                  U5      PM     nn[         R                  R                  (       a&  US   R                  S:X  d   eUR!                  S5        [         R                  R"                  n[         R                  R$                  (       d  ['        UR                  5      ['        [         R                  R(                  5      U-
  :X  aN  [         R                  R*                  R-                  5       (       a!  U[.        R0                  R2                  /U-  -  n [4        R6                  " [         R                  R                  UR                  5      nU " U[         R                  R                  RA                  U5      UUUUUUUS9	nURC                  XS5        U$ s  sn	f s  snf s  snf ! [8         a3    [;        [=        [?        ['        UR                  5      5      5      5      n Nf = f)
z2Helper to create a BlockDescriptorOptions instancec                R   > U  Vs/ s H  nTR                  U5      PM     sn$ s  snf r   )lookup_precomputed_size)exprsr   sizevarss     r   lookup_size2BlockDescriptorOptions.create.<locals>.lookup_size  s&    GLMutH44T:uMMMs   $rN   F)stride_sorter_cls	shape_envr   x)	r5  r6  r8  r  r=  r:  r<  r?  rA  )rQ  zIterable[sympy.Expr]r   list[sympy.Expr])"rL   graphrR  r   rH  rE  statically_known_equalsallremove_dimsmaybe_sort_with_stride_order
_shape_envr   r   r   no_x_dimr   popnum_reduction_dimsinside_reductionr   numelsfeaturesis_reductionr   SOner%   argsort_symAssertionErrorlistreversedrangerP  compute_boundary_check)r   r5  r6  range_treesr  get_max_blockrV  rA  rS  dimsingleton_dimsr?  strider<  r:  r   r=  reduction_ndimr8  resultrR  s                       @r   createBlockDescriptorOptions.create  s    77##	N #6<<0$V^^4 AG@R@R
@RH,,S!4@R 	 
 ~!&N2 ##N3 !' C C/177;M;M !D !
 GMnn
FTFH,,VQ7n 	 
 !,, ##$56 GRRkd}33D9kR88q>((C///OOA44))FNN#s188??';n'LL!!..00 EGGKK=>99K	? %%agg&8&8&..IE
 GG,,DD_U#+/'

 	%%mAC
$
 S$  	?%FNN(;"<=>E	?s$    KKK9K :LLc                B    [         R                  U   n[        XU05      $ z>
Replaces instances of {symt}_offset with the new expression.
r   r   rE   r  r   replacementr   roffsets        r   replace_offset%BlockDescriptorOptions.replace_offset  $      --d3$+ 677r   c                    [         R                   H*  nU R                  U[        R                  " S5      U5      nM,     U$ Nr   r   r   r~  r   r   r  r   r   s      r   remove_roffsets&BlockDescriptorOptions.remove_roffsets  4    !11D&&tU]]1-=tDD 2r   c           
     &   [         R                  R                  nU Vs0 s H8  n[        R                  UR
                     U" [        UR
                     5      _M:     nn[        [        [         R                  R                  U5      5      n[        [        U R                  5      5       Vs/ s GHK  nUR                  U R                  U   [         R"                  R$                  5      (       a  MB  U(       a<  [        R                  [&        R(                     U R*                  U   R,                  ;   dp  UR/                  U R                  U   U R*                  U   5      (       a  M  UR/                  U R                  U   [1        U R*                  U   U5      5      (       a  M  [         R                  R2                  (       a5  U R*                  U   [        R                  [&        R4                     :X  a  GMI  UPGMN     snU l        gs  snf s  snf )z6List of indices to pass to tl.load(boundary_check=...)N)rL   rZ  rR  r   r   r   r   rz   r,  r   needs_yz_grid_overflowrm  r   r   r[  rH  r   rg  Zeror   r   rE  r   statically_known_multiple_ofrE   r`  r   r@  )r  rp  ro  rR  tblock_to_maxneeds_overflow_grididxs           r   rn  -BlockDescriptorOptions.compute_boundary_check  s    77## !/
  %%aff-}Z=O/PP  	 /
 "#ahh&E&E{"ST S_- 
-44T\\#5FU  ,)55dkkB++C0==> %AA JJsOT-=-=c-B  !) E E JJsO&t'7'7'<lK! * HH%%((-1J1J4;;1WW- C- 
/
 
s&   ?H	-AH2A2H(9H%AH9Hc                8    U R                   c   eU R                   $ r   r@  r  s    r   boundary_check%BlockDescriptorOptions.boundary_check%  s     ##///###r   c                    gNFr   r  s    r   r  #BlockDescriptorOptions.has_indirect)      r   c                :    [        S U R                   5       5      $ )Nc              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7fr   )r   r   r   )r   r   s     r   r  4BlockDescriptorOptions.has_rindex.<locals>.<genexpr>-  s'      
(  m&C&CDD(r  )rz   rE  r  s    r   r  !BlockDescriptorOptions.has_rindex,  s"     
((
 
 	
r   c                "    U R                  5       $ r   )r  r  s    r   r&   BlockDescriptorOptions.has_rmask2  s      r   c                    gr  r   r  s    r   r   "BlockDescriptorOptions.has_tmpmask5  r  r   c                4    [        U R                  5       5      $ r   )r   r  r  s    r   r  BlockDescriptorOptions.has_mask8  s    D'')**r   c                  ^ U R                   nU R                  nU(       aJ  U R                  R                  U R                   5      nU R                  R                  U R                  5      n[	        Xg5       VV	s/ s H(  u  pU	(       a  [
        R                  R                  OUPM*     n
nn	[        XU
5      nU R                  R                  (       d:  U(       d3  [        U
5      [        U5      :X  a  U R                  R                  U
5      n
[        R                  R                  mU=(       a<    [        U
5      [        U5      :H  =(       a    [        U4S j[	        X5       5       5      n[        U R                  5      (       a-  U(       d&  SU S[        R                   R#                  U5       S3nU R                   nU R                  R                  (       dt  U(       a  U R                  R$                  OU R                  R&                  nSU SU S3nU(       a  U R                   O$U R                  R                  U R                   5      n[        XU5      nU$ s  sn	nf )a>  
Generate a broadcast and a reshape for the block descriptor.
This restores stride-0 dimensions which were removed from the block descriptor.

Transposes are also applied to the input using self.stride_sorter:
if for_store is True:
    - First Broadcast the value. Since self.broadcast_shape is stored in
    descending stride order, it must be reverted to the original order
    since the input value does not have dims with descending strides
    - After, transpose the broadcasted value so that dimensions are in
    descending stride order
    - Finally reshape to the block shape
else (for load):
    - First broadcast the value to self.broadcast_shape (strides are descending)
    - Then transpose the value so that dimensions no longer have descending strides
    - Finally reshape the block to the final kernel tile shape
c              3  |   >#    U  H1  u  pTR                  US 5      =(       d    TR                  X5      v   M3     g7frN   N)r[  )r   pre_dimpost_dimrR  s      r   r  GBlockDescriptorOptions.codegen_broadcast_and_reshape.<locals>.<genexpr>y  sE       *O%G 00!< G33GFG)Ns   9<tl.broadcast_to(, )	tl.trans()r:  r<  r?  revertzipr   rg  rh  triton_reshapeis_identityr   rL   rZ  rR  r\  rz   r   index_to_strsort_idxrevert_sort_idx)r  valueinitial_shaper=  allow_implicit	for_storer:  r<  rq  is_broadcastingpre_broadcast_shapesupports_implicit_broadcast	old_shapepermute_dimsrR  s                 @r   codegen_broadcast_and_reshape4BlockDescriptorOptions.codegen_broadcast_and_reshape;  s   2 .. 22 "00778L8LMO $ 2 2 9 9$:P:P Q
 ),O(O
(O$ +EGGKK3(O 	 
 u5HI ""..'(C,<< #'"4"4";";<O"P 77##&4 '
#$K(88   *--@)N  	$ t%%&&/J"5'AHH,A,A/,R+SSTU  ((	!!--  ""++''77 
  wba8E  $$''..t/C/CD  u=w
s   9/Ir  r   rY  )r5  r4  r6  r   ro  list[IterationRangesRoot]r  r  rp  Callable[[str], int]rV  z"type[BlockParameters.StrideSorter]rA  r   r   r3  r   r   r|  r   r   r   r   r   r   r   r   r   )rp  r  ro  r  r   r   )r   r7  r/  )r  r   r  r9  r=  r9  r  r   r  r   r   r   )r   r   r   r   r   r   r@  rA  r1  r   rE  rH  rL  r   rv  r~  r  rn  r  r  r  r&  r   r  r  r   r   r   r   r3  r3  ]  s    ))!!%% 0/+/O(/ Hd! ! ' ' # # # #  [  [ $	[
 /[ #[ ,[ >[ [ 
 [ [z88-78?C8	8
2
+2
 /2
 
	2
h$
!+aa ,a *	a
 a a 
ar   r3  c                  "    \ rS rSrSSS jjrSrg)TensorDescriptorOptionsi  c                .   [         R                  R                  nU R                  S:w  a  U SU" U R                  5       S3OUSU" U R                  5       3SU" U R
                  5       3SU" U R                  5       3/nSSR                  U5       S3$ )	z
Codegen a call to tl.make_tensor_descriptor()

Args:
    name: variable name for pointer
    roffset: unused, but kept for compatibility with BlockPtrOptions.format()

Returns:
    "tl.make_tensor_descriptor(...)"
r    + (r  shape=strides=block_shape=ztl.make_tensor_descriptor(r  )rL   r   r  r6  r   rH  rE  r*  )r  r   r}  fargss        r   formatTensorDescriptorOptions.format  s     HH!! ''1, &Qt3345Q7Qtzz]O$q'(1T--./0	
 ,DIIdO+<A>>r   r   NTr   r   r   r   )r   r   r   r   r  r   r   r   r   r  r    s    ? ?r   r  c                  P    \ rS rSr        SS jrS	S jrS
SS jjrSS jrSrg)BlockPtrOptionsi  c                B    [         R                  U   n[        XU05      $ ry  rz  r{  s        r   r~  BlockPtrOptions.replace_offset  r  r   c                    [         R                   H*  nU R                  U[        R                  " S5      U5      nM,     U$ r  r  r  s      r   r  BlockPtrOptions.remove_roffsets  r  r   c           	        [         R                  R                  n/ U R                  QnU(       d   U Vs/ s H  oPR	                  U5      PM     nnU R
                  S:w  a  U SU" U R
                  5       S3OUSU" U R                  5       3SU" U R                  5       3SU" U R                  5       3SU" U R                  5       3SU" U5       3/nS	S
R                  U5       S3$ s  snf )z
Codegen a call to tl.make_block_ptr()

Args:
    name: variable name for pointer
    roffset: should rn_offset be included in offsets=..., for use with tl.advance()

Returns:
    "tl.make_block_ptr(...)"
r   r  r  r  r  r  zorder=zoffsets=ztl.make_block_ptr(r  )rL   r   r  rL  r  r6  r   rH  rE  r8  r*  )r  r   r}  r  rL  r   r  s          r   r  BlockPtrOptions.format  s     HH!!!DLL/BIJ'++F3'GJ ''1, &Qt3345Q7Qtzz]O$q'(1T--./0Qtzz]O$qzl#
 $DIIdO#4A66 Ks   C*c           	         [         R                  U   nU R                   Vs/ s HA  nU R                  X2U5      U R                  U[        R
                  R                  U5      -
  PMC     nnU$ s  snf )aF  
Codegen string to pass to tl.advance(name, ...).

Advance is the difference between offsets in each loop iteration.
To compute it, we replace rN_offset with multiples of RN_BLOCK.
Since we expect rN_offset to vary in range(0, rN_numel, RN_BLOCK), the first
iteration has rN_offset=0, while the second has rN_offset=RN_BLOCK.
)r   r   rL  r~  r   rg  r  )r  r   rblockr   advances        r   advance_roffsetBlockPtrOptions.advance_roffset  sw     **40 ,,

 ' ##FD9%%feggllDAB ' 	 
 
s   AA.r   Nr  r  r  r  )r   r   r   r   )	r   r   r   r   r~  r  r  r  r   r   r   r   r  r    s6    88-78?C8	8
7:r   r  c                   [        U[        5      (       a  [        U[        5      (       d   eU Vs/ s H"  n[        R                  R	                  U5      PM$     nnU Vs/ s H"  n[        R                  R	                  U5      PM$     nnXE:X  a  U $ U Vs/ s H  ofS:w  d  M
  UPM     snU:w  a  SU  SSR                  U5       S3$ Sn/ nU HK  n	U[        U5      :  a   XU   :X  a  UR                  S5        US-  nM2  U	S:X  d   eUR                  S	5        MM     U[        U5      :X  d   eU  S
SR                  U5       S3$ s  snf s  snf s  snf )z<Workaround https://github.com/triton-lang/triton/issues/2836r   ztl.reshape(z, [r  z])r   :rN   r   [])r   rk  rL   r   r  r*  r   append)
r  r  	new_shaper   old_shape_strnew_shape_strsr  expandsizes
             r   r  r    sF    i&&:i+F+FFF?HIyeQXX**51yMI?HIyeQXX**51yMI% -=aH=->UG3tyy'?&@CC
CF]##c0B(BMM#1HC3;;MM&!  #m$$$$WAdii'(**% JI .s   )E )E
	E#Ec                  @   \ rS rSrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jr	S"S jr
S"S	 jrS"S
 jrS"S jrS"S jrS"S jrS"S jrS"S jrS#S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jr S"S jr!S r"g!)$TritonPrinteri  c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ )NrN   libdevice.trunc(r   ).to(r  r   r  _printrL   r   index_dtyper  r   s     r   _print_TruncToIntTritonPrinter._print_TruncToInt  M    499~"""t{{499Q<89qxx?S?S>TTUV	
r   c                    UR                   (       a  [        [        U5      5      nU$ [        R                  " 5       (       a$  [
        R                  R                  (       a  U nU$ SU S3nU$ )Nztl.full([], z, tl.float64))r   r   intr"   	is_fbcoder   versionhip)r  r   rets      r   _print_FloatTritonPrinter._print_Float#  s_    ?? c$i.C
 
	 EMM$5$5FC 
 !m4C
r   c                    [        UR                  5      S:X  d   eU R                  UR                  S   [        S   S-
  5      nU S3$ )NrN   r   Atom      ?z.to(tl.float64))r   r  parenthesizer   )r  r   r  s      r   _print_ToFloatTritonPrinter._print_ToFloat/  sI    499~"""diilJv,>,DEO$$r   c                   UR                   u  p#UR                  (       a8  UR                  (       a'  U R                  UR                   S[        S   S-
  5      $ U R	                  U5      nU R	                  U5      nSU SU S3$ )N % r  r  z!triton_helpers.remainder_integer(r  r  )r  is_nonnegative	stringifyr   r  r  r   quotdivquot_sdiv_ss         r   _print_PythonModTritonPrinter._print_PythonMod5  sp    II	3#5#5>>$))UJv4F4LMMT"C 26("UG1EEr   c                ,   UR                   (       d   eUR                  u  p#UR                  (       a8  UR                  (       a'  U R                  UR                  S[        S   S-
  5      $ U R                  U5      nU R                  U5      nSU SU S3$ )N // r  r  z!triton_helpers.div_floor_integer(z,  r  )r   r  r  r  r   r  r  s         r   _print_FloorDivTritonPrinter._print_FloorDiv=  s|    II	3#5#5>>$))VZ5G#5MNNT"C 26(#eWAFFr   c                P    U R                  UR                  S[        S   S-
  5      $ )N / r  r  )r  r  r   r  s     r   _print_IntTrueDivTritonPrinter._print_IntTrueDivH  s#    ~~dii
60BS0HIIr   c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ NrN   libdevice.floor(r   r  r  r  r  s     r   _print_floorTritonPrinter._print_floorM  r  r   c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ r  r  r  s     r   _print_FloorToIntTritonPrinter._print_FloorToIntS  r  r   c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ NrN   libdevice.ceil(r   r  r  r  r  s     r   _print_ceilingTritonPrinter._print_ceilingY  K    499~""" TYYq\!: ;5AUAU@VVWXXr   c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ r  r  r  s     r   _print_CeilToIntTritonPrinter._print_CeilToInt]  r#  r   c                ,    SU R                  U5       S3$ )Nztl.sqrt_rn(().to(tl.float32)))r  r  s     r   _helper_sqrtTritonPrinter._helper_sqrta  s    dkk$/00ABBr   c                    SU R                  UR                  S   5       SU R                  UR                  S   5       S3$ )Nlibdevice.pow(r   r  rN   r  )r  r  r  s     r   _print_FloatPowTritonPrinter._print_FloatPowd  s?    T[[167r$++diiPQl:S9TTUV	
r   c                6   UR                   S   R                  (       a;  S[        UR                   S   5       SU R                  UR                   S   5       S3$ SU R                  UR                   S   5       SU R                  UR                   S   5       S3$ )Nr   r,  r  rN   r  )r  
is_Integerfloatr  r  s     r   _print_PowByNatural!TritonPrinter._print_PowByNaturali  s    99Q<""#E$))A,$7#84;;tyyQR|;T:UUVWWT[[167r$++diiPQl:S9TTUV	
r   c                    U R                  UR                  S   5      nU R                  UR                  S   5      nU R                  UR                  S   5      nSU SU SU S3$ )Nr   rN   r!   	tl.where(r  r  )doprintr  )r  r   cpqs        r   _print_WhereTritonPrinter._print_Wherep  s_    LL1&LL1&LL1&1#Rs"QCq))r   c                   [        UR                  5      S:X  a  U R                  UR                  S   5      $ [        UR                  5      S-  n[        U5      nU R                  U" UR                  SU 6 5      nU R                  U" UR                  US 6 5      n[	        S XV4 5       5      u  pVUS;   d   SU S35       eS	U S
U SU SU SU S
U SU SU S3$ )z1
Helper for max/min code generation.
cmp: > or <
rN   r   r!   Nc              3  .   #    U  H  nS U S3v   M     g7f)(r  Nr   r   rX  s     r   r  6TritonPrinter._print_min_max_helper.<locals>.<genexpr>  s     .v!q1Xvs   )><zUnexpected comparator: ''r>  z * ( z= z) + )))r   r  r  typer   )r  r   cmpmidr   abs          r   _print_min_max_helper#TritonPrinter._print_min_max_helperv  s    
 tyy>Q;;tyy|,,$))n!4jKKTYYt_-.KKTYYst_-. .v..j C$<SE"CC 1#T!AcU"QCtA3d1#Qse1QCrBBr   c                &    U R                  US5      $ )NrB  rK  r  s     r   
_print_MinTritonPrinter._print_Min      ))$44r   c                &    U R                  US5      $ )NrA  rN  r  s     r   
_print_MaxTritonPrinter._print_Max  rQ  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrN   tl_math.abs(r   r  r   r  r  r  s     r   
_print_AbsTritonPrinter._print_Abs  s9    499~"""dkk$))A,78::r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrN   zlibdevice.cos((r   r(  rW  r  s     r   _print_OpaqueUnaryFn_cos&TritonPrinter._print_OpaqueUnaryFn_cos  :    499~""" TYYq\!: ;;LMMr   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrN   zlibdevice.cosh((r   r(  rW  r  s     r   _print_OpaqueUnaryFn_cosh'TritonPrinter._print_OpaqueUnaryFn_cosh  :    499~"""!$++diil";!<<MNNr   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrN   zlibdevice.acos((r   r(  rW  r  s     r   _print_OpaqueUnaryFn_acos'TritonPrinter._print_OpaqueUnaryFn_acos  ra  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrN   zlibdevice.sin((r   r(  rW  r  s     r   _print_OpaqueUnaryFn_sin&TritonPrinter._print_OpaqueUnaryFn_sin  r]  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrN   zlibdevice.sinh((r   r(  rW  r  s     r   _print_OpaqueUnaryFn_sinh'TritonPrinter._print_OpaqueUnaryFn_sinh  ra  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrN   zlibdevice.asin((r   r(  rW  r  s     r   _print_OpaqueUnaryFn_asin'TritonPrinter._print_OpaqueUnaryFn_asin  ra  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrN   zlibdevice.tan((r   r(  rW  r  s     r   _print_OpaqueUnaryFn_tan&TritonPrinter._print_OpaqueUnaryFn_tan  r]  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrN   zlibdevice.tanh((r   r(  rW  r  s     r   _print_OpaqueUnaryFn_tanh'TritonPrinter._print_OpaqueUnaryFn_tanh  ra  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrN   zlibdevice.atan((r   r(  rW  r  s     r   _print_OpaqueUnaryFn_atan'TritonPrinter._print_OpaqueUnaryFn_atan  ra  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrN   zlibdevice.log2((r   r(  rW  r  s     r   _print_OpaqueUnaryFn_log2'TritonPrinter._print_OpaqueUnaryFn_log2  ra  r   c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ )NrN   zlibdevice.llrint(r   r  r  r  r  s     r   _print_RoundToIntTritonPrinter._print_RoundToInt  sM    499~"""DIIaL 9:%@T@T?UUVW	
r   c                    [        UR                  5      S:X  d   eUR                  u  p#UR                  (       a  US:  d   e[        SU S35      eU R	                  U[
        S   5      nSU SU SU*  3$ )	Nr!   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got r   Mulzlibdevice.nearbyint(1e * z) * 1e)r   r  r   
ValueErrorr   r   )r  r   numberndigits
number_strs        r   _print_RoundDecimal!TritonPrinter._print_RoundDecimal  s    499~"""))Q;;abiajjkl  &&vz%/@A
'yJ<vwhZPPr   r   N)r   r   r   r   )r   r   rG  r   r   r   )#r   r   r   r   r  r  r  r  r  r  r  r  r!  r%  r)  r-  r2  r:  rK  rO  rS  rX  r[  r_  rc  rf  ri  rl  ro  rr  ru  rx  r{  r  r   r   r   r   r  r    s    

%FGJ


YYC


*C&55;NOONOONOOO
Qr   r  c                *    [        [        U 5      5      $ )zCConvert torch.dtype to triton type and upcast [b]float16 to float32)rF   rH   dtypes    r   triton_compute_typer    s    *5122r   c                `    U [         R                  :X  a  [         R                  n [        U 5      $ )z@Convert torch.dtype to triton type, with fix for storing tl.bool)r   r   int8rF   r  s    r   triton_store_typer    s"    



ur   c                    [        U 5      (       a1  U R                  (       a   U R                  S::  a  [        R                  $ [        U 5      $ )z0Implicit upcasts used for Triton reduction types   )r   	is_signeditemsizer   int32rH   r  s    r   upcast_acc_dtyper    s3    5??u~~7J{{u%%r   c                *    [        [        U 5      5      $ )z:Convert torch.dtype to triton type, with reduction upcasts)r  r  r  s    r   triton_acc_typer    s    /677r   c                F    U R                   S:*  =(       a    U R                  $ )Nr!   )r  is_floating_pointr  s    r   low_precision_fpr    s    >>Q:5#:#::r   c                    [        U [        5      (       d  gU R                  n[        U[        R                  5      (       a  [	        U5      $ S$ r  )r   rT   r  r   r  )r   r  s     r   low_precision_fp_varr    s<    c;''IIE&0&D&DE"O%Or   c                  J   ^  \ rS rSr S         SU 4S jjjrS rSrU =r$ )TritonCSEVariablei  c                h   > [         TU ]  XX4S9  [        5       U l        Uc   S5       eUc   S5       eg )Nr   z!TritonCSEVariable must have dtypez!TritonCSEVariable must have shape)super__init__r   r  )r  r   boundsr  r   	__class__s        r   r  TritonCSEVariable.__init__  sD     	u:*4, E"EE  E"EE r   c                p   U H  n[        U[        5      (       a'  U R                  R                  UR                  5        M?  [        U[        R
                  5      (       d  M`  [        R                   H<  n[        XE5      (       d  M  U R                  R                  [        U    S3/5          M     M     g )Nr  )
r   r  r  updater   r   r   r   r   r   )r  r   r  kwargsargr   s         r   update_on_args TritonCSEVariable.update_on_args  s    C#011%%cmm4C.. *55D%c00--*T2B1C4/H.IJ 6 r   )r  r   )
r   r   r  zValueRanges[Any]r  torch.dtyper   rr   r   r   )r   r   r   r   r  r  r   __classcell__r  s   @r   r  r    sS     !%FF !F 	F
 F 
F F r   r  c                     SSK Jn   U " 5       $ )Nr   rn   )!torch._inductor.dtype_propagationro   rn   s    r   get_dtype_handlerr    s    L%''r   c                :   ^ ^^ SS jmSU4S jjmSU UU4S jjnU$ )z
Codegen helper to upcast arguments to float32, depending on the config and dtype.
This decorates tl.math/libdevice codegen functions.
c                    [         R                  R                  (       + =(       aD    [        U [        5      =(       a-    U R
                  [        R                  [        R                  4;   $ r   )	r"   r   codegen_upcast_to_fp32r   rT   r  r   float16bfloat16)r   s    r   needs_upcast*maybe_upcast_float32.<locals>.needs_upcast  sD    444 =3,=		emmU^^<<	
r   c                2   > T" U 5      (       a  SOSnU  U 3$ )N.to(tl.float32)r   r   )r   upcast_stringr  s     r   maybe_upcast_arg.maybe_upcast_float32.<locals>.maybe_upcast_arg%  s$    -9#->->)B}o&&r   c                L   >^  [         R                  T T5        SUU UU4S jjnU$ )Nc                   > U  Vs/ s H  nT" U5      PM     nnUR                  5        VVs0 s H  u  pEUT" U5      _M     nnnT" U0 UD6nT=(       a7    [        U4S j[        R                  " XR	                  5       5       5       5      nU(       d  S O#[        [        5       TR                  5      " U 0 UD6n	U	[        R                  S 4;  n
U
(       a  U	b  S[        U	5       S3OSnU U 3$ s  snf s  snnf )Nc              3  4   >#    U  H  nT" U5      v   M     g 7fr   r   )r   r   r  s     r   r  Kmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<genexpr>4  s      6-ScS!!-S   .to(r  r   )itemsrz   	itertoolschainvaluesgetattrr  r   r   r   rF   )r  r  r  upcast_argskeyvalupcast_kwargsru  any_needs_upcastresult_dtypeneeds_downcastdowncast_stringr   r   r  r  s               r   wrapped8maybe_upcast_float32.<locals>.decorator.<locals>.wrapped-  s   <@ADS+C0DKAHNWHCS"23"77MW ;8-8F-  # 6-6__T==?-S6 3
 ( .0$--@$Q&Q 
 *%--1FFN "l&> {<013 
 Xo.//' BWs
   C0C5r0  )r   r   )r   r  r   r  r  s   ` r   	decorator'maybe_upcast_float32.<locals>.decorator)  s$    &&t^<	0 	0. r   r/  r0  )r   Callable[..., Any]r   r  r   )r   r  r  r  s   ` @@r   maybe_upcast_float32r    s    
' : r   c                     \ rS rSrSr\R                  " \R                  5      r\	  SR   SSS jj5       r
\	STS j5       r\	S 5       r\S 5       r\	\" 5       S	 5       5       r\	S
 5       r\	S 5       r\	\" 5       S 5       5       r\	\" 5       S 5       5       r\	\" 5       S 5       5       r\	\" 5       S 5       5       r\	S 5       r\	S 5       r\	S 5       r\	S 5       r\	S 5       r\	S\R:                  SSS.S j5       r\	\" 5       S 5       5       r\	\" 5       S 5       5       r \S 5       r!\	S 5       r"\	\" 5       S 5       5       r#\	\" 5       S 5       5       r$\	\" 5       S 5       5       r%\	\" 5       S 5       5       r&\	\" 5       S  5       5       r'\	\" 5       S! 5       5       r(\	\" 5       S" 5       5       r)\	\" 5       S# 5       5       r*\	\" 5       S$ 5       5       r+\	\" 5       S% 5       5       r,\	\" 5       S& 5       5       r-\	\" 5       S' 5       5       r.\	\" 5       S( 5       5       r/\	\" 5       S) 5       5       r0\	\" 5       S* 5       5       r1\	\" 5       S+ 5       5       r2\	\" 5       S, 5       5       r\	S- 5       r3\	\" 5       S. 5       5       r4\	S/ 5       r5\	S0 5       r6\	S1 5       r7\	S2 5       r8\	S3 5       r9\	S4 5       r:\	S5 5       r;\	S6 5       r<\	S7 5       r=\	S8 5       r>\	S9 5       r?\	S: 5       r@\	S; 5       rA\	S< 5       rB\	\" 5       S= 5       5       rC\	\" 5       S> 5       5       rD\	\" 5       S? 5       5       rE\	\" 5       S@ 5       5       rF\	\" 5       SA 5       5       rG\	SB 5       rH\	\" 5       SC 5       5       rI\	\" 5       SD 5       5       rJ\	\" 5       SE 5       5       rK\	\" SFSG9SH 5       5       rL\	\" SFSG9SI 5       5       rM\	\" 5       SJ 5       5       rN\	\" 5       SK 5       5       rO\	SL 5       rP\	SM 5       rQ\	\" 5       SN 5       5       rR\	SO 5       rS\	\" 5       SP 5       5       rTSQrUg)UTritonOverridesiI  zEMap element-wise ops to Triton e.g., ops.to_dtype(x,...) -> x.to(...)NTc                x         SS jnUb=  [        U" X!5      [        R                  R                  5      [        R                  l        U[        R
                  :X  a  SU  S3$ U[        R                  :X  a  Ub  UR                  (       d  Uc  U  S3$ U(       a  [        U5      nO[        U5      nU  SU S3$ )Nc                    X:X  a  g[         R                  [         R                  4nX;   a  X;   a  X:w  a   S5       eU [         R                  :X  d  U[         R                  :X  a  gU [         R                  :X  d  U[         R                  :X  a  gg)Nr   zCConversions between float8_e5m2 and float8_e4m3fn is not supported!r  r!   )r   float8_e4m3fnfloat8_e5m2)	src_dtype	dst_dtype
fp8_dtypess      r   _get_min_elements_per_thread>TritonOverrides.to_dtype.<locals>._get_min_elements_per_threadU  s     % ##!!J '+*U U	U 
 E---e>O>O1OE///9@S@S3Sr   r>  z != 0)z.to(tl.int16).to(tl.uint8)r  r  )r  r  r  r  r   r  )
r{   rL   r   min_elem_per_threadr   r   uint8r  r  r  )rX  r  r  use_compute_typesr  	out_dtypes         r   to_dtypeTritonOverrides.to_dtypeN  s    	"	/:		6   ,/,Y>,,,AHH(
 EJJqc= ekk!!i&A&AYEV S233+E2I)%0ID1%%r   c                    UR                   UR                   :X  d   eU R                  U:w  a  U  S[        U5       S3n U  S[        U5       S3n[        U5      U:w  a  U S[        [        U5      5       S3nU$ )Nr  r  z, bitcast=True))r  r  rF   rH   )rX  r  r  outs       r   to_dtype_bitcast TritonOverrides.to_dtype_bitcast  s    !!U^^333 77i#T+i013A4E*+?;u%.Ek*=e*DEFaHC
r   c           	         [         R                  R                  U5      n[        U" U 5      5      n[	        U5      nUS:X  a  U$ U S:  a(  UR
                  (       d  SUSS   3nSU SU SU SU S3	$ SU SU SU S3$ )	Nz
tl.float32r   ztl.r  tl.full(r  r  r  )r   _prims_commondtype_to_typer`   r  r  )r  r  r   type_
triton_valrF   triton_signed_types          r   _shaped_constant TritonOverrides._shaped_constant  s    ##11%8"5<0
)%0,& 19U__#&{12&7!8eWBzl"5G4Hk]Z[\\eWBzl"[MCCr   c                "    U R                  X/ S9$ )Nr  )r  )r   r  r  s      r   constantTritonOverrides.constant  s    ##E#;;r   c                    SU  S3$ )NrV  r  r   rX  s    r   absTritonOverrides.abs       aS""r   c                   [        U SS 5      n[        USS 5      nU[        R                  :X  a3  U[        R                  :X  a  [        R                  (       a
  SU  SU S3nO	SU  SU S3n[        U 5      (       d  [        U5      (       aN  [        5       R                  X5      nU[        R                  [        R                  4;   a  U S[        U5       S3nU$ )Nr  ztriton.language.div_rn(r  r  r>  r  r  )
r  r   r   r"   emulate_divison_roundingr  r  truedivr  rF   )rX  yx_dtypey_dtyper  r  s         r   r  TritonOverrides.truediv  s    !Wd+!Wd+ u}}$5==(// ,A3b15CaSA3a.C""&:1&=&=)+33A9IU]]EMM::T+i"8!9;
r   c                    SU  SU S3n[        U 5      (       d  [        U5      (       aN  [        5       R                  X5      nU[        R                  [        R
                  4;   a  U S[        U5       S3nU$ )Nr>  r  r  r  )r  r  modr   r  r   rF   )rX  r  r  r  s       r   r  TritonOverrides.mod  sp    !Cs!n""&:1&=&=)+//5IU]]EMM::T+i"8!9;
r   c                D    [         R                  (       a  SU  S3$ SU  S3$ )z
When use_fast_math, use the ftz (flushing to zero) variant
of exponent computation.

Check https://github.com/triton-lang/triton/issues/5735 for
more details.
ztl_math.exp(r  zlibdevice.exp()r"   use_fast_mathr  s    r   expTritonOverrides.exp  s+     !!A&&#A3a((r   c                    SU  S3$ )Nzlibdevice.exp2(r  r   r  s    r   exp2TritonOverrides.exp2       !1%%r   c                    SU  S3$ )Nzlibdevice.expm1(r  r   r  s    r   expm1TritonOverrides.expm1       "!A&&r   c                    SU  S3$ )Nztl.sqrt_rn(r  r   r  s    r   sqrtTritonOverrides.sqrt       QCq!!r   c                   [         R                  R                  nUS:X  a  gUS:X  a	  SU  SU  S3$ US:X  a  U  S3$ Uc:  [        R                  " [        R
                  " S	[        R                  5      U 5      $ [        S
U< 35      e)Ncompile_errorzcompile error!runtime_errorz"triton_helpers.device_assert_then(z == 0, "injected assert fail", r  accuracyz + 1r   z:unrecognized config triton.inject_relu_bug_TESTING_ONLY = )	r"   r   inject_relu_bug_TESTING_ONLYopsmaximumr  r   r  rj  )rX  bugs     r   reluTritonOverrides.relu  s    mm88/!#O# 8s:YZ[Y\\]^^JS:[;;s||Au{{;Q?? LSGT r   c                d    [         R                  R                  (       a	  SU  SU S3$ SU  SU S3$ )Nztl.minimum(r  , tl.PropagateNan.ALL)ztriton_helpers.minimum(r  r   r  r  rI  rJ  s     r   minimumTritonOverrides.minimum
  :    == 2aS(>??,QCr!A66r   c                d    [         R                  R                  (       a	  SU  SU S3$ SU  SU S3$ )Nztl.maximum(r  r  ztriton_helpers.maximum(r  r   r!  s     r   r  TritonOverrides.maximum  r$  r   c                    SU  SU SU S3$ )Nr5  r  r  r   )rI  rJ  r7  s      r   whereTritonOverrides.where  s    1#Rs"QCq))r   c           	       ^^	^
^ [         R                  R                  (       d   eXp2S nS nU" U5      (       a  U" U 5      n U" U5      (       a  U" U5      n      SUU	U
U4S jjn[        [         R                  R	                  5       5      S:  d   S5       e[        [        R                  [        R                     5      m	[        [        R                  [        R                     5      m
[        [        R                  [        R                     5      m[        [        R                  [        R                     5      m[         R                  R                  R                  [         R                  R                  U" U [!        U R"                  5      T
T/5      U R$                  T
T4S9n [         R                  R                  R                  [         R                  R                  U" U[!        UR"                  5      TT	/5      UR$                  TT	4S9n[&        R(                  R*                  R,                  R.                  S:X  a  SnOSnS	U  S
U SU S3$ )at  
Triton code generation for lowering ops.dot to tl.dot.

The logic is as follows:

1. Downcasting for performance
   If the data was previously upcasted to fp32, we downcast back to the
   original dtype (e.g., fp16 or bf16) for better performance. While
   surrounding operations may run in fp32, matmul itself is executed at the
   original precision to optimize throughput.

2. Handling non-constant reduction masks
   If the reduction mask is not constant and there was any operation between
   tl.load and tl.dot, we zero out regions outside the mask using
   tl.where(r0_mask, val, 0).
   This ensures that values outside the mask do not contribute to the dot
   product, preventing incorrect results.

3. Shape alignment for tl.dot
   We massage shapes to match the tl.dot requirement of (Y, R) x (R, X).
   Current codegen eagerly broadcasts tl.arange to create unique axes. We
   reshape, transpose, or broadcast to align with the (Y, R) x (R, X) shape.
   We avoid using 3D dot ((Z, Y, R) x (Z, R, X)) because 3D tl.dot has
   poor performance. During batched matmul (bmm), we keep ZBLOCK=1 and call
   the 2D dot kernel instead.
c                   [        [        [        U R                  5      5      (       d  g[        R
                  R                  S   nUR                  (       d   e[        R
                  R                  U5      (       a  g[        R
                  R                  R                  R                  5        H  u  p#X0:X  d  M  SU;   d  M  SU;   d  M    g   g)NFrU  ztl.loadz	other=0.0T)rz   r,  rB   r  rL   r   ro  rf  _has_constant_maskr   _cacher  )r   reduction_rangekvs       r   is_where_needed,TritonOverrides.dot.<locals>.is_where_needed;  s    s.>??hh2226O"//// xx**?;;
 ++1138	Q;!3C  4 r   c                   [         R                  R                  SU R                  5      n[        R
                  R                   Vs/ s H%  nUR                  (       d  M  UR                   S3PM'     nn[        U5      S:X  d   S5       e[        R                  US   X5      n[        R
                  R                  R                  [        R
                  R                  X@R                  U R                  S9$ s  snf )Ndotr  rN   z'don't tile reduction when native matmulr   r  r   )r#   	Reductiondefault_valuer  rL   r   ro  rf  r   r   TritonKernelOverridesr(  r   generatecomputer   )r   defaultr   reduction_mask	where_vars        r   
where_cond'TritonOverrides.dot.<locals>.where_condP  s    ll00		BG HH000D$$ %4;;-t$0   ~&!+V-VV+-33N14EsTI88<<((  )99CII )  s   C9 C9c                  > T	U;   a  U Vs/ s H  o3T	:X  a  SOUPM     nnUTT/:X  aO  TU;  d   S5       eSS/nTU;   a  TUS'   TU;   a  TUS'   [        XU5      n UTT/:g  nU(       a  SU  ST ST S3n U $ UTT/:X  aU  TU;  d   S	5       eSS/nTU;   a  TUS'   TU;   a  TUS'   [        XU5      n S
U  S3n UTT/:g  nU(       a  SU  ST ST S3n U $ [        es  snf )aX  
Generate a reshape, transpose, and broadcast for the tl.dot.
tl.dot requires specific shape requirement : (Y,R) x (R,X)
but the current triton codegen eagerly broadcast the tl.arange so
it needs to be reshaped to meet the requirement.

This is done by three steps.
1. remove the empty dimension (dim with size 1) and make it 2d with tl.reshape
2. permute the dimension if needed (e.g., (X,R) -> (R,X)) with tl.trans
3. broadcast if needed with broadcast_to.
    - This shows up when matmul operand is broadcasted with torch.expand/repeat.
    - e.g., torch.rand((16,)).expand(16,16) @ B

e.g., (Y,1,R), (Y,R) -> tl.reshape(var, (Y,R))
e.g., (1,X,R), (R,X) -> tl.trans(tl.reshape(var, (X,R)))
e.g., (1,X,1), (R,X) -> tl.broadcast_to(tl.trans(tl.reshape(var, (X,1))), (R,X))

TODO : eventually we want to remove this function when lazy broadcasting arrives
r   z&left tl.dot operand cannot depend on xr   rN   r  z, (r  rE  z'right tl.dot operand cannot depend on yr  r  )r  NotImplementedError)
r  r  r=  rq  shape_2dbroadcast_neededRBLOCKr   r   r   s
         r   #reshape_transpose_broadcast_for_dot@TritonOverrides.dot.<locals>.reshape_transpose_broadcast_for_dotp  sw   6 &JW X-3v3!>- Xvv..]2 <2  :]*"(HQK]*"(HQK 'uXF $,/?#? #.ugS6("ME4 L1  00]2 =2  :]*"(HQK]*"(HQK 'uXF $E7!, $,/?#? #.ugS6("ME L *)W !Ys   Cr   ztl.dot can only do mm and bmmr5  tf32ieeeztl.dot(r  z, input_precision=""))r  r9  r=  r9  r   r   )rL   r   is_native_matmulr   dense_size_listr   r   r   r   r   r   r   r   r   r9  r:  rk  r   r  r   backendscudamatmulfp32_precision)rI  rJ  orig_aorig_br1  r>  rE  input_precisionrD  r   r   r   s           @@@@r   r4  TritonOverrides.dot  s   8 xx((((	*	6 6""1A6""1AI	/I	 .I	 	I	 I	V 188++-.!3T5TT3]..t{{;<]..t{{;<]..t{{;<]..t}}=>HHLL!!HH/4=66BRS''6"	 " 
 HHLL!!HH/4=66BRS''6"	 " 
 >>%%44>$O$O2aS 3O3DBGGr   rN   )constraintsr  is_purepackc                    [        U5      nSR                  U Vs/ s H  n[        U5      PM     sn5      nUc&  SR                  S/U V	s/ s H  n	SPM     sn	-   5      nSU  SU SU SU SU S	U S
3$ s  snf s  sn	f )Nr  z=rr$  ztl.inline_asm_elementwise('z', 'z', [z	], dtype=z
, is_pure=z, pack=r  )r  r*  r   )
asmrT  r  rU  rV  inputsrF   i
input_refs_s
             r   inline_asm_elementwise&TritonOverrides.inline_asm_elementwise  s     *%0YY71A78
))TF6-B6ac6-B$BCK,SEk]$zlR[\g[hhrszr{  |C  DH  CI  IJ  K  	K  8-Bs   A5A:
c                    SU  S3$ )Nztl_math.cos(r  r   r  s    r   cosTritonOverrides.cos  r  r   c                    SU  S3$ )Nztl_math.sin(r  r   r  s    r   sinTritonOverrides.sin  r  r   c                    [        S5      e)Nz/ops.index_expr not implemented outside a kernelrA  )r   r   r  s      r   
index_exprTritonOverrides.index_expr  s    !"STTr   c                    [        S5      e)Nz+ops.masked not implemented outside a kernelrf  )r  bodyothers      r   maskedTritonOverrides.masked  s    !"OPPr   c                    SU  S3$ )Nzlibdevice.lgamma(r  r   r  s    r   lgammaTritonOverrides.lgamma       #1#Q''r   c                    SU  S3$ )Nzlibdevice.erf(r  r   r  s    r   erfTritonOverrides.erf        s!$$r   c                    SU  S3$ )Nzlibdevice.cosh(r  r   r  s    r   coshTritonOverrides.cosh  r  r   c                    SU  S3$ )Nzlibdevice.sinh(r  r   r  s    r   sinhTritonOverrides.sinh  r  r   c                    SU  S3$ )Nzlibdevice.acos(r  r   r  s    r   acosTritonOverrides.acos  r  r   c                    SU  S3$ )Nzlibdevice.acosh(r  r   r  s    r   acoshTritonOverrides.acosh  r  r   c                    SU  S3$ )Nzlibdevice.asin(r  r   r  s    r   asinTritonOverrides.asin  r  r   c                    SU  S3$ )Nzlibdevice.asinh(r  r   r  s    r   asinhTritonOverrides.asinh  r  r   c                    SU  SU S3$ )Nzlibdevice.atan2(r  r  r   rX  r  s     r   atan2TritonOverrides.atan2       "!Bqc++r   c                    SU  S3$ )Nzlibdevice.atan(r  r   r  s    r   atanTritonOverrides.atan   r  r   c                    SU  S3$ )Nzlibdevice.atanh(r  r   r  s    r   atanhTritonOverrides.atanh%  r  r   c                    SU  SU S3$ )Nzlibdevice.copysign(r  r  r   r  s     r   copysignTritonOverrides.copysign*  s     %QCr!A..r   c                    SU  S3$ )Nzlibdevice.erfc(r  r   r  s    r   erfcTritonOverrides.erfc/  r  r   c                    SU  S3$ )Nzlibdevice.erfinv(r  r   r  s    r   erfinvTritonOverrides.erfinv4  rq  r   c                    SU  SU S3$ )Nzlibdevice.hypot(r  r  r   r  s     r   hypotTritonOverrides.hypot9  r  r   c                    SU  S3$ )Nzlibdevice.log10(r  r   r  s    r   log10TritonOverrides.log10>  r  r   c                    SU  S3$ )Nzlibdevice.log2(r  r   r  s    r   log2TritonOverrides.log2C  r  r   c                    SU  SU S3$ )Nzlibdevice.ldexp(r  z.to(tl.int32))r   )rX  ns     r   ldexpTritonOverrides.ldexpH  s    !!Bqc88r   c                    SU  SU S3$ )Nzlibdevice.nextafter(r  r  r   r  s     r   	nextafterTritonOverrides.nextafterL  s     &aS1#Q//r   c                    U  SU 3$ Nr)  r   r!  s     r   logical_andTritonOverrides.logical_andQ      Cs|r   c                    U  S3$ )Nz == 0r   rI  s    r   logical_notTritonOverrides.logical_notU  s    E{r   c                    U  SU 3$ Nz | r   r!  s     r   
logical_orTritonOverrides.logical_orY  r  r   c                    SU  SU S3$ )Nr>   ^ r  r   r!  s     r   logical_xorTritonOverrides.logical_xor]  s    1#S1~r   c                    U  SU 3$ r  r   r!  s     r   bitwise_andTritonOverrides.bitwise_anda  r  r   c                    SU  3$ )N~r   r  s    r   bitwise_notTritonOverrides.bitwise_note  s    1#wr   c                    U  SU 3$ r  r   r!  s     r   
bitwise_orTritonOverrides.bitwise_ori  r  r   c                    U  SU 3$ )Nr  r   r!  s     r   bitwise_xorTritonOverrides.bitwise_xorm  r  r   c                    U  SU 3$ )Nz << r   r!  s     r   bitwise_left_shift"TritonOverrides.bitwise_left_shiftq      D}r   c                    U  SU 3$ )Nz >> r   r!  s     r   bitwise_right_shift#TritonOverrides.bitwise_right_shiftu  r  r   c                     SU S3nSU  SU S3$ )Nr>  ).to(tl.uint32)ztl.rand(r  r  r   seedr   s     r   randTritonOverrides.randy  s%    VHO,$r&++r   c                     SU S3nSU  SU S3$ )Nr>  r  z	tl.randn(r  r  r   r  s     r   randnTritonOverrides.randn~  s%    VHO,4&6(!,,r   c           	     ,    SU S3nSU  SU SU SU S3	$ )Nr>  r  ztriton_helpers.randint64(r  r  r   )r  r   lowhighs       r   	randint64TritonOverrides.randint64  s1    VHO,*4&6("SED6KKr   c                    [        S5      e)Nz.ops.load_seed not implemented outside a kernelrf  )r   r   s     r   	load_seedTritonOverrides.load_seed  s    !"RSSr   c                X    [         R                  R                  (       a  SU  S3$ SU  S3$ )Nz	tl.rsqrt(r  zlibdevice.rsqrt(r   r  s    r   rsqrtTritonOverrides.rsqrt  s/     ==qc##%aS**r   c                    SU  S3$ )Nzlibdevice.log1p(r  r   r  s    r   log1pTritonOverrides.log1p  r  r   c                    SU  S3$ )Nzlibdevice.tan(r  r   r  s    r   tanTritonOverrides.tan  ru  r   c                   [         R                  R                  R                  R	                  U 5      nU(       a  [        US5      (       a  UR                  nOS n[        R                  (       aJ  [        R                  R                  (       a+  [        5       S:  a  U[        R                  :w  a	  Ub  SU  S3$ SU  S3$ )Nr  )r      zlibdevice.fast_tanhf(r  zlibdevice.tanh()rL   r   r   r   getr   r  r"   r  r   r  r  r   r   )rX  r   r  s      r   tanhTritonOverrides.tanh  s     ((,,**..q1ww00MMEE  !!"$v-&! +1#Q//$QCq))r   c                    SU  S3$ )Nztl.sigmoid(r  r   r  s    r   sigmoidTritonOverrides.sigmoid  r  r   c                    SU  SU  SU  S3$ )Nz(libdevice.signbit(z) != 0) if (z).dtype is tl.float32 else z < 0r   r  s    r   signbitTritonOverrides.signbit  s#     "!L3NqcQUV	
r   c                    SU  SU S3$ )Nzlibdevice.fmod(r  r  r   r!  s     r   fmodTritonOverrides.fmod  s     !2aS**r   c                    SU  SU S3$ )Nr,  r  r  r   r!  s     r   powTritonOverrides.pow  s      s"QCq))r   c                    SU  S3$ )Nztl_math.log(r  r   r  s    r   logTritonOverrides.log  r  r   F)r   c                    SU  S3$ )Nzlibdevice.isinf().to(tl.int1)r   r  s    r   isinfTritonOverrides.isinf       "!M22r   c                    SU  S3$ )Nzlibdevice.isnan(r   r   r  s    r   isnanTritonOverrides.isnan  r  r   c                    SU  S3$ )Nzlibdevice.nearbyint(r  r   r  s    r   roundTritonOverrides.round  s     &aS**r   c                    SU  S3$ )Nr  r  r   r  s    r   floorTritonOverrides.floor  r  r   c                H    U  SU 3nU  SU 3nSU  SU SU SU SU SU S	3$ )
Nr  r  z
tl.where((z
 < 0) != (z < 0), tl.where(z != 0, z - 1, ), r  r   )rI  rJ  r  rems       r   floordivTritonOverrides.floordiv  sV    
 D}3qclA3j+;C5vVTXSYY\]a\bbcddr   c                l   [         R                  " S[        R                  5      n[         R                  " [         R
                  " X5      [        R                  5      n[         R                  " [         R
                  " X5      [        R                  5      n[         R                  " X#5      nU SU  S3$ )Nr   r  .dtype))r  r  r   r  r  ltr  sub)rX  zleftrightr  s        r   signTritonOverrides.sign  so    LLEKK(||SVVA\EJJ7cffQlUZZ8ggd"d1#W%%r   c                    SU  S3$ )Nr  r  r   r  s    r   truncTritonOverrides.trunc  r  r   c                    U  SU 3$ )Nr  r   r!  s     r   truncdivTritonOverrides.truncdiv  s     D}r   c                    SU  S3$ )Nr   r  r   r  s    r   ceilTritonOverrides.ceil  r  r   r   NT)r  r  r  Optional[torch.dtype])r  r  r  r  )Vr   r   r   r   r   mathr  e_LOG_2_Estaticmethodr  r  r  r   r  r  r  r  r  r  r	  r  r  r  r"  r  r(  r4  r   r   r]  r`  rc  rg  rl  ro  rs  rw  rz  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r"  r   r   r   r   r  r  I  s   Oyy H ,0	:&:& ):& :&x    D D" < < #  #  ,   )  ) &  & '  ' "  "  " 7 7 7 7 * * xH xHt "&emmTPQK K #  # #  # U U Q Q (  ( %  % &  & &  & &  & '  ' &  & '  ' ,  , &  & '  ' /  / &  & (  ( ,  , '  ' &  & 9 9 0  0                     , , - - L L T T +  + '  ' %  % *  *& "  " 
 
 +  + *  * #  # /3 0 3 /3 0 3 +  + '  ' e e & & '  '  
 &  &r   r  r   c                     ^  \ rS rSrSrU 4S jr\\R                  S 5       5       r	\S 5       r
\S 5       r\S 5       r\S 5       r\S	 5       r\          SS
 j5       rSrU =r$ )r8  i  a  Map element-wise ops to Triton within a TritonKernel

Unlike TritonOverrides, these assume the code is going to be inserted into
the body of the main triton kernel and so it may use indexing and mask
variables which are assumed to already be defined in the current scope.
c                F   > [         TU ]  " U0 UD6  U R                  5         g r   )r  r  _setup_libdevice_routing)r  r  r  r  s      r   r  TritonKernelOverrides.__init__  s#    $)&) 	%%'r   c                  ^ SSK Jm  [        R                  R                  R
                   H  n[        X5      (       d   e[        X5      nU4S jnUS:X  aE  [        TS5      (       d   e[        R                  " X2US9nXl
        [        X[        U5      5        Mq  S n[        R                  " XRUS9nXl
        [        X[        U5      5        M     g)z<Set up routing to libdevice implementations for fp64 inputs.r   )OpDecompositionsc                   > U R                   [        R                  :w  a  U" U 5      $ [        TU5      " U 5      R                  $ r   )r  r   r   r  r  )rX  _original_impl_fn_namer/  s      r   decomposition_routerLTritonKernelOverrides._setup_libdevice_routing.<locals>.decomposition_router!  s7    77emm+)!,,"#3X>qAGGGr   r  )r1  r2  c                `    U R                   [        R                  :X  a	  SU SU  S3$ U" U 5      $ )Nz
libdevice.r>  r  )r  r   r   )rX  r1  r2  s      r   dtype_routerDTritonKernelOverrides._setup_libdevice_routing.<locals>.dtype_router0  s2    77emm+'z1#Q77)!,,r   N)torch._inductor.codegen.commonr/  r   	_inductorr%   op_requires_libdevice_fp64r   r  	functoolspartialr   setattrr)  )r   fn_nameoriginal_implr3  fnr6  r/  s         @r   r,  .TritonKernelOverrides._setup_libdevice_routing  s    
 	D,,GGG3((((#C1MH )#/;;;;&&(QX &l2&67- ""WB "KC,r"23; Hr   c                j    [         R                  R                  5       nS/U-  nU R                  XUS9$ )NrN   r  )rL   r   r   r  )r   r  r  r   r   s        r   r  TritonKernelOverrides.constant<  s7    
 xx**,d
##E#>>r   c                   [         R                  R                  USS S9n[        U[        5      (       d   eUR
                  (       a  UR
                  nO[        R                  UR                  5      n[         R                  R                  5       nU[        R                  [        R                  4;  a  UOUn[        R                  R                  n S[        R                  l        [         R                  R                   R#                  [         R                  R$                  UR&                  [)        U5      UUS9nU[        R                  l        U[        R                  [        R                  4;  ae  [         R                  R                   R#                  [         R                  R$                  U R+                  Xr5      [-        U5      UR.                  S9nOUnUR0                   Ht  n[3        U[4        R6                  5      (       d  M$  [        R8                  " U[         R                  R                   R:                  UR<                     R>                  5      nMv     X%:w  a[  [         R                  R                   R#                  [         R                  R$                  U R+                  Xu5      UUR.                  S9nUR@                  Ul         U$ ! U[        R                  l        f = f)NF	block_ptrtma_compatibility_checkerr  r  r   r5  )!rL   r   indexingr   r  r	  r   r   r  get_index_dtype_as_torch_dtyper   r  int64r"   test_configsruntime_triton_dtype_assertr   r9  r:  r  r=   r  rH   r   r   r   r   r   promote_typesr   r   r  r  )	r   r   r  rI  r   r  origr   	index_vars	            r   rg   TritonKernelOverrides.index_exprE  s,   88$$ET % 
 (O4444   ))E!11(..AE hh==?u{{EKK&@@k "">>
	C>CF;((,,''  "",T2 ( C ?CF;ekk22((,,''  S()%0ii	 ( C  E!..	!)TXX66!//qxx||77	GMME / #hhll++HH$$LL2%))	 ,  !**
= ?CF;s   A+K K.c           
     <   U bx  [         R                  R                  b]  [        R                  R
                  R                  [        R                  R                  U  S3[         R                  U R                  S9n UR                  R                  SS9nU(       d   S5       eSnU HH  nUR                   H5  nUR                  S:w  d  [        UR                  S   5      (       d  M2  S	n  MF     MJ     U(       a  S OUn[        R                  R                  XS
9 nU" 5       n	S S S 5        U(       a  W	R                   R"                  (       a  [        U5      n[        R                  R
                  R                  [        R                  R                  SU	 S[%        U5       SU	 S3[&        R(                  " U5      U	R*                  U	R                  S9n[,        R.                  " WX5      n
OW	n
U
R0                  R3                  W5        U
$ ! , (       d  f       N= f)N.to(tl.int1)r5  output)opz)graph for body does not contain an outputFloadrN   Tr  r  z.shape, r  r  rH  )r   r  r  rL   r   r   r9  r:  r   r   rZ  
find_nodesr  targetrj   
mask_loadsr  is_boolr`   r    wrapr  r  r(  r  discard)r  rj  rk  nodes
need_wherenoder  r  new_maskru  r  s              r   rl  TritonKernelOverrides.masked  s    1 1 =88<<((  &%jjjj	 ) D 

%%%2AAAu
 Dyy::'+CCHHQK+P+P!%J !  #XX   3xVF 4 }}$$UHHLL))  6((=+?*@6('R"''.llll * E ))Hf4CCh'
) 43s   H
Hc                    [         R                  R                  R                  U 5      nSU S[         R                  R                  R	                  SU5       S3$ )Ntl.load( + load_seed_offsetr  )rL   r   r  inputseed_offset)r   r   r   s      r   r  TritonKernelOverrides.load_seed  sI    hhmm!!$'se3qxx}}889KVTUUVW	
r   c                *   SU  S3n[         R                  R                  R                  U5      =n(       a  U$ [         R                  R                  R	                  U R
                  U R                  S9n[         R                  R                  R	                  [        R                  U R                  S9n[         R                  R                  R                  U SU SU  S35        [         R                  R                  R                  XU45        X44$ )Nzfrexp(r  r5  r  z = triton_helpers.frexp()rL   r   r   try_getnewvarr  r   r   r  r:  r   put)rX  	cache_keycse_valmantissaexponents        r   frexpTritonKernelOverrides.frexp  s    QCqM	hhll**95575N88<<&&QWWAGG&D88<<&&U[[&H	""j8*$<QCqA	
 	
x$89##r   c                    [         er   rf  )r   r   r  
extra_metas       r   partial_accumulate(TritonKernelOverrides.partial_accumulate  s
     "!r   r   )
r   r   r   r   r  rT   ru  dict[str, Any]r   r   )r   r   r   r   r   r  r   r;  cacher,  r  rg  r)  rl  r  rr  rv  r   r  r  s   @r   r8  r8    s    ( __"4  "4H ? ? ; ;z , ,\ 
 
 $ $ """ " #	"
 
" "r   r8  c                  V    \ rS rSr% SrS\S'   S\S'   SS jrSS	.SS
 jjrS rS r	Sr
g)HelperFunctionsi  z#An ordered set of helper functions.zdict[str, str]_templates_seen	list[str]finalized_helpersc                     0 U l         / U l        g r   r|  r~  r  s    r   r  HelperFunctions.__init__  s    !!#r   _triton_helper_fn	base_namec                   U R                   R                  U5      nUb  U$ U [        U R                  5       3nX@R                   U'   U R                  R	                  UR                  US95        U$ )a  This accepts a function definition with the function name
left as a format specifier e.g.

    @triton.jit
    def {name}(arg0, arg1):
        return arg0 + arg1

We add the templated code to the function set and return the name
assigned to that function.

)r   )r|  r  r   r~  r  r  )r  template_coder  existing_namer   s        r   addHelperFunctions.add  su     ,,00?$  S!7!789:.2]+%%m&:&:&:&EFr   c                ,    [        U R                  5      $ r   )iterr~  r  s    r   __iter__HelperFunctions.__iter__  s    D**++r   c                     U R                   U   $ r   )r~  )r  r  s     r   __getitem__HelperFunctions.__getitem__  s    %%c**r   r  Nr   r   )r  r   r   r   )r   r   r   r   r   r   r  r  r  r  r   r   r   r   r{  r{    s+    -##  $ 4G ,,+r   r{  c                     \ rS rSr% Sr\R                  " \S9rS\	S'   \R                  " \S9r
S\	S'   \R                  " \S9rS\	S'   \R                  " \S9rS\	S'   \R                   " S	 S
5      5       r\R                   " S S\5      5       r\R                   " S S\5      5       rSS jr      SS jrSS jrSrg)r4  i  zE
Class representing ND block dimensions, for block pointer analysis.
)default_factoryrY  r   rE  rH  rL  c                      \ rS rSr% S\S'   S\S'   \R                  " SS9rS\S'   S r\	S	 5       r
\\      SS
 j5       5       rS rS rSrg)r>  i  r7  original_stridesr  F)initr  c                   [        U R                  5      S:  d   e[        U R                  5      [        U R                  5      :X  d   e[        [	        [        U R                  5      5      5      nU R                  U:H  U l        [        U R                  5       VVs0 s H  u  p#X2_M	     nnn[	        [        U5      5       Vs/ s H  nXB   PM	     snU l        g s  snnf s  snf r  )r   r  r  rk  rm  _is_identity	enumerater  )r  identity_sort_idxrZ  r/  sorted_dims_by_strides_maps        r   __post_init__*BlockParameters.StrideSorter.__post_init__
  s    t,,-111t}}%T-B-B)CCCC $U3t/D/D+E%F G $1B BD <ET]];S)T;S41!$;S&)T s#=>?$?A +-?$D  *U$s   CC!c                    U R                   $ r   )r  r  s    r   r  (BlockParameters.StrideSorter.is_identity  s    $$$r   c                    g)zBCreate a `StrideSorter` that can be used to sort block parameters.Nr   r   r  rW  s      r   rv  #BlockParameters.StrideSorter.create  s    r   c                p    U R                   (       d  U R                   Vs/ s H  o!U   PM	     sn$ U$ s  snf r   r  r  r  attrrZ  s      r   sort!BlockParameters.StrideSorter.sort#  0    ##)-7AQ77K 8   3c                p    U R                   (       d  U R                   Vs/ s H  o!U   PM	     sn$ U$ s  snf r   r  r  s      r   r  #BlockParameters.StrideSorter.revert(  r  r  )r  r  Nr  zlist[Union[int, sympy.Expr]]rW  rp   r   r>  )r   r   r   r   r   dataclassesfieldr  r  r1  r  r   r   rv  r  r  r   r   r   r   StrideSorterBlockParameters.StrideSorter  s    ##%0%6%6E%BB	 
	% 
	% 
		U#?	ULT	U)	U 
 
	U
	
	r   r  c                  H   ^  \ rS rSrU 4S jr\      SS j5       rSrU =r$ )$BlockParameters.IdentityStrideSorteri-  c                "   > [         TU ]  5         g r   r  r  r  r  s    r   r  2BlockParameters.IdentityStrideSorter.__post_init__/      G!#r   c           
     F    U " U[        [        [        U5      5      5      S9$ )Nr  r  )rk  rm  r   r  s      r   rv  +BlockParameters.IdentityStrideSorter.create2  s'     !1eC(8$9:; r   r   r  )	r   r   r   r   r  r   rv  r   r  r  s   @r   IdentityStrideSorterr  -  s4    	$ 
	#?	LT	)	 
	r   r  c                  L   ^  \ rS rSrSrU 4S jr\      SS j5       rSrU =r	$ )+BlockParameters.TensorDecriptorStrideSorteri;  zD
Sorts BlockParameters dimensions with strides in descending order.
c                "   > [         TU ]  5         g r   r  r  s    r   r  9BlockParameters.TensorDecriptorStrideSorter.__post_init__A  r  r   c                    [        [        [        U5      5      5      n [        R                  " X!SS9nU " UUS9$ ! [
         a    Un Nf = f)aV  
If the strides are not all known constants or if the strides are already
sorted in descending order, return identity sort.

For example if block_shape @ strides is [ZBLOCK, XBLOCK, YBLOCK] @ [8, 1, 16]
The indices to sort the strides in descending order will be [2, 0, 1].
The indices to revert back to the original order will be [1, 2, 0].
T)reverser  )rk  rm  r   r%   ri  rj  )r   r  rW  identity_sortr  s        r   rv  2BlockParameters.TensorDecriptorStrideSorter.createD  s^     !s+;'<!=>M	) !,,YRVW
 !1! 	 " )()s   ; A
	A
r   r  )
r   r   r   r   r   r  r   rv  r   r  r  s   @r   TensorDecriptorStrideSorterr  ;  s9    		$ 
	#?	LT	)	 
	r   r  c                    [        U 5      n[        S X4 5       5      u  p4U" S0 U Vs0 s H  oUX5   XE   -   _M     snD6$ s  snf )z 
Concatenates block parameters.
c              3  N   #    U  H  n[         R                  " U5      v   M     g 7fr   )r  asdictr?  s     r   r  *BlockParameters.__add__.<locals>.<genexpr>f  s     BMq[''**Ms   #%r   )rF  r   )r  rk  r   rI  rJ  r  s         r   __add__BlockParameters.__add__a  sL     4jBTMBB9a8as16AF?*a8998s   Ac                    UR                  U R                  US9n[        S0 [        R                  " U 5      R                  5        VVs0 s H  u  pEXCR                  U5      _M     snnD6nXc4$ s  snnf )z
Sort `BlockParameter` with stride_sorter_cls. Returns block parameters
as well as a `StrideSorter` which contains information on how the sort
can be reverted.
)rW  r   )rv  rH  r4  r  r  r  r  )r  rV  rW  r?  r  r  r5  s          r   r^  ,BlockParameters.maybe_sort_with_stride_orderi  s}     *000S  
 !, 2 24 8 > > @ @HC '',, @
 $$s   	A/c                   ^ U4S jn[        S0 [        R                  " U 5      R                  5        VVs0 s H  u  p4X2" U5      _M     snnD6$ s  snnf )z1
Remove dimensions where removable_dims is True.
c                d   > [        U T5       VVs/ s H  u  pU(       a  M  UPM     snn$ s  snnf r   )r  )ititemis_removableremovable_dimss      r   filter_dims0BlockParameters.remove_dims.<locals>.filter_dims  s7     +.b.*A*A&D# *A  s   ,,r   )r4  r  r  r  )r  r  r  r  r  s    `   r   r]  BlockParameters.remove_dimsz  sS    
	  
5@5G5G5M5S5S5UV5UsK$$5UV
 	
Vs   Ar   N)rk  r4  r   r4  )rV  ztype[StrideSorter]rW  rp   r   z4tuple[BlockParameters, BlockParameters.StrideSorter])r  r;  r   r4  )r   r   r   r   r   r  r  rk  r   r   rE  rH  rL  	dataclassr  r  r  r  r^  r]  r   r   r   r   r4  r4    s     *//EEE$/$5$5d$KK!K + 1 1$ GGG + 1 1$ GGG& & &P |   #l # #J:%!3%@H%	=%"
r   r4  c                  4    \ rS rSrSrS rS	S jrS rS rSr	g)
"CooperativeReductionWorkspaceCachei  z
The scratch space used for cooperative reductions can be reused
after two reduction loops.  This keeps track of what can be reused.
c                    Xl         / U l        / U l        [        R                  " [        R
                  5      U l        SU l        SU l        g r  )	r  current_loop
prior_loopcollectionsdefaultdictdequeready_for_reuse
loop_countstore_count)r  r  s     r   r  +CooperativeReductionWorkspaceCache.__init__  s>    	*66{7H7HIr   c                    U R                   R                  U5      nU(       a  UR                  5       $ U R                  R	                  US5      u  p4nU R
                  R                  XU45        X54$ r  )r  r  popleftr  	workspacer  r  )r  nbytescachedws_namer\  	ws_offsets         r   allocate+CooperativeReductionWorkspaceCache.allocate  se    %%))&1>>## $		 3 3FE BI  &9!=>##r   c                    U R                    H%  u  pnU R                  U   R                  X#45        M'     U R                  U l         / U l        U =R                  S-  sl        g NrN   )r  r  r  r  r  )r  r  r  r  s       r   on_loop_end.CooperativeReductionWorkspaceCache.on_loop_end  sT    *.//&FY  (//0DE +:++1r   c                H    U R                   nU =R                   S-  sl         U$ r  )r  )r  priors     r   increment_store_count8CooperativeReductionWorkspaceCache.increment_store_count  s#      Ar   )r  r  r  r  r  r  N)r  r   )
r   r   r   r   r   r  r  r  r  r   r   r   r   r  r    s    
$r   r  c                  ,    \ rS rSr% S\S'   S rS rSrg)FixedTritonConfigi  zdict[str, int]r"   c                     U R                   U   $ r   r"   r  r  s     r   r  FixedTritonConfig.__getitem__  s    {{4  r   c                    XR                   ;   $ r   r  r  s     r   __contains__FixedTritonConfig.__contains__  s    {{""r   r   N)r   r   r   r   r   r  r  r   r   r   r   r  r    s    !#r   r  c                  "    \ rS rSrSrSS jrSrg)	TritonCSEi  zy
Subclasses CSE to apply the current load mask to the cache key to avoid CSEing
variables across separate masked blocks.
c                b    [         R                  R                  =n(       a  XR                  4$ U$ r   )rL   r   
_load_maskr   )r  rn  r  s      r   augment_keyTritonCSE.augment_key  s*    88&&&4&yy))r   r   N)rn  r   r   zUnion[str, tuple[str, str]])r   r   r   r   r   r  r   r   r   r   r  r    s    
r   r  c                  r    \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   S
 r  SS jr    SS jrSS jr	Sr
g)TMACompatibilityCheckeri  zG
Checks if the TMA API can be used for load / store triton operations.
TritonKernelr   r  r  r   r  forcec                    SU l         g )Nz2Cannot use TMA descriptor for load / store since: failed_debug_prefixr  s    r   r  %TMACompatibilityChecker.__post_init__  s
    #W r   c                ,   U R                   (       a  g[        R                  R                  5       R                  S:X  ah  [
        R                  R                  5       S   S:  aC  [        R                  R                  (       a$  [        R                  (       a  [        5       (       d!  [        R                  SU R                  5        gU R                   (       a<  U R"                  R$                  (       a!  [        R                  SU R                  5        gg)NTrM  r   	   z}%s Requires triton>=3.4.0, a CUDA device with cc>=9.0 and `use_tensor_descriptor` and `assume_aligned_inputs` options enabledFz/%s stores with `no_x_dim` cannot load 16 bytes.)r  rL   rZ  get_current_device_or_throwrF  r   rM  get_device_capabilityr"   r   use_tensor_descriptorassume_aligned_inputsr   r  debugr
  r  r   r`  r  s    r   can_use_tma#TMACompatibilityChecker.can_use_tma  s     ::GG//166&@

00215:33,,)++ II[ ((  >>dkk22IIA(( r   c           
     
   U R                   (       aD  UR                   Vs/ s H,  n[        R                  R                  R                  U5      PM.     nnOUR                  n[        R                  R                  R                  US   [        R                  " S5      5      (       d"  [        R                  SU R                  U5        gU R                  R                  nUSS  H  n[        R                  R                  R                  [        XT-  S[        R                  " S5      5      [        R                  " S5      5      (       a  Mi  [        R                  SU R                  UU5          g   UR                  S   n[        R                  R                  R                  U[        R                  " S5      5      (       a,  [        R                  S	U R                  UR                  5        gSnSnUR                    H1  n	["        R$                   H  n
['        X5      (       d  M  U	nU
n  M/     M3     U(       a  U(       d   U S
["        R$                   35       eU R(                  R*                  (       Ga  U R,                  (       Gd   [.        U   nSnU R(                  R0                   H4  nUR2                  (       d  M  UR4                  U:X  d  M(  UR6                  n  O   Uc   eU R(                  R9                  U5      nUR;                  X~05      U-  n[        R                  R                  R=                  U[        R                  " S5      5      (       d-  [        R                  SU R                  UR                  U5        g g  S       SS jjnXd-  S-
  nUR?                  [@        U5      R?                  [        U5      n[C        [E        [        RF                  " UUS5      5      5      nUU R(                  RI                  [.        U   5      :  a#  [        R                  SU R                  UU5        gU R(                  RK                  U5      n	U R(                  RL                  (       aZ  UU R(                  RL                  U	   :  a;  [        R                  SU R                  U	U R(                  RL                  U	   U5        g g[O        UU R(                  RP                  RS                  U	S5      5      U R(                  RP                  U	'    gs  snf ! [T         a.    [        R                  SU R                  UR                  5         gf = f)z
Check if the block parameters are valid for TMA.
If force, we allow relying on symbolic hints equivalent
to what we check for Triton templates.
rU  rN   z=%s TMA API requires innermost stride to be 1. Strides are: %sFN   r   zU%s TMA API requires outer strides to be 16 byte aligned. Dtype bytes: %d, strides: %sz>%s innermost block shape cannot load 16 bytes. Block shape: %sz, expr must contain a single block type from zj%s persistent reduction innermost block shape cannot load 16 bytes. Block shape: %s, persistent RBLOCK: %dc                $    X-  nU(       a  X2-  nU$ r   r   )rX  r  r  r	  s       r   indexing_div_repQTMACompatibilityChecker.are_block_parameters_compatible.<locals>.indexing_div_repd	  s    
 %C!gJr   zC%s the minimum block size to satisfy expression %s is too large: %dzT%s For block %s, fixed config block size %d is smaller than the minimum required: %dz?%s innermost block shape cannot load 16 bytes. Block params: %sTr   )rX  r   r  r   r  zOptional[sympy.Expr]r   r   )+r  rH  rL   rZ  rR  symbolic_hintr[  r   r   r  r  r
  r  r  r   rE  r   r   r   r   r   persistent_reductionr  r   ro  rf  r   numel_get_persistent_RBLOCKsubsstatically_known_geqreplacer   r5   r  nsolve	max_blockr  fixed_configr{   tma_min_block_sizesr  r  )r  block_paramsstrH  element_sizers  innermost_block_shapeinnermost_block_typeinnermost_block_symtblock_type_str
block_symtinnermost_tree_prefix
tree_numelr  persistent_rblockinnermost_block_bytesr  
solve_exprsolve_expr_simplifiedmin_block_sizes                       r   are_block_parameters_compatible7TMACompatibilityChecker.are_block_parameters_compatible  s    ::=I=Q=Q=Qr  ..r2=Q  G #**G ww77U]]STEUVVIIO((
 zz**crlF77##;; 5q%--:KLa   		k,, 	  #" !- 8 8 < 7733!5==#3
 
 IIP((((
 ##3@@N+77
!.==+9(+5(	 8 A $(< 	
$%%QR_RkRkQlm	
<
 ;;+++DNNN %//C$D!J[[,,>>>xx#88%&WW
	 -
 ))) $ B B: N%**,@+TU " 77##88%u}}R'8  		 A,, ,,%	 f IB
 /3!! ,  	 3ABF
(2(:(:.)'/+;< & "110" "DKK$9$934%  II]00-&	 !!%!9!9:N!O;;++%(@(@(PP		< 44* KK44^D*  % Q4  GJ&77;;NANGDKK33NC Gv  		U,, ,,
 s&   3TB2T BT AT 5UUc                    U R                   $ )a  
Can you lift the make_tensor_descriptor
call to the top of the kernel? This requires
being certain that all of the shape, stride,
and block_shape information is handled in arguments
or top level definitions.

Right now we assume this is always possible if you force TMA.
)r  r  s    r   rA   TMACompatibilityChecker.can_lift	  s     zzr   r	  Nr/  )r%  r4  r   r   )r   r   r   r   r   r   r  r  r4  rA  r   r   r   r   r  r    sS     OKX!	!Fm%m 
m^
r   r  c                    ^  \ rS rSr% Sr\rS\S'   \r	S\S'   Sr
\rSrS	\S
'        S\         S]U 4S jjjr\S^S j5       r\S^S j5       rS_S jrS^S jrS rS rS rS rS^S jrS r\S`S j5       rSSSSSS.     SaS jjr Sb       ScS jjrSbS jr        SdS jrS r Sr!Sr"S  r#SS!.S" jr$S# r%SeS$ jr&    SfS% jr'SgS& jr( Sh         SiS' jjr)SjS( jr*S) r+SkS* jr,  Sl               SmS+ jjr-S`S, jr.SnS- jr/      SoS. jr0          SpS/ jr1  SqS0 jr2SqS1 jr3S2 r4S3 r5S4 r6S5 r7  SrS6 jr8      SsS7 jr9      StS8 jr:        SuS9 jr;          SvS: jr<SeS; jr=S< r>SwS= jr?SxS> jr@S? rAS@ rB\SA 5       rCShS`SB jjrD\SC 5       rE\SD 5       rFSE rGSySF jrHSG rI Sz     S{SH jjrJSjSI jrKS|SJ jrLS}SK jrMS~SL jrN      SSM jrOS~SN jrPSSO jrQSSP jrRSSQ jrSS^SR jrTSSS jrU\VSwST j5       rWSSU jrXSSV jrY\VSSW j5       rZSSX jr[SSY jr\      SSZ jr]S[r^U =r_$ )r  i	  z\A class to represent a triton kernel and helpers to generate
triton kernel programmatically
r{  helper_functionszCallable[[sympy.Expr], str]kexprTNzOptional[bool]3transpose_discontiguous_tensor_descriptors_overrideFc                  > X0l         X@l        X`l        [        TU ]  " U40 UD6  [        U R                  U R                  5      U l        0 U l	        [        5       U l        [        5       U l        [        5       U l        [        [           " 5       U l        X l        [$        R&                  " 5       U l        [*        [,        [,        4   " 5       U l        [1        5       U l        [4        R6                  " [*        5      U l        [*        [,        [:        4   " 5       U l        XPl        [4        R@                  " 5       U l!        SU l"        SU l#        [        [H           " 5       U l%        S U l&        U RN                  (       a  U RQ                  U RR                  5        U RT                  (       a  U RW                  5         U RY                  5         U RT                  (       a  U R[                  5         SU l.        / U l/        g )Nr   F)0optimize_maskr#  is_combo_kernelr  r  r  newvar_prefixsuffixr   prologue_cacherV   prologuepost_loop_combinepost_loop_storer   r	   outside_loop_varsr  r  countblock_ptr_iddictr   block_ptr_to_bufferr{  r9  r  r  pointer_advancementsr  r$  hint_overrideCounter_load_counts_pdl_load_index_pdl_has_waitr/   autotune_hintstriton_metarc  codegen_reduction_numelsrj  cooperative_reductioninit_cooperative_reductioncodegen_range_treeinit_cooperative_reduction_maskhas_load_with_contiguous_rdimstores_with_contiguous_rdim)	r  tilingr  r=  r#  rK  r>  r  r  s	           r   r  TritonKernel.__init__	  sx    $1(%4*6*T//=.0(6(81?1A/=/?!+C!2#6 %OO-#'S>#3  / 1##D) 	! $(S>#3 *6A6I6I6K " )6859  ))$))4%%++-!%%002-2*68(r   c                   [         R                  (       d   [         R                  R                  (       d  gU R                  nU Vs/ s H&  n[        U[        R                  5      (       d  M$  UPM(     nn[        U5      S:X  a  g [        R                  R                  R                  XU5      n[        S U 5       5      $ s  snf ! [         a     gf = f)NFr   c              3  *   #    U  H	  oS :H  v   M     g7fr  r   )r   rs  s     r   r  4TritonKernel._has_stride1_on_rdim.<locals>.<genexpr>
  s     9[6Q;[   )r"   deterministicrL  force_filter_reduction_configsr   r   r   r   r   rL   rZ  rR  stride_varsZeroDivisionErrorrz   )r  support_varsr   reduce_varsra  s        r   _has_stride1_on_rdim!TritonKernel._has_stride1_on_rdim	  s       F$7$7$V$V)) $
#c=#@#@A # 	 
 {q 	''**66u<XK 9[999!
 ! 		s   #C-C*C 
CCc                D    [        S U R                   5       5      (       + $ )Nc              3  8   #    U  H  n[        U5      v   M     g 7fr   )rX   )r   r   s     r   r  >TritonKernel.has_store_with_contiguous_rdim.<locals>.<genexpr>
  s      
0Pd##0P   )r\  rX  r  s    r   has_store_with_contiguous_rdim+TritonKernel.has_store_with_contiguous_rdim
  s'     
040P0P
 
 
 	
r   c                    [        U5      $ r   )rF   )r  r  s     r   dtype_to_strTritonKernel.dtype_to_str
  s    5!!r   c                z    U R                   =(       a)    [        R                  R                  U R                  5      $ r   )rc  rL   choices should_use_cooperative_reductionre  r  s    r   rr  -TritonKernel.should_use_cooperative_reduction
  s-    $$ 
)S)SMM*
 	
r   c                6  ^  T R                   (       d   eT R                   H'  nUR                  c  M  U=R                  S-  sl        M)     T R                  S   nT R                  (       a  [        UT R                  S   5      nT R                  R                  U5      T l        [        T R                  5      T l
        T R                  R                  S5        [        U 4S jT R                   5       5      (       a  T R                  R                  S5        gg)z/One time setup code for cooperative reductions.NrN   rX  r   a              RSPLIT_NEXT_POWER_OF_2: tl.constexpr = triton_helpers.constexpr_next_power_of_2(RSPLIT)
            RSPLIT_IS_POWER_OF_2: tl.constexpr = RSPLIT == RSPLIT_NEXT_POWER_OF_2
            HAS_RSPLIT: tl.constexpr = RSPLIT > 1
            rsplit_id = tl.program_id(0)
            num_rblocks = (rnumel + RBLOCK - 1) // RBLOCK
            rsplit_chunk = (num_rblocks + RSPLIT - 1) // RSPLIT * RBLOCK
            rsplit_start = rsplit_chunk * rsplit_id
            rsplit_end = rsplit_chunk * (rsplit_id + 1)
            c              3  v   >#    U  H.  nUR                   (       d  M  TR                  U5      (       + v   M0     g 7fr   )rf  r,  r   r   r  s     r   r  :TritonKernel.init_cooperative_reduction.<locals>.<genexpr><
  s4      
(   .''---(s   99z>rsplit_end = tl.where(rsplit_end < rnumel, rsplit_end, rnumel))rS  ro  grid_dimrd  r#  r   r  
semaphoressemaphores_namer  %cooperative_reduction_workspace_cacherj  r   rz   r   )r  r   	sem_counts   `  r   rT  'TritonKernel.init_cooperative_reduction 
  s    )))) $$D}}(" % KK$		4+<+<X+FGI#yy33I>5WII6
2 					
  
((
 
 

 IIP
r   c                .   SnU R                   (       d  U S3nU R                  R                  SU 35        U R                  5       (       a  U R                  R	                  S5        g U R                   (       a   eU R                  R                  S5        g )Nz$tl.arange(0, RSPLIT_NEXT_POWER_OF_2)z	[None, :]zrsplit_arange = z                if RSPLIT_IS_POWER_OF_2:
                    rsplit_mask: tl.constexpr = None
                else:
                    rsplit_mask = rsplit_arange < RSPLIT
                zSrsplit_mask = xmask if RSPLIT_IS_POWER_OF_2 else ((rsplit_arange < RSPLIT) & xmask))r`  rj  r   _has_constant_xmaskr   )r  rsplit_aranges     r   rV  ,TritonKernel.init_cooperative_reduction_maskE
  s~    >}},oY7M		.}o>?##%%II }}$$IIer   c                Z   U R                    H}  nUR                  (       d  U R                  XR                  5        M1  U R                  (       d  MD  U R                  R                  UR                   SU R                  U5       35        M     U R                  (       a  [        S U R                    5       5      (       aP  U R                  SSSS9nU R                  U5      nU R                  R                  SU R                  U5       35        g U R                  U R                  5        g g )Nzbase = c              3  8   #    U  H  oR                   v   M     g 7fr   )is_loopr   r   s     r   r  2TritonKernel.codegen_range_tree.<locals>.<genexpr>g
  s     =,<D<<,<rj  baseTr   zrbase = )ro  r  iteration_ranges_codegen_headerrj  rc  r   r   iteration_ranges_ranges_coderz   _get_reduction_symbols_flatten_reduction_indicesr   r  codegen_reduction_indices)r  r   rn_basesrbases       r   rU  TritonKernel.codegen_range_treeZ
  s    $$D<<44T99E&&& 		##{{m74+L+LT+R*ST %   =D,<,<===66Dd 7  77A		  8D,=,=e,D+E!FG ..tyy9 !r   c                    g)z
Indicate whether we need provide numel as arguments for the generated
kernel calls in the benchmark.

Should be true for pointwise/reduction kernels but false for triton
matmul kernels.
Tr   r  s    r   need_numel_argsTritonKernel.need_numel_argsr
  s     r   c                    U R                   =(       a4    [        R                  R                  U R                  U R
                  5      $ r   )rc  rL   rq  should_use_persistent_reductionre  rS  r  s    r   r  ,TritonKernel.should_use_persistent_reduction|
  s5    $$ 
)R)RMM455*
 	
r   c                    U R                   =(       aQ    [        U R                  5      U R                  S-   :H  =(       a%    U R                  =(       a    U R                  S   S:H  $ )NrN   r   )r  r   rd  rb  r#  r  s    r   want_no_x_dimTritonKernel.want_no_x_dim
  sY    %% 1DKK D$;$;a$??1!!1 !!(+q0		
r   c                    g)Nztl.device_assertr   r  s    r   assert_functionTritonKernel.assert_function
  s    !r   )
copy_shapedense_indexingoverride_maskrF  rG  c          
       ^ ^^^^!^"^#^$ T R                  T5      mTR                  nSn[        5       m![        U[        R
                  " S5      S9 GH  n	[        U	[        R                  5      (       d   eU=(       d    [        U	[        R                  5      nU(       a  MQ  [        U	[        R                  5      (       a@  T R                  R                  U	R                      n
T!R#                  U
R$                  5        M  [        U	[        R&                  [        R(                  [        R*                  [        R,                  [        R.                  [        R0                  45      (       a  GM  [        R2                   Vs/ s H  n[        X5      (       d  M  [4        U   PM      nn[7        U5      S:X  a   [7        U5      S:X  d   SU	R                    35       eT!R9                  US    S35        GM     [:        R<                  R>                  =(       d    U=(       d    T R@                  SL=(       a    TS:g  nS	nSn[        5       nT RC                  5        HF  nURE                  URF                  5      (       a  S	nOSnUR9                  URH                   S35        MH     U(       a0  T RJ                  (       a  [:        R<                  RL                  (       d  T(       a  TRO                  5       (       a  U(       d  T R@                  (       d  [7        T!U-
  5      S:X  an  T RQ                  T5      (       dX  U(       aQ  T RR                  S
:X  aA        SS jm"      SU 4S jjm$      SU"U$4S jjm#SUUU!U#U U4S jjnU" 5       nUb  U$ SnSnT RU                  T5      nUU 4S jn[W        T5      (       Ga
  T(       d  [7        T RY                  5       5      S:X  a  U" 5       u  nnOL[[        S/[7        T RY                  5       5      -  5      n[]        S/[7        T RY                  5       5      -  5      nSU SU S3nT R^                  (       d  T R`                  (       a   [        U 4S jT Rb                   5       5      m!O
[        5       m!T R@                  (       a  T!R9                  T R@                  5        [e        UT!UUTUS9$ U(       Ga  U(       Gd  T Rf                  (       Ga  T Rh                  (       Ga  T!Rk                  5       nT R@                  (       a  UR9                  T R@                  5        [        / SQ5      nURm                  U5      (       d|  URo                  U5      nURq                  5       n[        U[r        5      (       d   eURu                  U5        UR#                  UR$                  5        URm                  U5      (       d  M|  S/[7        T RY                  5       5      -  nU H  n[        U[Z        5      (       d   eT RC                  5        H^  nURw                  URH                  5      (       d  M%  URx                  n[        U[z        5      (       d   eT RY                  5       U   UU'   M`     M     SSR}                  [        [Z        U5      5      -   S-   n[]        U5      nSU SU S3nO9U" 5       u  nnSU SU S3nUm!O#U(       d  T(       a  U" 5       u  nnSU SU S3nUm!Uc  U(       d  U(       a  U" 5       u  n nOSnU(       a  [        U/5      m!T R@                  (       a  T!R9                  T R@                  5        T R                  T!5        [e        UT!UUTUS9$ s  snf ) z?
Compute the index and mask to pass to tl.load() or tl.store()
Fr   r  r   rN   r   r  NTtl.int32c                    [         R                  " XR                  5       5      nUc  g[        UR                  /[
        R                  U5      /U/[
        R                  U5      /S9$ )zg
Matches expressions of the form:
    idx = s * xindex

This implies stride (s,), and shape (XBLOCK,).
Nr   rE  rH  rL  )rO   match_affine_block_exprsymbolr4  r  r   r   r   )r  
range_treers  s      r   match_affine_block1TritonKernel.indexing.<locals>.match_affine_block
  sj     -DD,,. >&%++,!.!=!=j!I J#H*;;JGH	 r   c                  >^^ UR                  5       n[        R                  " S[        R                  " [        R
                  U/S9S9u  p4[        S[        UR                  5      U R                  [        X#5      5      U R                  [        X#U5      5      -   5      n[        R                  " XUR                  U5      nUc  gUu  nnn	[        R                  " U5      n
[         R"                  R$                  mTR'                  UR(                  5      m[+        UU4S jU
 5       5      (       a  g[,        R/                  U5      n[1        XS   5      /[3        U
SS USS 5       VVs/ s H%  u  p[        R4                  " [1        X5      U5      PM'     snn-   nU	 Vs/ s H#  n[7        X[,        R9                  U5      05      PM%     nn[;        UUUUS	9$ s  snnf s  snf )
a  
Matches higher-dimensional blocks coming from FloorDiv and ModularIndexing.

Example expression to match:
   sN * ((rindex//(d1 * ... * d(N-1))))
       + s1 * ModularIndexing(rindex, 1, d1)
       + ...
       + s(N-1) * ModularIndexing(rindex, d1 * ... * d(N-2), d(N-1))

This iterates over a block of shape (dN, ..., d1) and stride
(sN, ..., s1). (d1,...,d(N-1)) and (s1,...,sN) are
wildcards that we match.

Note that dN does not appear in the expression, but we solve for it
using range tree numels and the other dims.
zdenom modulo)exclude)r   r!   Nc              3     >#    U  H9  nTR                  UT5      (       + =(       a    TR                  U5      (       + v   M;     g 7fr   )r  statically_known_power_of_2)r   r  r"  rR  s     r   r  ETritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<genexpr>;  sG       ". !==eYOO H$@@GGH!-s   AAr   rN   r  )r  r   symbolsr;  r<  Wildr{   r   r^  rF  r   r   rO   match_mod_div_block_exprr  get_slice_numelsrL   rZ  rR  r"  r   rz   r   r   r   r  MinrE   r   r4  )r  r  rP  denommodulonum_dimsmatch_resultdimsrH  block_index_exprsslice_numelslinear_block_sizer  rq  rE  r   r   r"  rR  r  s                    @@r   match_mod_div_block2TritonKernel.indexing.<locals>.match_mod_div_block
  s   ( '--/	 !&"!))%**ykJ!  
(()HY$>?++oi&OPQ	  3KKj&6&6   ' !	%2CCDI 77++ NN:+<+<=	  ".  
   %2$@$@$L!-A?1 '*,qr*:DH&E&E
 IIg&7?E&E1 !2	3 !2 -*H*H*TU !2	  3 ' +#)	 3s   ,,G"*G c                :   > TT4 H  nU" X5      nUc  M  Us  $    g)zE
Match a block indexing subexpression involving a single range tree.
Nr   )r   r  
match_funcmatchr  r  s       r   match_block_subexpr2TritonKernel.indexing.<locals>.match_block_subexpr^  s3     ''#J 't8E($# r   c            
       > [        TTR                  R                  5        V Vs0 s H  u  pXR                  _M     snn 5      nTR	                  5       nU Vs/ s H'  n[
        R                  " X$R                  5       5      PM)     nn[        S U 5       5      n[        5       n[        X55       H@  u  pH[        UR                  UR                  5      5      S:  a    g T" X5      n	U	c    g Xy-  nMB     U[        U5      -
  n
TR                  T5        [         R"                  R$                  (       a  [&        O[(        n[         R"                  R$                  (       a  Sn[        R*                  nO[-        [.        T5      mTR1                  5       nTR2                   b  TR2                  nO[         R"                  R4                  n[7        TS5      (       a  UTS L-  nU(       a  [        R8                  O[        R*                  nUR;                  UU
UTTR<                  UUS9nU[(        :X  a1  [-        [.        T5      mTR?                  UR@                  5      (       d  g U$ s  snn f s  snf )Nc              3  @   #    U  H  oR                  5       v   M     g 7fr   )r  r  s     r   r  BTritonKernel.indexing.<locals>.match_block_expr.<locals>.<genexpr>  s     *Q[T;;==[   rN   Ftemplate_out_shape)r5  r6  ro  r  rp  rA  rV  )!rE   range_tree_nodesr  r   r   rO   get_subexpr_involving_symbolr  r   r4  r  r   intersectionr   sumfilter_masksr"   r   use_block_ptrr  r  r  r
   r  rA  r;  )transpose_discontiguous_tensor_descriptorr   r  rv  r"  r4  r5  )r0  r  index_relative_to_xyr_indexro  r   index_subexprsrange_symbolsr%  subexprr5  r   options_classrA  rV  transpose_contiguousoptionsr  r  r  r  r  rG  s                   r   match_block_expr/TritonKernel.indexing.<locals>.match_block_exprn  sa   .8$2G2G2M2M2OP2O$!AvvI2OP/+ #557 !,	" !, (DD3[[] !,	  " !+*Q[*Q Q.0%(%EMD =55g6J6JKLqP# 1?F~# *L &F 5s>7JJ !!), }}22 $0  ==..$H(7(L(L%04/1J1-  9AACH PP#$ !TT -
 #MMSS - t%9::,
$0FF, 0 (CC,AA & (..'$* +'"&..%&7 /  !$;;04/1J1- 5TT   ${ Q"s   I .I&c                    > T (       a<  [        T [        5      (       a  T  S3S 4$ SSR                  S T  5       5      -   S-   T 4$ TR                  5       [	        TR                  5       5      4$ )Nz.shaper  r  c              3  8   #    U  H  n[        U5      v   M     g 7fr   r   )r   r7  s     r   r  ATritonKernel.indexing.<locals>._get_expand_str.<locals>.<genexpr>  s     *F:a3q66:rj  r  )r   r   r*  dense_size_strr   rK  )r  r  s   r   _get_expand_str.TritonKernel.indexing.<locals>._get_expand_str  sk    j#..(\0$66*F:*F!FFLjXX**,eD4H4H4J.KKKr   r  r  z, tl.int32)c              3     >#    U  H?  nUR                   (       a  M  TR                  U5      (       a  M.  UR                   S 3v   MA     g7fr  N)rf  r,  r   rv  s     r   r  (TritonKernel.indexing.<locals>.<genexpr>  sA      ' 0,, )595L5LT5R )t{{m4( 0s   A
A
A
)r	  )xmaskymaskzmaskr0_maskr   r  ,r  r  r  r   )r  r   r  rc   r   Optional[BlockParameters])r   r   r  rc   r   r  )r   z Optional[BlockDescriptorOptions])Aprepare_indexingr   r   r+  operator
attrgetterr   r   r   r   r   r   r   r   r   r   r   r  r  r   r   r   r   r   r   r   r   r   r  r"   r   r  r  r   r  var_listr   allow_block_ptrr  r  is_indirect_indexingr  r  r   rK  r   r   r#  r>  ro  r  rc  rJ  copyissubset
differencera  r  r]  r  r   r  r*  r,  r  )%r  r  r  r  r  rF  rG  
index_varsr  r   r   r   prefix_matches
need_dense
have_densehave_loop_varsdense_mask_varsr   r  r  r  r	  r  r  
mask_shapexyzr	tmp_masksr  expand_listr  rq  expand_shape_strr\  r  r  r  r  s%   ```   `                          @@@@r   rI  TritonKernel.indexing
  s    %%e,''

%/\	*(*=*=f*EFCc5<<0000# ~]22(J TXX..((..sxx8  !2!23%%II))JJJJ''
 
 
 !. 9 9" 9%c0 %Jt$ 9  "
 ~&!+>*a/N3CCHH:1NN/!2 3489C GH MM(( ++d* qj	 	 
+5<++-D&&t}}55!%"
4;;-t 45 . t338S8S-1==?? "OOI/0A5--e44  J.!/B*,b!b/Bb*bH .A*  _ _D '(G"
'+%%e,		L !'' S!5!5!78A=+:+<(
L !s4+?+?+A'B!BC
$aS3t/C/C/E+F%FG":,b;GI  D$8$8& ' $ 0 0' 	 'L	doo.")  j$$$)>)>)>. '^^-
??NN4??3!"HI$--d33 * 5 5d ;I#--/C%c+<====&&s+%%cmm4 %--d33  #ec$*>*>*@&AA&D%dC0000 $ 7 7 9??4;;77"&//C#-c3#7#77#7/3/C/C/Ec/JK,	 !: ' !388C[,A#BBSH
$[1.ykJ<qI	+:+<(
L.ykJ<qI	+	J-<->*l*9+R8H7IKI'IZ"1"3<!"M?3I??MM$//*)$%
 	
m"s    ]1]1c                Z   UR                  5       n[        U[        5      (       a  U(       a  U(       a
  US:X  d   eSnO'U(       d  SnOU(       a  US:X  d   eSU< S3nOSU< 3nU R                  (       a3  U R                  S   R
                  (       a  UR                  5       (       d  UR                  (       Ga  UR                  (       a!  X R                  ;   a  U R                  U   nXd4$ UR                  USS9nU R                  R                  U5      nU(       a  [        U5      U4$ [        U R                  5      n	[        U[        5      (       a  SU	 3nOS	U	 3nU R                  R!                  U["        R$                  / S
9n
U R                  R'                  Xz5        [)        X SU 35      nUR                  (       a*  U R*                  R-                  U5        X`R                  U'   OU R.                  R-                  U5        [        U[        5      (       au  XR0                  U'   [2        R4                   HS  nUR7                  U5      n[9        S U 5       5      (       a  M-  U R:                  U   nXn;  d   SU SU S35       eXU'   MU     Xd4$ UR                  U5      nXd4$ )a%  Generate a block pointer or tensor descriptor for Triton kernel operations.

This method creates either a block pointer (for regular Triton operations) or
a tensor descriptor (for TMA operations) based on the indexing type. It handles
caching and reuse of descriptors for performance optimization.

Args:
    name: The name of the buffer/tensor being accessed
    var: The variable name for the pointer
    indexing: Block pointer options or tensor descriptor options containing
             indexing information and boundary check settings
    other: Additional parameters string (e.g., padding options)

Returns:
    A tuple containing:
    - block_descriptor: The generated block pointer or tensor descriptor variable name
    - other: Modified additional parameters string with boundary check options
, other=0.0r   , boundary_check=z, padding_option='zero'rU  F)r}  rF  tma_descriptorr5   = c              3     #    U  HC  n[         R                  R                  R                  U[        R
                  " S 5      5      v   ME     g7fr   N)rL   rZ  rR  r[  r   r   )r   r   s     r   r  1TritonKernel.codegen_block_ptr.<locals>.<genexpr>  sC       +: GG,,DD &a(8  +:s   AAz#duplicate advancement for pointer 'z' at type 'rC  )r  r   r  rc  ro  r  r  rA  rA  r  r   rk  r   nextrG  r  namedvarr   uint64rm  rU   rB  r   rj  rI  r   r   r  r\  rJ  )r  r   r   rI  rk  checkblock_descriptorblock_ptr_line	block_varblock_descriptor_id	named_var	line_bodyr   advance_offsetsadvancementss                  r   codegen_block_ptrTritonKernel.codegen_block_ptrW  s   2 '')h 788 ------+E94KL+E95 !!  $,,##%%  S,?,?%? $(#6#6s#; h  &&e "*e!D HH,,^<	 y>500&*4+<+<&=#h88)23F2G'H$)78K7L'M$ HH--$ELL . 	 ^7(1C3~FV/WX	$$MM++I6/?'',II''	2h88 BF,,-=> !. = =*2*B*B4*H   +:	   %'+'@'@'F/C ABRASS^_c^ddefC :I%56! !>&  &&  (s3&&r   c                ^   SU SUR                    S3n[        [        UR                   UR                  5      5       HG  u  nu  px[        R
                  R                  R                  Xx5      (       d  M8  SUR                  U'   MI     UR                  UUR                   UR                  SSS9nU S[        [        R
                  R                  U5      5       S3n[        U[        5      (       a  SU SU U S3$ U S	[        R                  R!                  UR"                  5       SU S3$ )
Nr  r  r  FTr  r  r  	tl.store(z.store()r=  r  r  r:  rL   rZ  rR  r[  r<  r  rE  r  	get_dtyper   r  r   r  rL  )	r  r   rI  rF  r  rk  r  rq  broadcast_dims	            r   codegen_block_ptr_store_line)TritonKernel.codegen_block_ptr_store_line  s-    #5'H,@,@+AC *3$$h&>&>?*
%C%# ww77KK27**3/	*
 66      7 
 '/0A0A$0GHIKh00ykE75';;GAHH$9$9(:J:J$K#LBugUVWWr   c                   U(       d  U(       d  g [        U[        R                  5      (       d   eU R                  USS S9n[        U[        5      (       d   eUR
                  nUR                  5       (       a  UR                  OS nU(       a  [        U R                  U5      5      OS nU R                  Xc(       a  SOS X5      n	U R                  U5      n
U R                  R                  XS[        R                  S9  g )NFrE  0)
assignmentr  )r   r   r   rI  r  r  r  r-  texprrename_indexingindirect_assertget_load_bufferr   r9  r   r  )r  r   r  lowerr   rI  r  r-  size_strlinebuffers              r   check_boundsTritonKernel.check_bounds  s     $

++++==RV=W(O4444&&	(0(9(9(;(;8$$8=5--d344 ##esx
 %%h/&5Lr   c                &   UR                  5       (       d  UR                  5       (       a  U R                  $ U R                  (       a?  U R                  S   R
                  (       a!  UR                  5       (       d  U R                  $ U R                  $ )NrU  )	r  r   r:  rc  ro  r  r  rj  loads)r  rI  s     r   r  TritonKernel.get_load_buffer  sk      ""h&:&:&<&<<<!!  $,,'')) 99::r   ztl.extra.cuda.gdc_wait()z%tl.extra.cuda.gdc_launch_dependents()c                   [         R                  R                  R                  R                  (       d  g[        [        R                  [         R                  R                  R                  5      (       a  g[         R                  R                  (       a  g[        R                  R                  5       R                  S:H  =(       a$    [         R                  R!                  5       S   S:  $ )NFrM  r   r  )r   r9  r"   r   
enable_pdlr   rL   r   select_algorithmTritonTemplateKernelr  r  rZ  r  rF  rM  r  r  s    r   _enable_pdl_codegen TritonKernel._enable_pdl_codegen  s    %%,,77ahh @ @ U UVV==GG//166&@ ;

00215:	
r   consider_readsc                 ^^^^ U R                  5       (       d  g [        R                  R                  m[        R                  R
                  b$  [        R                  R
                  R                  OS mUUU4S jmU(       d   eTS L =(       d    [        U4S jU 5       5      nU(       d  g U R                  (       a*  U R                  S   R                  (       a  U R                  nUR                  U R                  5        g )Nc                   >^  Tc   eTR                   R                  nT(       a+  [        R                  " UTR                   R                  5      n[        UU 4S jU 5       5      $ )Nc              3     >#    U  H8  nTTR                   R                  UR                  UR                  5      :H  v   M:     g 7fr   )mutation_renamesr  r   )r   wcurrent_nodedeps     r   r  OTritonKernel._handle_pdl_before_access.<locals>.matching_dep.<locals>.<genexpr>$  s8      "A |4488HH"s   A A)read_writeswritesr  r  readsrz   )r7  	prev_depsr0  r6  	prev_nodes   ` r   matching_dep<TritonKernel._handle_pdl_before_access.<locals>.matching_dep  sY    (((!--44I%OOIy7L7L7R7RS	 "  r   c              3  4   >#    U  H  nT" U5      v   M     g 7fr   r   )r   dr>  s     r   r  9TritonKernel._handle_pdl_before_access.<locals>.<genexpr>*  s     ,Sl\!__lr  rU  )r-  rL   r   r6  rZ  	schedulerprevious_noderz   rc  ro  r  rj  r   GDC_WAIT)r  wait_bufferr0  dependencies	need_waitr6  r>  r=  s     `  @@@r   _handle_pdl_before_access&TritonKernel._handle_pdl_before_access  s     ''))xx,,/0ww/@/@/LAGG++RV 		 |%S,Sl,S)S	  T%5%5b%9%A%A))Kdmm,r   c                4   U R                  5       (       d  g UR                  S:  a  g U R                  (       a*  U R                  S   R                  (       a  U R
                  nUR                  U R                  5        UR                  U R                  5        g )NrN   rU  )	r-  	use_countrc  ro  r  rC  r   rE  
GDC_LAUNCH)r  launch_buffer
result_vars      r   _handle_pdl_after_load#TritonKernel._handle_pdl_after_load3  sr    ''))!#  T%5%5b%9%A%A 22M 	.0r   c                @   / nSnS nUR                    H  n[        U5      [        L a  U R                  U;   a  U(       a  M.  Sn[        U5      [        L a/  U R                  U;   a  Ub  UR                  U5        [        U5      nUR                  U5        M     X!l         g )NFT)_linesrF  r   rE  rM  ra  r   r  )r  code	new_lineshas_waitprevious_launchls         r   _filter_pdlTritonKernel._filter_pdlC  s    	AAw#~$--1"4#HAw#~$//Q"6".MM/2"%i.Q   r   c                N    U R                   R                  [        XU5      5        g r   )saved_partial_accumulater  rd   )r  r   r   r  ru  s        r   rv  TritonKernel.partial_accumulateT  s"     	%%,,dC8	
r   c                B  ^^^^^ U R                   R                  T5      nU R                  mTT==   S-  ss'   [        nU R	                  U5      mUn[
        R                  R                  T5      nU R                  USU R                  U USSS9S9n[        U[        5      (       a'  U R                  UR                  5      (       a  SU l        UR                  5       mUR!                  5       n[#        S U R%                  U5      R'                  5        5       5      n	U R)                  U5      (       a  Sn
OiU	(       d  Sn
O_U R*                  (       aL  U R,                  S   R.                  (       a.  UUUUU4S	 jnTT   mS
n
[0        R2                  " [4        SU5      nOSn
U(       d  T(       aB  UR7                  5       (       a-  U R8                  (       a  S[;        U R8                  5       3nOSnOSn Sn[<        R>                  R@                  (       a"  U RB                  RE                  5       nUT   S:  n U R)                  U5      (       + =(       a(    U R*                  (       + =(       a    U(       + =(       a    U	nSnU(       a  SnSnSn[G        T5      (       al  UnU[H        RJ                  [H        RL                  4;   aB  [<        R>                  RN                  (       a  [H        RP                  nOUS[S        U5       S3-  nSnGO[        U[T        [V        45      (       a  U RY                  TX7U5      u  nn[        U[T        5      (       a  SU U U
 U S3nO/U S[
        RZ                  R]                  UR^                  5       S3nURa                  UURb                  URd                  SSS9nURd                  nO[g        U5      (       a  SU SU S3nURh                  nSnOcSU SURj                   SURl                   U
 U U S3
nURn                  (       a  URn                  nO[p        Rs                  UR                  5      nU[H        RJ                  [H        RL                  4;   a4  [<        R>                  RN                  (       a  US-  n[H        RP                  nU[H        Rt                  :X  a0  [H        Rv                  Rx                  c  US-  n[H        Rt                  nU R{                  U5      nU R}                  UT5        U R~                  R                  UU" U5      UUS9nU R                  UU5        UR                  S:  a  TT==   S-  ss'   [        U[        5      (       d   eUR                  UlD        U(       a  SU SU S3nU R~                  R                  UUXgRn                  S9nUR                  (       a  UR                  (       a  SnOU[H        Rt                  :X  a  S nOS!nU R8                  (       a  [;        U R8                  5      OUnS"URl                   SU SU S3nU R~                  R                  UUUUR                  S9nU R*                  (       a  UR                  5       (       d"  T(       d  U R                  R                  U5        U$ )#zS
Load from the memory location 'name', offset by some indexing expression 'index'.
rN   TFr  r  rE  c              3  *   #    U  H	  oS :H  v   M     g7fr  r   )r   rZ  s     r   r  $TritonKernel.load.<locals>.<genexpr>  s      
MqFMr^  z, eviction_policy='evict_last'rU  c                 6   > TT   T :  a  T(       d  T(       a  gg)N
evict_lastevict_firstr   )expected_countr  indirect_indexingload_countsr   s   r   decide_later'TritonKernel.load.<locals>.decide_later  s    t$~5"3'$r   z, eviction_policy='<EP>'z<EP>r   z, other=r  z, cache_modifier='.cg'Nr  r  r   rd  z.load(r  r  rE  r  r  rS  r5  r  r  z0.0Truer  r5  )Jr  rg  rM  r   r  rL   rZ  r  rI  tma_compatibility_checker_clsr   r  re  r  rW  r  r   rz   get_strides_of_loadr  is_broadcastedrc  ro  r  r;  r<  r<   r  _load_otherr`   r"   r   skip_l1_cachere  buffer_read_countsrj   r   r  r  r  r   rF   r  r  r  r   r  rL  r  rE  r=  r   r  r  r-  r	  r   r   r   r  r  r  rI  r   r9  rP  rL  r  r  r  r   r&  rE  r  )r  r   r  r   	make_lineoriginal_indexr  rI  r   is_coalescedeprh  rk  has_read_depsrp  ro  cachemodappend_broadcastr   r"  r  load_bufferrO  zero	other_valre  r  rf  rg  s    `                       @@@@r   rV  TritonKernel.load[  s    iiood#''DQCK	 55e<!!$'==&*&H&H	 'I ' ! 	
 h00T5N5NNN6
 6
 26D.((*
**,  
 44^DKKM
 
 ~..1B1B""t'7'7';'C'C% % ).N+B!))*:FLQIB:8+<+<+>+>"=1A1A#B"CD%E	 ==&&!%!A!A!C.t4q8M	 ##N33 )))!! 	 	 /H $#D))D 77==77!MMEd;u#5"6a88DE (_6M$NOO*.*@*@#+' % h88%&6%7wrd8*AND./vahh6K6KHL\L\6]5^^_`D==((((#'# >  !,,&~66!#d>*:"=#+#6#6 !#d8+=+=*>c(BSBSATUWTXY^X_`h_iijk (($11E)99(..IE %--88MM88))

"u}}'8'8'@ &

**84&&{D9XX&&4U ' 

 	##K<!#"*&78888'11
%j\4D3EQGD**T6K6K + J !!** Dejj(!DD7;7G7GM$"2"23T  #8#4#4"5R
|2i[PQR!XX..U*:J:J / 
 $$X-?-?-A-A*""&&z2r   c           	        U R                   R                  U5      nUn[        R                  R	                  U5      nSnUb  US:X  a  US:H  n	U R                  U USU	S9nU R                  USUSL US9n
[        U
[        5      (       a;  U R                  U
R                  5      (       a  U R                  R                  U5        XR                   R                  ;   nU R                  U5      nU(       a,  U(       a%  U R                  R!                  [#        US5      5        [        U
[$        [&        45      (       a)  U R)                  XU
5      u  pU R+                  XXU5      nGOJUc  U
R,                  n[/        U5      (       a`  UR0                  bS  [3        S UR0                   5       5      (       d2  SR5                  [7        [8        UR0                  5      5      nUS	U S
3-  nSU SU SU SU
R:                   S
3	nOUS:X  a  SU l        U
R,                  n[/        U5      (       a`  UR0                  bS  [3        S UR0                   5       5      (       d2  SR5                  [7        [8        UR0                  5      5      nUS	U S
3-  nSU SU SU SU
R:                   S3	nO[?        SU 35      e[@        RB                  " 5       nU RD                  (       d;  U RF                  (       a*  URI                  U RK                  XR                  5      5        U RM                  U R                  USS9  U R                  R!                  [#        X5      5        U RD                  (       d  U RN                  RQ                  U5        URS                  5         g)z^
store the 'value' to the memory location 'name', offset by some indexing expression 'index'.
NtmaTr_  )r  rF  rG  ztl.debug_barrier()c              3  >   #    U  H  n[        U5      S :H  v   M     g7fr   Nr  r?  s     r   r  %TritonKernel.store.<locals>.<genexpr>R       ?;aCFcM;   r  .broadcast_to(r  r  r  r  
atomic_addc              3  >   #    U  H  n[        U5      S :H  v   M     g7fr  r  r?  s     r   r  r  ]  r  r  ztl.atomic_add(z, sem='relaxed')zstore mode=r/  )*r  rT  rL   rZ  r  rk  rI  r   r  re  r  rX  r  inplace_buffersrm  storesr   rU   r  r  r  r  r  r   r   r\  r*  r,  r   r-  atomic_add_foundrA  
contextlib	ExitStackrc  rS  enter_contextguard_cooperative_storerI  rE  r  close)r  r   r  r  moder   rr  r  rG  r  rI  
is_inplacerm  r  rk  r"  indexing_strvalue_shape
exit_stacks                      r   storeTritonKernel.store  s'    iit$!!$'$(!<45=EME(,(J(J	 )K )% ==dl&?	 ! 
 h00T5N5NNN6
 6
 ,,33D9 YY666
,,^<.KK!!,t5I"JKh2I JKK&*&<&<T&Q#44 0D \ $--L%e,,KK+?5;;???"iiC(=>.Q ??se4~Sr(BSBSATTUVD\!$(D!#--L%e,,KK+?5;;???"iiC(=>.Q ??#C5\N#eWBxGXGXFYYijD%D6&:;;))+
$$)C)C$$T%A%A$%TU&&t{{D&Nl467$$""&&u-r   c                Z    U R                   R                  SU S[        U5       S35        g )Nztl.device_assert(r  r  )r:  r   repr)r  condmsgs      r   device_assert_async TritonKernel.device_assert_asyncq  s(    !24&49+QGHr   c                    U R                   R                  5       nUR                  [        USU S35      5        UR	                  5       $ )z
For cooperative reductions only one thread block should write out the result.
We rotate which thread block does each write for better parallelism
zif rsplit_id == (z % RSPLIT):)r{  r  r   rU   indent)r  r   r#  r  s       r   r  $TritonKernel.guard_cooperative_storet  sC    
 88NNPd.?uK,PQR}}r   c                    S nU H:  nUc  M  [        US5      (       d  M  Uc  UR                  nM,  X#R                  -  nM<     U$ )Nr  )r   r  )r  	variablesmaskselems       r   _combine_masksTritonKernel._combine_masks}  sG    D|t[))= NNE!NN2E  r   c                p   U R                   R                  [        R                  5        U R                  R                  US   5      nU R                  US   5      n	U R                  US   5      n
U R                  US   5      nU(       a  U R                  R                  US   5      OSnU(       a  U R                  US   5      OSnU[        R                  :X  a  SnO"U[        R                  :X  a  SnO[        S5      eU R                  " U R                  US   /U(       a  US   /O/ Q76   U R                  R                  U R                  S	U S
U S
U	 S
U
 S
U S
U S
U S
U S
U S
U S
U S3UUR                  S9nU R!                  U R                  U5        U R#                  XU5      nUUl        U$ )z#
See [Note: Inductor bucketize op]
r   rN   r!   r   r   r  ztl.int64z5Bucketize only supports indexing with int32 and int64z'triton_helpers.bucketize_binary_search(r  z, )r5  )rP  r  r/   ONE_ELEMENT_PER_THREADr  rg  r  r   r  rK  rA  rI  r:  r   r9  r   rP  r  r  )r  r  
boundariesboundary_indicesindexing_dtyper  sortersorter_indicesboundaries_ptrboundary_sizeboundaries_underlying_numelboundary_stride
sorter_ptrsorter_stridetriton_dtyperu  r  s                    r   	bucketizeTritonKernel.bucketize  s   $ 	 C CDA7))*Q-8&*&7&7
1&F#++JqM:39TYY__VAY/v
8>))&)4FU[[(%Lu{{*%L%G  	&&LL*Q-	
:@F1I;b	
 ""LL5fXRbr2M1NbQ`Paac nBgRl"]O2  !,, # 
 	##DLL&9##FnM r   c                    U R                  5       nUS:X  a  SU S3$ U R                  nS/X#-
  -  S/U-  -   nU SSR                  U5       S3$ 	NrN   z!triton_helpers.promote_to_tensor(r  r  r   r  r  r  r   rb  r*  )r  r  ndimsnreducesizess        r   reduction_resizeTritonKernel.reduction_resize  sh    '')A:6ugQ??)))VHw,>>$))E*+1--r   c                    U R                  5       nUS:X  a  SU S3U4$ U R                  nS/X4-
  -  S/U-  -   nUb  / US X4-
   QS/U-  Q7OS nU SSR                  U5       S3U4$ r  r  )r  r  r   r  r  r  r  s          r   reduction_resize_and_shape'TritonKernel.reduction_resize_and_shape  s    '')A:6ugQ?FF)))VHw,>>=B=N9e'u(9A3=9TX 	 $))E*+1-y88r   c                   U R                   S:X  a  U$ U R                  5       U R                   -
  nU R                  5       nUSU S/-   nU R                  R	                  U[        [        U5      XV5      U[        U5      S9$ )z3
Reshape to RBLOCK, collapsing all reduction dims.
rN   NrD  r5  )rb  r   rK  r   r9  r  r   r   )r  r#  r  r  target_ndimr  target_shapes          r   reduction_collapse_dims$TritonKernel.reduction_collapse_dims  s     ""a'L--/$2I2II,,.$\k2hZ?xx  3u:}C%	 ! 
 	
r   c                J  ^ ^^^^:^;^<^=^>^?^@^A SES jn[         R                  " U5       Vs/ s H  ofR                  PM     nn[         R                  " XT5      n[	        S U 5       5      (       aJ  [
        R                  " T[
        R                  5      m[
        R                  " T[
        R                  5      mT R                  (       d   e[        S T R                   5       5      nT R                  U5        [        U5      nT R                  (       a  UR                  T R                  5        T R                  S   R                  S   n	T R                   (       ab  T R#                  5       n
[%        U
5      S:  d   eU
 Vs/ s H  nSU;   d  SU;   d  M  UPM     nnS	S
R'                  U5       S3m;[)        U5      m@O)T R+                  5       m;[)        T R#                  5       5      m@T R-                  U;U U@4S jU5      nSnTS;   a  [/        U[(        5      (       a  Uu  pMT R1                  5       T R2                  -
  m<      SFU<UUU 4S jjm=        SGU=4S jjnU<UU>U 4S jnTTU4nUT R4                  R6                  ;   a  T R4                  R6                  U   $ [9        T5      n[;        T5      n[=        T R#                  5       5      nSUT<'   T R4                  R?                  U[)        U5      S9n[        S U 5       5      Ul         SR'                  U5      m:U:4S jmAT RB                  (       Ga  [D        RF                  RI                  TT5      nS m?SHU UU?UA4S jjnTS:X  a  SnO|[/        U[(        5      (       a)  [K        UU5       VVs/ s H  u  nnU" UU5      PM     nnnO>TS:X  a/  T R4                  RM                  T RN                  XDR                  S9nO	U" UU5      nTS;   a  [/        U[P        5      (       d   e[R        RT                  RW                  5       nU(       a"  S[Y        U5       ST R[                  U5       S3nO@[Y        T R4                  RM                  T RN                  S U	 S!U S"3UUR\                  S95      nS#S$S.T   m>U" T RN                  UUU5        UUl        GOTS%:X  a=  T R^                  (       a  T Ra                  UTUTAUT5      nGOT Rc                  TU5      nGOvTS&:X  aS  [/        U[d        5      (       d   eUu  nnn[)        UU 4S' jT Rg                  T RN                  UUUT<T5       5       5      nGOTS:X  a  T Ri                  TU5      nGO[/        U[P        5      (       d   eT=" T RN                  UUR                  5      u  nn n!T R4                  RM                  T RN                  UU U!S9nGOT R4                  Rk                  S(U 3U[)        T R#                  5       5      S9n"[D        RF                  Rm                  TT5      nT R-                  [n        U5      n[/        U[(        5      (       d  TS:X  a  T R#                  5       n
[%        U
5      S:  d   eU
 Vs/ s H  nSU;   d  SU;   d  M  UPM     nn[)        U5      U"l.        S	S
R'                  U5       S3m;T Rp                  Rs                  U" S)T; S
U S
U S35        O5T Rp                  Rs                  U" S)T R+                  5        S
U S
U S35        TS;   Ga  S(U S*3nT Rt                  Rw                  5       n#T Rp                  Rs                  U S)T R+                  5        S
[
        Rx                  " U#5      Rz                   S
T R[                  U#5       S35        S#S$S.T   m>Ub!  S[Y        U5       ST R[                  U#5       S3OU	 S+3n$T RN                  R}                  S,U" S-U S.T> S/U" S
U S
U S
U$ S0U" S1TA" U" S23U"5       S3U S1TA" U S23U5       S335        U" T R~                  UU"U5        GO7[        T5      (       a  T Ra                  UTUTAUT5      nGOTS:X  Ga0  S(U S43n%S(U S53n&T Rp                  Rs                  U% S)T R+                  5        S6U S35        T Rp                  Rs                  U& S7T R+                  5        S
U S35        T RN                  R}                  S8U% S-U& S9U% S
U& S
U S
[        R                   S:35        T RN                  R}                  S8U% S1TA" U% S23U%5       S8U& S1TA" U& S23U&5       S83	5        Un'T R4                  R?                  TU'R\                  S9n(T R                  T R~                  U'U(U%U&T<T5      nO[D        R                  " TT5      n)U)" U"U5      n*TS:X  a!  T RN                  Rs                  U" S1U* 35        O'T RN                  Rs                  U" S1TA" U*U"5       35        T[
        R                  :X  aA  T R4                  RM                  T R~                  U" S;3[
        R                  U"R\                  S9n"U" T R~                  UU"S5        T R^                  (       Ga  [D        RF                  Rm                  TT5      n[        R                  " 5       n+T R~                  T R                  4 H3  n,U,Rs                  S<5        U+R                  U,R                  5       5        M5     TS;   a  T R~                  Rs                  U S=T R                  U S>35       35        T R                  U S?3TU5      n-T Rt                  Rw                  5       n#T R                  UU#[
        Rx                  " U#5      Rz                  5      n.U" T R                  UU-U.5        GOR[        T5      (       a  TS%:X  d   eUu  n/n0n1T R                  U/[;        T5      US   5      n2T R                  U0[;        T5      US@   5      n3T R                  U1[;        T5      USA   5      n4T R                  T R                  U/U0U1U2U3U4T<T5	        OTS:X  a|  Uu  n'n([/        U[d        5      (       d   eT R                  U'[;        T5      US   5      n5T R                  U([;        T5      US@   5      n6T R                  T R                  U'U(U5U6T<T5        O1T R                  U[;        T5      U5      n7U" T R                  UU7S5        U+R                  5         UT R4                  R6                  U'   [/        U[(        5      (       a  [        SB U 5       5      (       d   eT R                  R                  U5        TSC;   a  [%        U5      S@:X  d   e[%        U5      U-  n[%        U5      [%        U5      :X  d   e[K        UU5       HJ  u  n8n9U9c   eU8R                  U9:w  d  M  T R~                  Rs                  U8 S1U8 SD[        U95       S35        ML     U$ [/        U[        5      (       d   eT R                  R                  U5        UR                  US   :w  a8  US   c   eT R~                  Rs                  U S1U SD[        US   5       S35        U$ s  snf s  snf s  snnf s  snf )IzC
codegen reduction of value to Triton according the reduction_type
c                    U R                   [        R                  [        R                  4;   a%  [        R
                  " U [        R                  5      $ U $ r   )r  r   r  r  r  r  r   rW  s    r   maybe_upcast,TritonKernel.reduction.<locals>.maybe_upcast  sF     ;;MMNN UEMM2 r   c              3  f   #    U  H'  o[         R                  [         R                  4;   v   M)     g 7fr   )r   r  r  r?  s     r   r  )TritonKernel.reduction.<locals>.<genexpr>  s      M_U]]ENN33_s   /1c              3  >   #    U  H  oR                    S 3v   M     g7fr  r   r  s     r   r  r         M<LDkk]$/<Lr  rU  r   r   XYr  r  r  c                r   > TR                   R                  TR                  SU  ST S3U R                  TS9$ )Nr  r  r  r5  )r   r9  r:  r  )r0  r  r  r  s    r   <lambda>(TritonKernel.reduction.<locals>.<lambda>.  s=    dhh''"1#R'7q9gg!	 ( r   N)argminargmaxc                  > [        T	5      nT
R                  XT5      nT	S:X  a_  [        T
R                  5       5      S:H  nUR                  c   eU(       a  U S3nS/UR                  QSPnO=U S3n/ UR                  QSPnO'T
R                  U SU ST S3UR                  5      u  pVUb  U S	T
R                  U5       S3nOUR                  nXRU4$ )
z3
Helper to generate a reduction call, e.g. tl.sum.
r4  r  z[None,:,:,None]rN   z
[:,:,None]r>  r  r  r  )r   r  r   rK  r   r  rn  r  )r#  r  result_typetriton_reduction_fnis_bmmru  r   rq  r  r   r  s          r   final_reduction/TritonKernel.reduction.<locals>.final_reduction?  s     #@"O00FE&T11349{{... %wo6F00a0E %wj1F-ekk-1-E $ ? ?*+1UG2cU!<ekk! &"84(9(9+(F'GqI#kk--r   c                L   > T" XU5      u  n  nU R                  U SU 35        g)z=
Generate a reduction and assign it to an existing variable.
r   N)r   )r#  rO  r  r  r\  r  s        r   final_reduction_define6TritonKernel.reduction.<locals>.final_reduction_define`  s.     *&EKE1aMMZLE734r   c                   > TR                  XT5      nTR                  XT5      nU R                  SU SU ST SU SU ST SU STR                  U S35       S	35        g )
N                z_val, z_idx = triton_helpers.z_with_index(r  )
                r   _idx
                )r  r   r  )r#  rO  r  r  rq  r  root_opr  s       r   final_argreduce/TritonKernel.reduction.<locals>.final_argreducem  s    00FE00FEMMF:,.DWI\Z_Y``bchbiiklokp qC 5 5D6I JK Lr   r   r5  c              3  P   #    U  H  n[        US    5      (       a  M  Uv   M     g7fr  )rB   )r   r   s     r   r  r    s!      *
 C(;CF(CCC5s   &	&r)  c                B   > T(       d  U $ [         R                  TX5      $ r   )r8  r(  )tvalfvalr  s     r   r>  *TritonKernel.reduction.<locals>.where_cond  s    (..tT@@r   c                ~   UR                   (       d  U[        R                  :X  a  U $ X:X  d  U S:X  a  U $ U [        R                  " U5      R                  :X  a   [        R                  " U5      R                  $ U [        R                  " U5      R
                  :X  a   [        R                  " U5      R
                  $ U $ )z7update reduction constant mask value to match dst_dtyper   )r  r   r   iinfor{   r|   )r  r  r  s      r   update_constant_dtype5TritonKernel.reduction.<locals>.update_constant_dtype  s    
 ..)uzz2I#O)X]#Ou{{95999 ;;y1555Y!7!;!;; ;;y1555#Or   c                   > T" UTU R                   5      nTR                  [        U5      nTR                  R	                  TR
                  T" X5      U R                   U R                  S9$ )Nr5  )r  _map_tuple_or_scalarr`   r   r9  r:  r   )r  r;  default_strr  r  r  r>  s      r   _mask_value+TritonKernel.reduction.<locals>._mask_value  s`    /EKKP"77wOxx((LLu2++++	 )  r   online_softmax_reducer4  r  )r  r  r>  r  r  r  zindex, z.shape)r{   r|   welford_reducewelford_combinec              3  t   >#    U  H-  u  pTR                   R                  TR                  UTUS 9v   M/     g7f)r5  N)r   r9  r:  )r   r  r   r  r  s      r   r  r    s:      #) HH%%dllEe%T)s   58r\   = tl.full(_indexr  r  _next, z_next = triton_helpers.z%imum_with_index(
                    #
                )
                r   _nextr  _max_sumz, float('-inf'),  = tl.zeros(z
                    zG_next = triton_helpers.online_softmax_combine(
                        z+
                    )
                    z.to(tl.int8)zif HAS_RSPLIT:z_bval = _val_bvalrN   r!   c              3  B   #    U  H  n[        U[        5      v   M     g 7fr   )r   r  r?  s     r   r  r    s     LAz!%677s   )r  r  r  )r  rT   r   rT   )r  rT   r  r%  r   z1tuple[str, Optional[torch.dtype], BlockShapeType])rO  rT   r  rT   r  r%  r   r   r   rT   )Vpytreetree_leavesr  tree_maprz   r   rN  r   rc  r   ro  r  r+  r  r  r   rJ  rK  r   r*  r   r  r  r   r   rb  r   reduction_cacher  r  rk  rl  r  r  r#   r6  r7  r  r9  r:  rT   rL   r   rJ  r   rn  r   rS  r  welford_reduce_fallbackr   _welford prepare_softmax_twopass_fallbackr  default_accumulatorr`   rj  r   re  select_index_dtyper  r{   r   rC  r@   r"   r  %online_softmax_reduce_final_reductionget_reduction_combine_fnr   r  r  r  rD  r  r  r  *codegen_cooperative_reduction_peer_combinewelford_reduce_final_reductionr  r\  rE  r  r  r  r  )Br  r  r  r   r  r  r  original_dtypesr  reduction_range_prefixdense_sizesr  xy_sizes_onlylogical_indexr  r  rn  acc_typetorch_acc_typeresult_shaperO  r;  r  masked_valuer0  rA  accumulator_dtypeaccumulator_indexmeanm2weight_result_dtype_shapeaccumulatorr  rP  accumulator_maxaccumulator_sum
result_max
result_sum
combine_fnupdatedr  bufpeer_valpeer_idxresult_mean	result_m2result_weight	peer_meanpeer_m2peer_weightpeer_maxpeer_sumpeersr   
orig_dtyper  r  rq  r  r  r  r  r>  sB   ````                                                      @@@@@@@@r   	reductionTritonKernel.reduction  s\   	 170B0B50IJ0I990IJ4M_MMM++Iu}}EI''u}}=E$$$$MD<L<LMM% u??LL)!%!1!1"!5!<!<Q!?   ..0K{#q(((.9XkdSD[CSWKTkMX =!9 :!<N.K!002N 4 4 67K )) 
 11%''',$%%'$*A*AA	.	. /	. ?		. 	.B	5#	5 	5 /		5
 	5	 	 6	00088++I66"9-))4D0023S((// l(; * 

  * *
 *
  

 zz% 	A
 $$$ll00KG$$	 	 !88  $E5))>A%>QR>QdaAq 1>QR5(
  $xx00uKK0X*5':!55!,<<<<$%HH$K$K$M! *+C,>+?uTEVEVWhEiDjjk(l%(+)) LL./E.Fgl^[bc"3"."4"4	 * )% &+e<^LLL*l<M $5
 #33--!%!4!4"NE:xQV"J "&!=!=eU!KJ#44!,9999%1"r6" #(,dBU)# 
  #:: "BB5%P
!,<<<<*9LL,0B0B+' "XX..LL'v / 
 ((++J< $D0023 , K
 ll66~yQG//wGGgu--!U*"&"6"6"8K{+q000)4%)4tsd{ " % ).m(<K%'(=)A(B!%DNII''&-{>2B"WIRPXzYZ[ II''&-{43F3F3H2IG9TVW_V``ab !55&'
|6$:!"mm>>@		##()T5H5H5J4K2{{;/334Bt7H7H7U6VVWY &+e<^L %0 M*+51B1B;1O0PPQR2359 
 ##W%6$77Nwi X M$5#6br) MS{m5,A;!O P Q"#3z5F4Gu2MO`'a&b c  **JEV &n55!00z8U
  #::$%j\"6$%j\"6 		##&'{43F3F3H2IIZ[cZddef 		##&'|D4G4G4I3J"XJVWX ##$%W_,= >()O+<BugRH\H\G] ^ ##$%S6Gu4M)_(` a$%S6Gu4M)_(` a (
!XX__5
@P@P_Q
!GG**##
  88S
$[%8!U*LL**k]#gY+GHLL**&-s:g{+K*LM 

* #'(("3"3..&-|4#jj)//	 #4 #K '**JT %%%ll66~yQG#--/J..0D0DE./((6 F
 !55&&00!l(4+@+@J<tAT+U*VW  JJ!l%()W #mm>>@JJU[[-E-I-I   4 4j(HU%n55%)99998B5Y KK$Y/AJ	
 II$Y/AJ
 #MM!$Y/AJ
 33((!
  #::)3&
J!'84444JJ 0 ;WQZ  JJ 0 ;WQZ ::(( GG 0 ;W 't';';ZPTU.8  +j%((LLLLLL""))*5 !LL?+q000"%j/O"Cz?c/&::::#&z?#CZ!---99
***44%s3%t,?
,K+LAN $D"  j*;<<<<""&&z2 ?1#55&q)555&&00!l#j\6I/Z[J\6]5^^_` a K4 Yd  SR%s#   zzzz5z 	z c                   U R                  XU5      nU R                  XU5      n[        S5       Vs/ s H%  n[        U R                  R	                  US95      PM'     snu  pxUR                  SU SU SU SU SU S[        R                   SU SU R                  U 5       SU SU R                  U 5       S35        Xx4$ s  snf )Nr!   r  
            r  9 = triton_helpers.online_softmax_reduce(
                )
            r   )	r  rm  r   r   rl  r   r"   r  r  )	r  r#  r"  r#  rq  r  r\  r$  r%  s	            r   _online_softmax_reduce#TritonKernel._online_softmax_reduce  s     66vPUV66vPUVMRSTX!VX#dhhooEo&B"CX!V
L:, ' !O#4Bse2f>R>R=S TLD11ZLBC DLD11ZLBC D		
 %% "Ws   ,Cc           
       ^ ^^^ UUU 4S jX#U4 5       u  p#nSU SU SU ST S3	nU4S jnX#U4 V	s/ s H-  n	T R                   R                  TU" U	R                  5      S9PM/     n
n	TR                  SR	                  U
 Vs/ s H  n[        U5      PM     sn5       SU 35        [        U 4S jU
 5       5      $ s  sn	f s  snf )	z+
Helper to codegen triton_helpers.welford.
c              3  J   >#    U  H  nTR                  TUT5      v   M     g 7fr   )r  )r   r  r#  r  r  s     r   r  (TritonKernel._welford.<locals>.<genexpr>  s*      
+ ((>>+s    #ztriton_helpers.welford(r  r  c                2   > [        U ST U TS-   S  -   5      $ )Nr   rN   )r   )r   rq  s    r   reduced_shape,TritonKernel._welford.<locals>.reduced_shape  s$    qcAgi(8899r   r5  r   c              3  Z   >#    U  H   nTR                  XR                  5      v   M"     g 7fr   )r  r   )r   r  r  s     r   r  r?    s*      
( ++E;;??(   (+)r   rl  r   r   r*  r   r   )r  r#  r  r  r  rq  r  welfordrA  r  welford_resultsr$  s   ``   ``     r   r  TritonKernel._welford  s    
F+
& ,D6B4r&C5J	:
 F+
+ HHOO%}U[[/IOJ+ 	 
 	DII&G!s1v&GHIWIVW 
(
 
 	

 'Hs   4B=Cc                   U R                  5       U R                  -
  n[        U S3[        U R	                  5       5      U[
        R                  " 5       S9n[        U S3[        U R	                  5       5      U[
        R                  " 5       S9n	[        U S3[        U R	                  5       5      U[
        R                  " 5       S9n
U R                  R                  U SU R                  5        SU S35        U R                  R                  U	 SU R                  5        SU S35        U R                  R                  U
 SU R                  5        SU S35        US:X  a=  Uu  pnU R                  R                  S	U S
U	 S
U
 SU SU	 SU
 SU SU SU S35        O9US:X  d   eU R                  R                  S	U S
U	 S
U
 SU SU SU	 SU
 S35        U R                  R                  SU SU" U S3U5       SU	 SU" U	 S3U	5       SU
 SU" U
 S3U
5       S35        UnU R                  U R                  USSUU	U
UU5	      $ )z%Helper to codegen a welford reduction_meanr   r  r  _m2_weightr  r  r  r  r  r  z<_next = triton_helpers.welford_combine(
                    z,
                    r  r  z;_next = triton_helpers.welford_reduce(
                    z1, roffset == 0
                )
                z            r   r  r8  N)r   rb  r  r   rK  r    unknownrj  r   r  r:  r   r  rC  )r  rO  r   r  r>  r  r  rq  r!  accumulator_m2accumulator_weightr  r  r  r+  s                  r   r  TritonKernel.welford_reduce  s    %%'$*A*AA'l% ,,./&&(	
 +l#,,./&&(	
 /l'",,./&&(	
 			m<(;(;(='>b
!L	
 			l4+>+>+@*AH:QO	
 			!",t/B/B/D.ERzQRS	
 ..$DfLLW^$4G<N;O P MN#326H5I JF"RD6( + "%5555LLW^$4G<N;O PG2k]"^,<B?Q>R S 	MZ;-u(={KL MC
n-=U+C^ TU V J2D1EU/KM_$`#a b	
 !22""

 
	
r   c
           
     
   [        U R                  XXgX5      5      n
X#U/n[        [        X5      5       HA  u  nu  nu  pUc  U R                  R                  XS9nXU'   UR                  U SU 35        MC     [        U5      $ )z0Helper to codegen call to triton_helpers.welfordr5  r   )rk  r  r  r  r   rl  r   r   )r  r#  r+  r,  r-  r  r  r  rq  r  r  result_exprsrZ  result_exprr  r   s                   r   r  +TritonKernel.welford_reduce_final_reductionI  s     dmmF"cIJ#>09#l:S0T,A,^e""hhooEoG"-QMM[MUG45	 1U \""r   c                   U R                  XU5      nU R                  XU5      n	UR                  SU SU SU SU	 SU S[        R                   SU SU R	                  U 5       SU SU R	                  U 5       S35        X#4$ )Nr8  r  r9  r:  r   )r  r   r"   r  r  )
r  r#  r$  r%  r1  r2  rq  r  r"  r#  s
             r   r  2TritonKernel.online_softmax_reduce_final_reductiona  s     66vO66vOL:, ' !O#4Bse2f>R>R=S TLD11ZLBC DLD11ZLBC D		
 %%r   c                N    U R                   (       a  U R                   S   $ [        $ )NRSPLIT)r#  r3   r  s    r   
max_rsplitTritonKernel.max_rsplitp  s"    $$X..  r   c                   U R                   S   nU R                  5       (       d  SOSnXBR                  -  U R                  5       -  nU R                  R                  U5      u  pxU R                  R                  SU SU SU R                  U5       S[        U5       SU S	U S
U S3SS9  U R                  U S3SS/U[        R                  " 5       S9n	U R                  R                  U	 SU S[        U5       S35        U	$ )z
Generate code to save a [XBLOCK, RSPLIT] temporary workspace, where each thread block writes a different
column.  After the barrier, every thread block loads the completed value so that it can compute the final
value independently.
rX  zxindex < xnumelNr  z_ws = (re  z).to(tl.pointer_type(z))
                tl.store(z%_ws + (xindex * RSPLIT + rsplit_id), r  r:  Tstrip_peersr   rX  rJ  z = tl.load(z_ws + (xindex * RSPLIT + rsplit_arange), rsplit_mask, eviction_policy='evict_first', other=triton_helpers.if_mask(rsplit_mask, rE  )rd  r  r  rY  r{  r  rC  r   r  rF   create_cse_varr    rM  rD  r   r`   )
r  rO  r  default_valxnumelr  r  r  r  r3  s
             r   r  7TritonKernel.codegen_cooperative_reduction_peer_combineu  sR    S!(,(@(@(B(B ..(4??+<<!GGPPQWX%%GG9C0A0A)0L/MMbcnotcubv w$%J:,VXY]X^ _  	& 	
 ##l&!X&&&(	 $ 
 	&&g[ -eers~e  eA  ACD	
 r   c                |   U R                   (       d   eSU l         [        R                  R                  U5      nU R	                  USU R                  U USSS9S9nSU l         U R                  R                  U5      n[        R                  " 5       nU R                  (       a*  UR                  U R                  XR                  5      5        U R                  U R                  U5        [        U[         ["        45      (       aZ  U R                  R%                  ['        UU R)                  UUUR+                  U5      USUR-                  5       < 35      5      5        O[        U[.        5      (       d   eUR0                  n[3        U5      (       a`  UR4                  bS  [7        S UR4                   5       5      (       d2  SR9                  [;        [<        UR4                  5      5      n	USU	 S	3-  nU R                  R%                  ['        US
U SU SU SUR>                   S	3	5      5        URA                  5         g )NFT)r   r  r  r  rE  r  c              3  >   #    U  H  n[        U5      S :H  v   M     g7fr  r  r?  s     r   r  /TritonKernel.store_reduction.<locals>.<genexpr>  r  r  r  r  r  r  r  r  )!rc  rL   rZ  r  rI  rk  r  rT  r  r  rS  r  r  rD  rI  r   r  r  r   rU   r  r  r  r  r  r   r   r\  r*  r,  r   r-  r  )
r  r   r  r  r  rI  r   r  r  r  s
             r   store_reductionTritonKernel.store_reduction  s	    $$$$ %!!$'==&*&H&H	 'I ' ! 	
 !%iit$))+
%%$$,,T3G3GH 	&&t';';SAh2I JKK  **55  ,+H,C,C,E+HI	 h8888#--L%e,,KK+?5;;???"iiC(=>.Q ??  **uDc%8CTCTBUUVW 	r   c                f  ^^^^^^^ [        5       mTR                  S5        [        5       m[        S5       V^s/ s H+  m[	        UU4S j[        [        X#5      5       5       5      PM-     nnSR                  S [        R                  R                  U5       5       5      nTR                  SU S35        [        5       mSmS	S
KJn  S	SKJn  U" 5       mU" 5       m " UUUUUU4S jS[         5      n	TR#                  5          [$        R&                  " U	" 5       5         U" U6 n
SR                  S U
 5       5      n
TR                  SU
 35        S S S 5        S S S 5        U R(                  R+                  TR-                  5       TS9$ s  snf ! , (       d  f       NC= f! , (       d  f       NL= f)Nz@triton.jitr!   c              3  n   >#    U  H*  u  nu  p#TR                  S T SU 3X2R                  S9v   M,     g7f)r  r\  r5  N)r  r   )r   r  r  r  r   rZ  s       r   r  ,TritonKernel._lift_helper.<locals>.<genexpr>  s<      )G%A~ s1#Qqc]%{{K)Gs   25r  c              3  8   #    U  H  n[        U5      v   M     g 7fr   r  r?  s     r   r  rj    s     R.Qc!ff.Qrj  zdef {name}():r  r   rn   )ShapePropagationOpsHandlerc                  @   > \ rS rSr        SU UUUUU4S jjrSrg)+TritonKernel._lift_helper.<locals>.CSEProxyi  c                   > T	SU 3-  m	[        TU5      " U0 UD6n[        TU5      " U0 UD6nTR                  T[        T
U5      " U0 UD6UUS9$ )Nr\  r5  )r  r9  )r  r   r  r  output_dtypeoutput_shaper   dtype_handlerhelperhelper_name	overridesshape_handlers         r   _default4TritonKernel._lift_helper.<locals>.CSEProxy._default  s     4&z)&!   # " #
  '!   # " #
 ||It,d=f=&&	 $  r   r   N)r   r   r  ztuple[Any, ...]r  rx  r   r	   )r   r   r   r   rx  r   )r   rs  rt  ru  rv  rw  s   r   CSEProxyro    s-    '6@N r   rz  c              3  8   #    U  H  n[        U5      v   M     g 7fr   r  )r   rT  s     r   r  rj    s     B'F'rj  return r  )rV   r   rS   rm  r   r  r  r*  r  r  from_iterabler  r  ro   !torch._inductor.shape_propagationrm  r,   r  rL   set_ops_handlerr9  r  r   )r  r@  r  dtypesrZ  r  	signaturero   rm  rz  outputsr   rs  rt  ru  rv  rw  s       `      @@@@@@r   _lift_helperTritonKernel._lift_helper  sg   
  !'e 1X

 	  )23v3F)G   	 
 IIRioo.K.KD.QRR	=267#%	 *PP2424	 	~ 	0 ]]_a//
;$iGiiB'BBGwwi01 <_
 $$(():k(RRk
` <;__s)   2FF""2FF"
F	F""
F0c                  ^ ^ T R                   (       d   eT R                  (       a   S5       e[        S T R                   5       5      nT R	                  U5        [        U5      nT R                  (       a   S5       e/ n/ n[        S U 5       5      n[        R                  " T R                  R                  T R                  5      nT R                  X#U5      nT R                  5       T R                  -
  n	[!        X15       GHV  u  pT R                  R                  T R                  U
 S[#        U5       S3UU
R$                  S9nT R                  R                  T R                  SU S	T R'                  5        S3U[        T R)                  5       5      S9n
UR+                  U
5        [-        U5      nT R.                  (       a  M  T R)                  5       nS
US'   T R                  R1                  XS9nSS	R3                  U5       S3nUR4                  (       a  SOSnT R6                  R9                  U SU S	U S	U S35        UR+                  U5        GMY     S mUU 4S jnU" ST" U5       SU	 S	U S3UUU5      nT R.                  (       d  S nU Vs/ s H(  nU" SU S3[;        UR<                  5      U" U5      S9PM*     nnU" [        U5      [        U5      5      nU" [        U5      U5      n[!        UU5       VVs/ s H)  u  nnU" SU S	U S3UR<                  UR$                  S9PM+     nnn[!        UUU5       H+  u  nnnT R                  R9                  U SU S	U S35        M-     OUnU H*  n[?        U[@        5      (       d   e[        U5      Ul!        M,     [        U5      $ s  snf s  snnf )z*
Perform an associative scan on 'values'.
TODOc              3  >   #    U  H  oR                    S 3v   M     g7fr  r  r  s     r   r  $TritonKernel.scan.<locals>.<genexpr>#  r  r  z(ops.scan not supported inside ops.maskedc              3  8   #    U  H  n[        U5      v   M     g 7fr   rH   r   r  s     r   r  r  +       Fve*511vrj  r  r  r5  r  r  r   rU  r  r  zfloat('nan')z-1r  c                2    SR                  S U  5       5      $ )NrD  c              3  *   #    U  H	  o S 3v   M     g7fr  Nr   r   r  s     r   r  1TritonKernel.scan.<locals>.csv.<locals>.<genexpr>O       <VEgQKVr^  r*  r  s    r   csvTritonKernel.scan.<locals>.csvN      88<V<<<r   c           	     d  > [        U5      n[        U5       Vs/ s H  oP SU SU 3PM     nn[        U4S jU 5       5      (       a,  U Vs/ s H  nTR                  R	                  U5      PM      sn$ [        X15       VV	s/ s H(  u  pTR                  R                  XR                  S9PM*     n
nn	TR                  R                  T" U
5       SU  35        [        X5       H-  u  pU(       a  X+l
        TR                  R                  X{5        M/     [        U
5      $ s  snf s  snf s  sn	nf )Nr  c              3  Z   >#    U  H   nTR                   R                  U5      v   M"     g 7fr   r   containsr   rn  r  s     r   r  :TritonKernel.scan.<locals>.cse_multiple.<locals>.<genexpr>T  #     LI488$$Y//rD  r5  r   r   rm  r\  r   r  r  rl  r   r:  r   r  rm  r   )r"  r  r  r  r  rZ  
cache_keysrn  r  r  result_varsrO  r  r  s               r   cse_multiple'TritonKernel.scan.<locals>.cse_multipleQ  s   FA;@8D8aF"QCr%18JDLLLLAKLIY/LL '*&&9&9NU e;;?&9   LL""{#$Cv. *-[)E%
+0(Y3 *F %% EL   D"%D'/D,ztl.associative_scan((r  c                V    U R                   c  g [        U R                   5      nSUS'   U$ )Nr   rU  )r   rk  )r   r   s     r   _partial_scan_shape.TritonKernel.scan.<locals>._partial_scan_shapen  s*    99$ OE #E"I Lr   ztriton_helpers.select_one((z1), rbase == (RBLOCK - 1), dim=-1, keep_dims=True)ztl.where(roffset > 0, z = tl.where(roffset > 0, )"rc  rS  r   ro  r  r+  r  r   r;  r<  r   r9  r:  r  r   rb  r  r  r   r  rK  r  r  r  rl  r*  r  rj  r   rH   r  r   r  r  ) r  r  r&  r  r  broadcasted_valuesaccumulatorscse_computecombine_helper_fnrq  r  r  value_dtyper  reduced_sizer!  reduced_size_strr;  r  partial_scan_varsr  partial_scan_varpartial_reduce_vars	accs_nextfull_scan_vars	full_scanpartial_scanr  acc_nextpartial_reducerO  r  s    `                              @r   scanTritonKernel.scan  s    $$$$--5v5-MD<L<LMM% u??N$NN"FvFF''(9(94<<H --j&I%%'$*A*AA/LE((++'1%89;kk	 , K HH%%";-r$2E2E2G1HJD0023	 & E %%e,&u-H,,,#335#&R "hhooEoN%&tyy'>&?q#A ,1,C,C.		##"m;/?.@7)2hZWXY ##K09 0<	=	&$ )#C(:$;#<CuBGXFYYZ[	
 ((! ):# ):$ 12B1CCtu-.>.D.DE-.>?
 ):   # #5#6>Q8RSI'l(;=NON 03>CT/U 0V+I| ,YKr,qI&,,&,,
 0V   :=<)<:5+~ &&"m#<XJbHXXYZ: ,K%Jj*;<<<<#-e#4J  & [!!?#s   /N50N:c                h  ^ ^ T R                   (       d   eT R                  (       a   S5       e[        S T R                   5       5      nT R	                  U5        [        U5      nT R                  (       a   S5       eT R                  (       d   S5       e[        R                  " T R                  R                  T R                  5      nT R                  5       T R                  -
  n[        S U 5       5      n[!        U5      [!        U5      :X  d   e[#        U5       VV	s/ s H;  u  pU" SU	 ST R%                  5        S3X   [        T R'                  5       5      S	9PM=     n
nn	S
 mUU 4S jnT R                  S   R(                  (       d   eT R+                  T R                  S   5      (       a  SOSn[!        U5      S:X  a%  SU
S    SU
S    SU SU SU SU S3nU" XXQ5      nO[-        S5      e[/        X5       H  u  nnX_l        UR2                  Ul        M     [        U5      $ s  sn	nf )Nr  c              3  >   #    U  H  oR                    S 3v   M     g7fr  r  r  s     r   r  $TritonKernel.sort.<locals>.<genexpr>  r  r  z(ops.sort not supported inside ops.maskedz3ops.sort is only supported in persistent reductionsc              3  8   #    U  H  n[        U5      v   M     g 7fr   r  r  s     r   r  r    r  rj  r  r  r  r5  c                2    SR                  S U  5       5      $ )NrD  c              3  *   #    U  H	  o S 3v   M     g7fr  r   r  s     r   r  1TritonKernel.sort.<locals>.csv.<locals>.<genexpr>  r  r^  r  r  s    r   r  TritonKernel.sort.<locals>.csv  r  r   c           	     d  > [        U5      n[        U5       Vs/ s H  oP SU SU 3PM     nn[        U4S jU 5       5      (       a,  U Vs/ s H  nTR                  R	                  U5      PM      sn$ [        X15       VV	s/ s H(  u  pTR                  R                  XR                  S9PM*     n
nn	TR                  R                  T" U
5       SU  35        [        X5       H-  u  pU(       a  X+l
        TR                  R                  X{5        M/     [        U
5      $ s  snf s  snf s  sn	nf )Nr  c              3  Z   >#    U  H   nTR                   R                  U5      v   M"     g 7fr   r  r  s     r   r  :TritonKernel.sort.<locals>.cse_multiple.<locals>.<genexpr>  r  rD  r5  r   r  )r"  r  r  r  r  rZ  r  rn  r  r  r  rO  r  r  s               r   r  'TritonKernel.sort.<locals>.cse_multiple  s   &'A;@8D8aF"QCr%18JDLLLLAKLIY/LL %($C$CLE e;;?$C   LL""{#$Cv. *-[)E%
+0(Y3 *F %% ELr  rU  r   rnumelr!   ztriton_helpers.sort_with_index(r   rN   z	, stable=z, descending=zUnhandled sort)rc  rS  r   ro  r  r+  r  r  r;  r<  r   r9  r:  r   rb  r   r   r  r  rK  rf  r,  rj  r  r  r  )r  r  r  stable
descendingr  r  rq  rZ  r  r  r  r  r"  r  rO  	input_varr  s   `                @r   r  TritonKernel.sort  s5    $$$$--5v5-MD<L<LMM% u??N$NN"(( 	
A	
(  ''(9(94<<H%%'$*A*AAFvFF6{c&k))) &f-
 . "5'D,?,?,A+B!DiD0023
 . 	 
	=	&$ #00002243C3CB3GHHhv;!12DQ2G1HK]^_K`Ja b82cU)F8=AO  'tOK !122%(%=!J	#(  ) 0 0J &> [!!]
s   "AH.c                    U R                   (       d  gUR                  U R                   5        U R                   R                  5         U R                  R                  5         g)z
Generate the output from prologue. This should be
extracted from the subgraph, which is why this is
partitioned from codegen_body.
N)rB  r   clearrA  )r  rT  s     r   codegen_prologueTritonKernel.codegen_prologue  s@     }}DMM"!!#r   c                <   U R                   (       dV  U R                  (       dE  U R                  (       d4  U R                  (       d#  U R                  (       d  U R
                  (       d  gU R                   Vs/ s H  oR                  (       d  M  UPM     nnU R                  (       Ga  U R                  (       d   S5       e0 n[        U R                  5       H  u  pEUR                  n[        R                  R                  U[         R"                  5      nU R%                  [&        U5      nSU 3nU R(                  R+                  U SU S35        U R,                  R/                  U[         R"                  SS9X8'   M     U R(                  R+                  S5        U R(                  R+                  S	5        U R(                  R1                  S
S9   U R3                  5       (       d[  U R                  S   n	U	R4                  S:X  d   eU	R4                  n
U R(                  R+                  U
 SU	R6                   SU
 S35        U R(                  R9                  U R                   5        U R(                  R;                  S/5        U R(                  R9                  U R                  5        U R(                  R9                  U R                  5        U R(                  R9                  U R                  5        U R(                  R9                  U R
                  5        [        U R                  5       H  u  pEUR<                  nSU 3n[        R>                  " UR                  [         R"                  5      n[A        UR                  5      nU R,                  RC                  U R(                  U SU S3URD                  SS9nSSK#nURH                  RJ                  RM                  U SU R(                  5         U" X8   U5      nSSS5        U R(                  R+                  U SW 35        M     SSS5        [O        [Q        U R                  5      5       H%  nU R(                  R+                  SU SU S35        M'     GO;U RR                  (       Ga  [Q        U5      S:  Ga  [        U5       GH  u  nnU R(                  R1                  US9   UR4                  nU RT                  (       a  SOSnU RT                  (       a  SOU S3n[         RV                  RX                  (       a  [[        5       S:  a  SnOSnU R(                  R+                  S U S!U S"U S"UR]                  5        S#U S$35        SSS5        U R(                  R1                  US
-   S9   U R_                  XR(                  5        SSS5        GM     U R(                  R1                  [Q        U5      S9   U Ra                  U R(                  5        U R(                  R9                  U R                   5        U R(                  R9                  U R                  5        U R(                  R9                  U R                  5        U R(                  R9                  U R                  5        SSS5        [c        / [        U5      Q5       GHr  u  nnU R(                  R1                  US
-   S9   U Rd                  URf                     Ri                  5        H  u  nnU[Q        U5      S
-
  :  av  UUS
-      nU Rd                  URf                     U   n[j        Rm                  U5      n[o        URp                  U5      n[s        UU5       VVs/ s H  u  nnUUU-  -
  PM     nnnU R(                  R+                  [u        U Rv                  U   U S%U S"[x        Rz                  R}                  U5       S&35      5        M     SSS5        U R,                  R                  U R                  5        UR                  5         GMu     OU R(                  R9                  U R                   5        U R(                  R9                  U R                  5        U R(                  R9                  U R                  5        U R(                  R9                  U R                  5        U R(                  R9                  U R                  5        U RT                  (       ai  U R                  (       d  U R
                  (       aG  U R                   S'3nU R(                  R9                  S(U S)3S*S+9  U R                  R                  5         U R                  (       d%  U R(                  R9                  U R
                  5        U R                   R                  5         U R                  R                  5         U R                  R                  5         U R                  R                  5         U R                  R                  5         U R
                  R                  5         gs  snf ! , (       d  f       GN6= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GM  = f! , (       d  f       GN= fs  snnf ! , (       d  f       GN= f),z
Concat output code from index_code, loads, compute, stores,
suffix into self.body.

For pointwise kernels, this is called just once at the end.

For reduction kernels, this generates a loop over the reduction
axis.
Nz1Mix order reduction requires persistent reductionaccumz = tl.full([R0_BLOCK], z, tl.float32)[None, :])r   R0_BLOCKr5  z/split_size = min(RSPLIT_SIZE, xnumel - xoffset)z@for _ in tl.range(0, split_size, XBLOCK, num_stages=NUM_STAGES):rN   )r   r   rX  mask =  < r  zxindex += XBLOCKr>  z, 0))r  r:  r   z&tl.store(ws_ptr + (tl.program_id(0) + z3 * tl.num_programs(0)) * r0_numel + r0_index, accumz
, r0_mask)rsplit_startr  
rsplit_end)r   r!   z, num_stages = 2r   zfor zoffset in tl.range(r  r   rl  z = tl.advance(r  z + tl.program_id(1)zR
                if HAS_RSPLIT:
                    triton_helpers.x_grid_barrier(r  Tr\  )Findexing_coder'  r  r:  rC  rD  ro  r  mix_order_reductionr  r  r\  r   r#   r6  r
  r   r1  r  r`   rj  r   r   r  r  r  r   r   r   
writelinesr  r  r   r9  r  unittestmockpatchobjectrm  r   rc  rS  r  r  r   r   r  r  rl  rJ  r   r  r   r   r   r  r  rU   rI  rL   r   r  
invalidaterE  cache_clearrz  r{  r  r  )r  r   
loop_treesaccumname2varr  partial_accumr   r;  r   entryrX  r   r&  triton_reduction_functionnewvalr  r'  levelr   
loop_startloop_end
num_stagesrF  advancement	prev_treeprev_advancement
prev_blockprev_num_itercurprevsem_ptrs                                  r   codegen_bodyTritonKernel.codegen_body  s    zz{{||%%##'+'7'7H'7t<<d'7
H###,, C, M&/0M0M&N"!.!=!=,,::>5;;W33M7Kse}		##f3G9<RS '+hh&7&73D '8 '# 'O II QRIIR !!!+//11 ,,Q/E <<3...AII''1#WUZZLA3e(LM		  !3!34		$$*
 		  ,		  .		  -		  !5!56 +4D4Q4Q*R&C'--C"3%=D!#!<!<%44ekk"J 1N%441- "XX..		45Qse4@!ii+	 / F $!,,33D)TYYO",)/"# P
 II''4&G9(=>- +S' ,V S!>!>?@		##<SEAtuxty  zD  E A
 """s:':(4tYY%%U%3![[F373M3MSVJ(,(B(B6(RWHX 
 }}((-?-AF-J%7
%'
II''vh&9*RzQSTZT`T`TbScchishttvw 4 YY%%UQY%788yyI 87!  5( !!Z!9..tyy9		  !3!34		  ,		  .		  - :  ((@)J*?(@AtYY%%UQY%7262K2K		3eg3.	; !3z?Q#66(2519(=I/3/H/H )0'0), *7)E)Ei)PJ,3IOOZ,PM 25[BR1S+1SIC !$d]&: :1S ( +
 		++( $ 8 8 C#,+^I;bI^I^_jIkHllm n!3 84 ##D$:$:;  "9  B< IIT//0IITZZ(IIT\\*IIT[[)		//0%%""d&:&:--..ABGII33:) <    66BBD''IIT112  "

$$&""$s Ir POI ,+f 43 87 :9,+ 87sw   6h&h&H h=0h+<,h=B%ii!B0i4Bj)j>Aj+
h:5h==
i
i	!
i1	4
jj
j	c                   / nU R                  5       (       Ga^  / nU R                  SU/ 5        U GHB  n[        U[        5      (       a  UR	                  [        U5      5        M5  [        U[        5      (       ag  [        R                  R                  R                  UR                  U R                  [        R                  S9nUR	                  [        U5      5        M  [        U[        R                   5      (       a^  [        R                  R                  R                  UU R                  [        R                  S9nUR	                  [        U5      5        GM.  [#        S[%        U5       35      e   U$ )Nr   rK  fallbackz!Unsupported numel argument type: )r  add_numel_to_call_argsr   r  r  r   rl   rL   rZ  rR  	size_hint
inner_exprrK  r"   unbacked_symint_fallbackr   r   r  rF  )r  r  
numel_argsr  hints        r   kernel_benchmark_extra_args(TritonKernel.kernel_benchmark_extra_args  s    !!+-J''J;!c3''KKC)_5577++55&*&8&8!'!@!@ 6 D
 KKD	*UZZ0077++55&*&8&8!'!@!@ 6 D
 KKD	*$'Hc%TUU% "& r   c                   [        5       nU R                  R                  5       u  p4pVUR                  / SQ5        UR	                  5          [
        R                  " 5       n/ n[        XE5       GHd  u  pS[        U5       3n[        R                  R                  U	5      nU(       a  [        R                  R                  R                  UR                  5       U R                  [         R"                  S9n[        R                  R                  R                  UR%                  5       U R                  [         R"                  S9nUR'                  U SU SU SUR)                  5        SUR+                  5        S3
5        GO=U	[        R                  R,                  ;   a  [        R                  R,                  U	   n[        R                  R                  R                  UR/                  5       U R                  [         R"                  S9n[        R                  R                  R                  UR1                  5       U R                  [         R"                  S9nUR'                  U SU SU SUR2                   SUR4                   S3
5        GO.[7        U
[8        5      (       at  [        R                  R                  R;                  U
R<                  U R                  [         R"                  S9nS	U
R>                  ;   a  S
nUR'                  U SU 35        O[7        U
[@        5      (       a  [        R                  RC                  5       n[        R                  R                  R;                  U
R                  U R                  S9nUR'                  U SU SU SU
R4                   S35        O[E        SU	 35      eURG                  U5        GMg     URI                  U RK                  5       5        UR'                  SSRM                  U5       S35        SSS5        UR                  / SQ5        [        R                  RC                  5       nURN                  nUR	                  5          UR'                  S[        R                  RP                  RS                  U5       S35        UR	                  5          UR'                  [        R                  RP                  RU                  U5      5        SU 3nUR'                  U SU S35        UR'                  [W        [X        RZ                  5       SU S35        SSS5        SSS5        UR                  / SQ5        UR	                  5          UR'                  S[        R                  RP                  RS                  U5       S35        UR	                  5          UR'                  [        R                  RP                  RU                  U5      5        UR'                  S[W        [X        RZ                  5       S35        SSS5        SSS5        UR                  / SQ5        UR	                  5          UR'                  S5        UR'                  S5        UR'                  S5        UR'                  S[        R                  RC                  5       R\                   S35        UR'                  S U 35        UR'                  S!5        UR'                  S"5        SSS5        U$ ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN)= f! , (       d  f       U$ = f)#a  
Generates Python code for benchmarking this Triton kernel.
- Creates example inputs (random tensors, constants, sizes).
- Runs the kernel on the current GPU/stream.
- Prints runtime (ms) and throughput (GB/s) using `num_gb`.
Args:
    num_gb (float): The number of gigabytes to use for throughput calculation.
Returns:
    IndentedBuffer: A buffer containing the generated Python benchmark code.
)r   r   zdef get_args():arg_r  z = rand_strided(r  z
, device='z	', dtype=r  rh  r   r   )rK  z = torch.zeros(z*Don't find the buffer or const tensor for r|  r  N)
r   zdef call(args):zwith r  streamz = get_raw_stream(z.run(*args, stream=)r   r   z def benchmark_all_configs(args):z.benchmark_all_configs(*args))r   r   zif __name__ == '__main__':z<from torch._inductor.runtime.benchmarking import benchmarkerr   zargs = get_args()z7ms = benchmarker.benchmark(lambda: call(args), device='z
', rep=40)z	num_gb = zgb_per_s = num_gb / (ms / 1e3)z<print(f"{ms:.3f}ms    {num_gb:.3f}GB    {gb_per_s:.2f}GB/s"))/rV   r  python_argdefsr  r  r  rF  r  r  rL   rZ  try_get_bufferrR  
size_hintsget_sizerK  r"   r  
get_strider   
get_devicer  	constantsr  rs  devicer  r   r\   r  r   r   r^   r  KeyErrorr  extendr  r*  r  
device_opsdevice_guard
set_devicer   rA   KERNEL_NAMErF  )r  num_gbru  _argdefs	call_argsr  r\  name_cnt	var_namesarg_namearg_sigvar_namer(  r  rs  const_tensorsymval_hintr	  rF  current_devicer  stream_names                         r   codegen_kernel_benchmark%TritonKernel.codegen_kernel_benchmark  s-     !,0II,D,D,F)Y56]]_ (HI%(%>!!$x.!12gg,,X677++66&*&8&8!'!@!@ 7 D
 WW--88(&*&8&8!'!@!@ 9 F
 $$#*$4TF"VHJs~~O_N``ijmjwjwjyizz{| !2!22#$77#4#4X#>L77++66$))+&*&8&8!'!@!@ 7 D
 WW--88$++-&*&8&8!'!@!@ 9 F
 $$#*$4TF"VHJ|ObObNcclmymm  mA  AB  C  11"#''"2"2"<"<&*&8&8!'!@!@ #= #K %4&'$$z[M%BC66WW@@BFGG,,66T5G5G 7 E $$#*OE7*VHIV]VcVcUddef #DXJO    *u &?v T==?@wtyy';&<A>? B 	9:<<>$$]]_uQWW%7%7%D%DU%K$LANO  GG&&11%8 !'ug.  K=0B5'!KL  ;22344G}TUV !  	JK]]_uQWW%7%7%D%DU%K$LANO  GG&&11%8   c+"9"9:;;XY	 !  	DE]]_N R 01I!''JmJmJoJtJtIuu  A y12=>N   [ _L ! _  ! _ _  sf   O\/A\*<B\?\*2A]?A%\<$]B%] 
\
\'	"\**
\9<
]	]
] 
]/c                    [         R                  " SR                  [        R                  R
                  R                  S5      5      5      $ )Nzl
            from torch._dynamo.testing import rand_strided
            {}
            import torch
        get_raw_stream)textwrapdedentr  rL   rZ  r  import_get_raw_stream_asr  s    r   imports_for_benchmark_kernel)TritonKernel.imports_for_benchmark_kernel1  s:     F177%%>>?OPQ
 	
r   c                    U R                   (       a  gU R                  (       a  gU R                  (       a  U R                  (       d   egU R                  (       a  gg)Nr#  rS  r  r5  	pointwise)r#  rS  r  rc  r  s    r   _get_heuristicTritonKernel._get_heuristic:  sD    !''*&&(((()""r   c                 z   [         R                  R                  R                  5       [        R
                  [        R                  [        R                  R                  [        R                  [        R                  [        R                  [        R                  [        R                  [        R                  R                  [        R                  R                  [        R                  R                   [        R"                  [        R$                  R&                  S.n [        R(                  (       a  [         R*                  " 5       U S'   [         R,                  R.                  b  SU S'   [        R0                  " 5       (       a  SU S'   [        R2                  (       aL  [        R2                  U S'   [        R4                  U S'   [        R6                  U S'   [        R8                  U S	'   [        R:                  (       a9  [        R:                  U S
'   [        R<                  U S'   [        R>                  U S'   U $ )N)backend_hashassert_indirect_indexingautotune_local_cacheautotune_pointwiseautotune_remote_cacheforce_disable_cachesdynamic_scale_rblockmax_autotunemax_autotune_pointwisemin_split_scan_rblockspill_thresholdstore_cubinr_  r`  $are_deterministic_algorithms_enabledTis_hipr  profile_bandwidthprofile_bandwidth_regexprofile_bandwidth_output/profile_bandwidth_with_do_bench_using_profilingcoordinate_descent_tuning coordinate_descent_search_radius'coordinate_descent_check_all_directions) r   r%   _tritontriton_hash_with_backendr"   r+  r,  r   r-  r.  r/  r0  r1  r2  r3  r4  r5  r_  rL  r`  *write_are_deterministic_algorithms_enabledr6  r  r  r  r8  r9  r:  r;  r<  r=  r>  )inductor_metas    r   inductor_meta_common!TritonKernel.inductor_meta_commonF  s    "KK//HHJ(.(G(G$*$?$?"(--"B"B%+%A%A$*$?$?$*$?$?"//&,&C&C%+]]%H%H%}}<<!==44#11.4.A.A.`.`
" <<::< @A ==(&*M(#)-M+&##171I1IM-.7=7U7UM348>8W8WM45FF KL ++00 56 77 <= >> CD r   c                  ^-^. [        5       n0 nU R                  R                  5        H  u  pE[        U5      (       a  U R                  (       d  M(  [
        R                  R                  R                  U5      n[        U[        [        R                  45      (       d  SnO[        [        U5      5      nXsU'   M     Uc  UR                  [        5       5        [
        R                  R!                  5       R"                  nUS:X  a  UR                  S5        OUR                  S5        [$        R&                  (       a  UR                  U R)                  5       5        U R*                  R-                  5       u  m-n	m.n	[/        T.5       H  u  p[        U[0        5      (       d  M  [3        [        R4                  UR6                  5      nU[
        R                  R                  R8                  ;   d  Mj  [1        UR:                  [
        R                  R                  R8                  U   5      T.U
'   M     [=        5       nU R>                   GH'  nXR*                  R@                  ;   a(  URC                  U R*                  R@                  U   5        XR*                  RD                  ;   am  U[
        R                  RF                  ;  aO  XRF                  ;  a@  URC                  [3        [H        U R*                  RD                  U   5      RJ                  5        XR*                  RL                  ;   d  M  U R*                  RL                  U   n[        U[N        5      (       a   eURC                  U5        GM*     [Q        T-T.5       HX  u  nn[        U[R        5      (       d  M  URT                  [V        RX                  :X  d  M=  URC                  UR:                  5        MZ     [[        U5      nU R]                  5        H[  n[1        UR^                   S3UR`                  5      nT.Rc                  U5        T-Rc                  [e        UR:                  5      5        M]     U-U.4S jnU Rf                   HY  nURh                  (       a  U Rj                  (       a  M'  URl                  c  M6  U" UR^                  Ro                  5        S35        M[     U Rp                  (       a  U" S	5        U Rr                  (       a  U" S
5        U" S5        [u        T.U Rv                  T-S9nU[x        Rz                  " [
        R                  R!                  5       5      0 [|        R~                  R$                  R                  R                  =(       a7    S[        U R                  5      ;   =(       d    S[        U R                  5      ;   S.n[
        R                  R                  =(       d    [
        R                  R                  nU R                  5       R                  [        U R                  5      [        [        R                  5      UUU R                  U R                  U R                  U R                  U R                  S.
U R                  5       EnU Rr                  (       a  U R                  US
'   [$        R                  (       d  [$        R                  R                  (       a"  U R                  =(       d    U R                  US'   [
        R                  R                  Ri                  5       =(       a    U Rj                  (       + nU R                  n[        U R                  5      S:H  nU(       Ga  U(       Ga  U R                  R                  U R                  5      nUR                  R                  R                  S   nUR                  nUb+  SU;   a%  SU;   a  US   [        US   S5      -  nU[        :  nO,U R                  R                  U5      [        R                  :H  nUR                  R                  R                  n UR                  R                  R                  n![
        R                  R                  R                  U [$        R                  S9[        [
        R                  R                  R                  U![$        R                  S9S5      -  n"U"S:  a  U(       a  [
        R                  R                  R                  U R                  R                  S5      (       aN  [
        R                  R                  R                  U R                  R                  S5      (       a  US::  a  SUS'   U R                  (       a  U R                  US'   U R                  (       a  U R                  US'   U Rp                  (       a  U Rj                  US'   Sn#[$        R&                  (       d  [$        R                  (       a  U R                  5       S -  n#U#b  U#US!'   [$        R&                  (       a  U R                  5       n$U$b  U$US"'   [        T.5      /US#'   U R                  5       US$'   [        T.5       H  n%SUS%   T.U%   R:                  '   M     [$        R                  (       + US&'   UU lw        U R                  U R                  5        U R                  5         U R                  U R                  5        U R                   H%  n&UR                  S'5        UR                  U&5        M'     U R                  (       a5  S(U R                  5        S)U R                  R$                  < S*U< S+U< S,3	n'OU R                  (       aI  U R                  R                  U R                  5      n(S(U R                  5        S-U< S.U( S*U< S+U< S,3n'O_S'n)[        U5      S:X  a  [        [        T.5      5      S/:X  a  S0n)OS1n)S(U R                  5        S-U< S2U) S3U< S+U< S4U GR                    S,3n'UR                  U'5        U=(       d    [        [        GR                  5      n*UR                  S5U* S6S2GR                  S7 T- 5       5       S835        UGR                  5          [$        R                  GR                  (       a  UR                  S9U* S:35        U GR                  U5        U R*                  GR                  5        H  u  n+n,UR                  U+ S;U, 35        M     UR                  U R                  5        [$        R                  GR                  (       a  UR                  S<U* S:35        SSS5        [$        R&                  (       a!  UR                  U GR                  U#5      5        UGR                  5       $ ! , (       d  f       NU= f)=z
Convert the TritonKernel from Inductor SIMD IR to triton code, including inductor triton heuristics, imports,
metadata, and benchmarking infra.
i    Ncpuz"triton_helpers.set_driver_to_cpu()z"triton_helpers.set_driver_to_gpu()r  c                   > [        5       (       a  TR                  [        U 5      5        TR                  [        U SS95        g )NT)is_constexpr)rG   r  rR   rP   )r  argdefsr  s    r   add_constexpr_arg6TritonKernel.codegen_kernel.<locals>.add_constexpr_arg  s2    -//  h!78NN78$?@r   r   rX  RSPLIT_SIZE
NUM_STAGES)
size_dtyperI  ztl.dot)r  r	  r  native_matmul)
	grid_typerP  kernel_namemutated_arg_namesoptimize_memr`  r  num_load	num_storenum_reduction"has_loadstore_with_contiguous_rdimr!   r   rX  r0_rN   )r  g?i   i   
   Tadd_persistent_rblocktiling_scoresr$  r  g    eAkernel_num_gbkernel_flopconfigs
launch_pdlr  enable_fp_fusionr   z$
                @triton_heuristics.z(
                    config=zI,
                    filename=__file__,
                    triton_meta=z$,
                    inductor_meta=z;
                )
                @triton.jit
            z!(
                    size_hints=z%,
                    reduction_hint=r  ztile_hint=TileHint.SQUARE,ztile_hint=TileHint.DEFAULT,r  zH
                    filename=__file__,
                    triton_meta=z*,
                    min_elem_per_thread=zdef r>  c              3  @   #    U  H  oR                  5       v   M     g 7fr   )	full_namer?  s     r   r  .TritonKernel.codegen_kernel.<locals>.<genexpr>  s     *J'Q;;=='r  rl  zpl.enter_scope("rI  r   zpl.exit_scope(")rV   rd  r  rB   rc  rL   rZ  rR  r  r   r  r   r   r5   r   r   r  rF  r"   benchmark_kernelr#  r  r  r  r\   r
   r   r   inv_precomputed_replacementsr   r   	mutationsinput_buffersr  r  removed_buffersrW   
inner_nameoutput_buffersr[   r  r^   	zero_moder_   ZERO_ON_CALLr+  r   r   r  r  rP   ro  rf  r  r   r   rS  r  rk   r  r0   rv  r   r9  r   rO  r   rj  r:  is_inferenceis_backward_get_grid_typer   setrP  rA   DESCRIPTIVE_NAMEr`  r  rT  rU  rV  rC  rsplit_sizer_  rL  r`  rW  rk  r   re  r[  r   rY  memory_stats
persistentmemoryrq  count_per_threadr{   INNER_REDUCTION_RATIO_THRESHOLDget_reduction_hintr1   INNERloopedbytesr  r  statically_known_leqreduction_numelstatically_known_gtr$  r8  estimate_kernel_num_bytesestimate_flopsrg   r-  rh   emulate_precision_castsrQ  r  r  rY  r9  r   r#  r'  ri   r  r  r*  r  r   codegen_static_numelsaliasesr  r   )/r  r   rT  r  r   r  
numel_hintr  device_typer\  rZ  r  r  mutated_argsmutationmutation_argargnamer   sizeargrJ  triton_meta_signaturerQ  rS  rB  
looped_redr[  	two_d_redrs  	dim_statsmem_ops_per_threadr_coalesce_ratiocontiguous_red
looped_mempersistent_memsaved_bytes_ratior  flopsarg_numrt  heuristics_linereduction_hint	tile_hintrQ  oldnewrI  r  s/                                                @@r   codegen_kernelTritonKernel.codegen_kernelu  s    
![[..0MF"6**43H3H))77>Jj3*>?? !	+C
O<	!*v) 1, <KK134''==?DDKe#@A@A&&D==?@#'99#;#;#= Iq	*FA#w'' ellCHH5QWW--JJJ#*!''"2"2"O"OPV"W$IaL + )3H99222  !8!8!BCII555AGG$;$;;$8$88  )B)B8)LMXX 99333#yy77A%lJ????  . '6  3LGS3--MM%6%C%CC  . 4 l+++-DU3TZZ@GW%NN77<<01 .	A $$D  T%>%>&!2!2!4 5U;< % %%h'##m,l+ 1$"2"2G!
 /&--agg.Q.Q.ST&&--;; RTYY/P8s4<<?P3P'
 ww++Bqww/B/B ,,.77!$"5"56{;;<!-( $ 5 5!//
 '')
 ##+/+;+;M-(6#6#6#U#U22 766 >? XX&&335Wd>W>W:W
**$)	)==55dkkBL$//66::1=I!*!;!; )=(]* $1#7#mC>PRS:T#T !15T!T MM44]C$**+ 
 &,,3399J)44;;AAN ! 0 0 : :V%D%D !; !  **"V-L-L +  	! "S("GG$$99MM115  GG$$88MM114  '",9=56-1-?-?M/*##373K3KM/0%%484M4MM01""f&>&>335;F!17o.""'')E /4m,"+I"6!7I$($<$<$>L! +95G@AK$Yw%7%<%<= 6.4.L.L*L&'&dii(#++FNN2KK , #$$($7$7$9#: ; --447 8!!, 0##0"3 4O ""!]]==d>P>PQN#$$($7$7$9#: ;  *~ .$$2#3 4!!, 0##0"3 4	O I:!#/	:;q@ <I =I#$$($7$7$9#: ;  *~R	{ ;!!, 0##0"3 4))-)A)A(B C	O 	O$:c+"9"9:;-q*J'*J!J K2N	
 [[]}}---!1+bAB&&t, II--/S#c#/0 0KK		"}}---R@A  ""KK55f=>}} ]s   Cv>>
wc                   [         R                  R                  R                  U 5      n [	        U [
        R                  [        45      (       a  [        U 5      n[        U5      nU$ Sn[         R                  R                  R                  X5      (       dI  US:  a  [        SU  35      eUS-  n[         R                  R                  R                  X5      (       d  MI  U$ )Nr!   i @  z!Failed to find static RBLOCK for )rL   rZ  rR  simplifyr   r   r   r  r5   r|  r  )r  r  s     r   r  #TritonKernel._get_persistent_RBLOCK  s    !!**62fu}}c233f+C!#&C 
 Cgg&&;;FHH?$'H%QRRq gg&&;;FHH
 Jr   c                P     [         R                  U 5        g! [         a     gf = f)NTF)r  r  r  )r  s    r   has_persistent_RBLOCK"TritonKernel.has_persistent_RBLOCK  s*    	//7 		s    
%%c                R   S
S jnU R                    GH  nUR                  (       a  U R                  (       ai  [        R                  R
                  R                  UR                  5      nU" U5      (       a)  UR                  UR                   S[        U5       35        UR                  (       a  U R                  (       a  U R                  (       a1  U R                  U R                  UR                  5      5      nSU S3nO8U R                  UR                  5      nU R                   (       a  [#        US5      nUR                  UR                  R%                  5        SU 35        UR                  S:X  d  GMl  U R&                  (       d  GM  UR                  S5        GM     g	)ay  
We get a small speedup from hard coding numels if they are static.

This code stomps on the passed-in values by writing an constant to the top of the kernel.

In a kernel like:
def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):

We would add
xnumel = 4096
r0_numel = 768

After the signature, before the kernel code, if we decided to make these static. As its hardcoded, it becomes
a better signal to triton on how to unroll and do some static indexing. So, it's not so much that downstream
knows that its a static numel, as that you just plop a constant into the kernel.
c                B    [        U [        R                  [        45      $ r   )r   r   r   r  r   s    r   is_static_integer=TritonKernel.codegen_static_numels.<locals>.is_static_integer  s    dU]]C$899r   znumel = z*triton_helpers.constexpr_next_power_of_2((z + RSPLIT - 1) // RSPLIT)r  zBLOCK: tl.constexpr = rX  zXBLOCK: tl.constexpr = 1N)r   r   r   r   )ro  rf  rc  rL   rZ  rR  r  r  r   r   r  r  rS  r:  r  r  rJ  r{   r   r`  )r  rT  r  r   simplified_tree_numelr  r  s          r   r  "TritonKernel.codegen_static_numels  s,   $	: $$D$$(=(=()(8(8(A(A$**(M%$%:;;NNdkk](3?T;U:V#WX  T%>%>-- JJt';';DJJ'GHEFugMfgC55djjAC,,!#rl$++"3"3"5!66LSERS{{c!dmmm9:' %r   c                H   [        U R                   Vs/ s H  n[        UR                  (       + 5      PM     sn5      nU R                  (       a  US:X  d   e[
        R                  $ U R                  (       a  US:X  d   e[
        R                  $ US:X  a  [
        R                  $ US:X  aN  [        [        U R                  U R                  5      5      (       a  [
        R                  $ [
        R                  $ US:X  a  [
        R                  $ [!        SU 35      es  snf )NrN   r!   r   z"Unsupported number of dimensions: )r  ro  r  rf  r  r-   MixOrderReductionGridrS  CooperativeReductionGridGrid1Drz   r,  r  Grid2DWithYZOverflowGrid2DGrid3Dr  )r  r   r  s      r   ro  TritonKernel._get_grid_type  s    8H8HI8H***+8HIJ##6M6$:::''6M6$===!V$+++!V3t22D4D4DEFF(===$+++!V$+++=aSABB Js   $Dc                   U R                    H  n[        UR                  [        R                  [        R
                  45      (       a  UR                  nO)[        R                  R                  R                  X5      nUR                  (       a  U R                  (       d  M  UR                  U5        UR                  [        U5      5        M     g r   )ro  r   r  r   r   r   rL   rZ  wrapper_codegenerate_numel_exprrf  rc  r  rF  )r  r   r  	arg_typesr   r   s         r   r  #TritonKernel.add_numel_to_call_args  s    $$D$**u}}ell&CDDzzww++??K$$(=(=(=  &  d, %r   c                |   [         R                  R                  nUR                  5         U R                  R                  5       u  pVpWU R                  XU5        U R                  R                   H  nUR                  U5        M     UR                  UUSUU R                  S9  U(       a  U R                  5         g g )NT)r   r  rQ  )rL   rZ  r  write_triton_header_oncer  r  r  workspace_argsgenerate_workspace_allocationgenerate_kernel_callrQ  deallocate_workspaces)	r  r   r`  deallocate_wswrapperr\  r  r  wss	            r   call_kernelTritonKernel.call_kernel  s     ''&&((*%)YY%=%=%?"a##DY?))**B11"5 + 	$$(( 	% 	
 &&( r   c                   [         R                  R                  nU R                  R	                  5       u  p#pB[        X45       H  u  pV[        U[        5      (       d  M  [         R                  R                  (       a  UR                  SU SU S35        MU  SU S3nUR                  U5        SU S3nUR                  U5        M     g )Nz:AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_check_inf_and_nan("z", z));zassert not z.isnan().any().item()z.isinf().any().item())
rL   rZ  r  r  r  r  r   r]   cpp_wrapperr   )r  r  r\  r  arg_signaturesr  arg_signaturer"  s           r   codegen_nan_checkTritonKernel.codegen_nan_check/  s    ''&&*.))*B*B*D'n"%i"@C-3377&&%%TUXTYY\]`\aade )-BCD%%d+(-BCD%%d+ #Ar   c                    [        U0 UD6$ r   )r  )r  r  r  s      r   r_  TritonKernel.create_cse_var>  s     $1&11r   c                \   UR                    SU R                  U R                  UR                  5      5       3nUR                  R
                  (       d!  U R                  (       a,  UR                  S:X  a  U R                  R                  U5        g U R                  R                  U5        g )Nr   rX  )r   r:  r  r   rootr  r  r   r  r   rj  )r  r  r"  s      r   codegen_iteration_ranges_entry+TritonKernel.codegen_iteration_ranges_entryA  sy    **SD,@,@,L!M NO ::$":":u||s?R((. II%r   c                >   UR                   c   eU R                  UR                   5      nU R                  nUS:w  a  SU S3OSnU R                  (       a'  U R                  (       a  UR
                  (       a  U S3nSUR                  R                  5        SU U 3$ )Nr  r  r  r   z + rsplit_startztl.arange(0, zBLOCK))r   indexing_size_strr  rS  r  rf  r   r   )r  r  r  r  r@  s        r   r  )TritonKernel.iteration_ranges_ranges_codeL  s    +++%%e&6&67&&*5*C4}A&&&))""x/Fu||1134F4&IIr   c                ^    U R                   nU R                  5       nS/U-  nSU SU SU S3$ )NrN   r  r  r  )r  r   )r  r  r  r  r   r  s         r   iteration_ranges_scalar_code)TritonKernel.iteration_ranges_scalar_codeY  sC     &&&&(sTz$r%;-q99r   c                0   UR                   c   eSUR                    S3nU R                  U5      (       a#  SU SUR                   S-    SUR                    S3nUR                  R                  X"5      nU R                  S:w  a  U S	U R                   S3$ U$ )
Nztl.program_id(r  r>  z + tl.program_id(rN   z) * tl.num_programs(rE  r  r  )rx  r  	pid_cacher  r  )r  r  r  pids       r   iteration_ranges_get_pid%TritonKernel.iteration_ranges_get_pida  s    ~~)))u~~.a0 &&u-- cU+ENNQ,>+??STYTbTbSccefCoo!!#+z)U$t//022
r   c                   UR                   S:H  =(       aq    UR                  (       + =(       aY    U R                  (       + =(       aA    [        R                  R
                  R                  UR                  [        5       5      (       + $ r  )	rx  has_zdimrS  rL   rZ  rR  r|  r  r4   )r  r  s     r   r  #TritonKernel.needs_yz_grid_overflowp  sa    NNa YNN"Y...Y GG$$99%++~GWXX		
r   c                    U R                   (       a   U R                   UR                  5        S3   $ [        UR                  5          $ )Nr   )r#  r   r2   )r  r   s     r   r"  TritonKernel.max_blockx  s;    $$'7u%=>>//r   c                   U R                   (       a:  [        R                  R                  R	                  UR
                  S5      (       a  gU R                  (       d  gU R                  (       a[  UR                  R                  5        S3U R                  ;   a0  U R                  UR                  R                  5        S3   S:X  a  gOKU R                  (       d:  [        R                  R                  R                  UR
                  S5      (       a  gUR                  (       a-  U R                  (       a  U R                  UR
                  5      nO?UR                  S:X  a  U R                  (       a  SnOU R!                  UR                  5      nUR                  (       a#  U R"                  (       a  X R%                  5       -  n[        R                  R                  R'                  UR
                  U5      (       ae  UR(                  S:g  =(       dO    UR*                  =(       d<    [        R                  R                  R-                  UR
                  [/        5       5      $ g)Nr  Fr   rN   TrX  )rJ  rL   rZ  rR  statically_known_ltr  r=  r#  r   r   r>  r[  rf  r  r  r`  r"  rS  rY  r  rx  r  r|  r4   )r  r   r"  s      r   r,  TritonKernel._has_constant_mask}  s      ww33DJJCC!!DKK$5$5$7#8!>$BSBS!S  DKK$5$5$7#8!>?1D E%%ww77

AFF !:!:33DJJ?I[[CDMMIt{{3I!;!;!OO$55I 7788YOO" W==W77##88^EUV r   c                f    U R                   S   nUR                  S:X  d   eU R                  U5      $ )Nr   rX  )ro  r   r,  )r  xtrees     r   r   TritonKernel._has_constant_xmask  s5      #||s"""&&u--r   c                    U R                    H9  nU R                  U5      (       d  M  UR                  UR                   S35        M;     UR                  S5        g )Nr  r   )ro  r,  r]  r   )r  r  r   s      r   r  TritonKernel.filter_masks  sL    $$D&&t,,!!T[[M"67 %
 	&!r   c                    [        [        R                  5      S U R                    Vs/ s H  n[        U   PM     sn$ s  snf r   )rk  r   r   rb  r   )r  r   s     r   get_reduction_prefixes#TritonKernel.get_reduction_prefixes  sG     ]::;<Ud>U>UV
V tV
 	
 
s   ?c                   U R                    Vs/ s H  o"R                  (       d  M  UPM     nnSR                  [        S U 5       5      5      nUR	                  SU R                  U5       35        U R                    Vs/ s H3  nUR                  (       d  M  [        R                  UR                     PM5     nn[        U5      nUR	                  SU R                  U5       35        gs  snf s  snf )zN
Generates code that flattens ND reduction numels, block sizes, etc. into 1D.
r  c              3  >   #    U  H  oR                    S 3v   M     g7f)r  Nr  r  s     r   r  8TritonKernel.codegen_reduction_numels.<locals>.<genexpr>  s     "U_Tkk]%#8_r  z	rnumel = zRBLOCK: tl.constexpr = N)
ro  rf  r*  r+  r   r:  r   r   r   rD   )r  r#  r   reduction_treesr  	rn_blocksr  s          r   rR  %TritonKernel.codegen_reduction_numels  s    
 -1,<,<R,<D@Q@Q4,<RF"U_"UUV	$**V"4!567
 ((
(   1M%%dii0( 	 

 y)/

60B/CDE S

s   C,C,C1!C1c                    U R                  5       nU Vs/ s H  n[        R                  " U U 340 UD6PM      sn$ s  snf )z;
Helper to initialize symbols like rn_numel, rn_base, etc.
)r  r   r   )r  r@  r  rn_prefixesr   s        r   r  #TritonKernel._get_reduction_symbols  sA     113JUV+xx0;F;+VVVs   %=c                    U R                  5       nU R                  SSSS9n[        [        U5      S-
  5       Vs/ s H  n[	        X#S-   S 5      PM     sn[
        R                  " S5      /-   $ s  snf )z
Compute coefficients to convert ND reduction indices to linear indices.
For example:
  rindex = r0_index * r1_numel * ... * rn_numel + ... + rn_index.
r  Tr   rN   N)r  r  rm  r   rD   r   r   )r  r  	rn_numelsr  s       r   _get_reduction_index_coeffs(TritonKernel._get_reduction_index_coeffs  s~     113//PT/U	;@[AQTUAU;V
;VCM)!GI./;V
]]1 	 
s   A0c                8    U R                  5       n[        X!5      $ )z;
Compute linear reduction indices from N dimensional ones.
)r  rC   )r  
multi_indscoeffss      r   r  'TritonKernel._flatten_reduction_indices  s     113,,r   c                   U R                  SSSS9nU R                  SSSS9nU R                  U5      nUR                  SU R                  U5       35        U R                  U5      nUR                  SU R                  U5       35        g)zH
Generates code that converts ND reduction indices into linear indices.
r   Tr   r  z
roffset = z	rindex = N)r  r  r   r  )r  r#  
rn_offsetsrn_indsr}  rindexs         r   r  &TritonKernel.codegen_reduction_indices  s    
 00d 1 

 --gtQU-V 11*=
4#4#4W#=">?@009	$"3"3F";!<=>r   c                >   UR                   nUR                  (       a%  UR                  UR                   SU SU S35        OUR                  cD  UR                  UR                   SU R                  U5       35        UR                  U S35        OUR                  b  U SU R                  U5       3nOU R                  X S35      nU R                  (       d  UR                  5        S3OSnUR                  U SU R                  U5       S	U 3UR                   SU 3/5        U R                  U5      (       a!  UR                  U R                  U5      5        g US
:X  a  U R                  (       d%  UR                  U SUR                   SU S35        g g )Nr   z	offset + r  z
offset = 0r   r   rL  z	offset = r  rX  r  r  r  )r   r  r   r   rx  r  r   r  r  r   r  r  r,  create_constant_mask)r  r  rT  rX  r"  
block_sizes         r   r  ,TritonKernel.iteration_ranges_codegen_header  s    LL==NNejj\QCy4@A^^#NNejj\T-N-Nu-U,VWXNNaS
+,+Id&G&G&N%OP88#VM ,0+C+C1779+U#  OOc4#@#@#G"HJ<Xzzl#dV, ""5))NN444U;<s(t77NNaS

|3qc?@  8r   )rM  rO  rN  r  rP  rG  rI  r{  r   r#  rW  r9  rK  rc  r>  r  r=  rE  rJ  rC  rD  rB  rA  rz  rX  r$  rQ  )r   TNNF)
rY  zdict[str, sympy.Expr]r#  zOptional[FixedTritonConfig]rK  zOptional[int]r>  r   r   r   r/  r  r  r   r   r0  )r  r   r  z Optional[Union[str, tuple[str]]]rG  z!Optional[TMACompatibilityChecker])r   )r   r   r   r   rI  z/Union[BlockPtrOptions, TensorDescriptorOptions]r   ztuple[str, str])r   r   r  r   r   r   r   r   )rT  rV   )r   r   ru  rx  )r   r   r  r   r   )
r   r   r  r   r  rT   r  rK   r   r   r  )r  Optional[CSEVariable]NN)r  rT   r  z.tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]r  rT   r  r  r  r   r  z Optional[tuple[str, sympy.Expr]]r  r	  r   rT   )r   ztuple[str, BlockShapeType])r  rT   r  r  r   rT   )
r  r  r  r  r   rJ   r  +Union[CSEVariable, tuple[CSEVariable, ...]]r   r	  )r  r  r  )r   r   r  r   r  rT   )r  tuple[CSEVariable, ...]r  tuple[torch.dtype, ...]r   r   )r  r	  r&  zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]r  r	  r   r	  )
r  r	  r  r	  r  r   r  r   r   r	  )r   r}  )r  zOptional[float]r   rV   )r   z type[triton_heuristics.GridExpr]r$  )r   r   r`  zOptional[IRNode]r  r   )r   r  )r  rb   )r  rc   r   r   )r  rc   r  r	   r   r   )r  rc   r   r   )r   r   r   r  )r   rc   r   r   )r  r  r   r   )r#  rV   r   r   )r@  r   r   zlist[sympy.Symbol]r  )r  rY  r   r   )r  rc   rT  rV   r   r   )`r   r   r   r   r   r8  rv  r   r  r:  r  r  rk  r;  r  r)  re  r1  rk  rn  rr  rT  rV  rU  r  r  r  r  rI  r  r  r$  r  rE  rM  r-  rI  rP  rY  rv  rV  r  r  r  r  r  r  r  r  r5  r;  r  r  r  r  rY  r  rf  r  r  r  r  r  r  r  r#  r'  rC  r  r  r  r  ro  r  r  r  r_  r  r  r  r  r  r"  r,  r  r  r;   r  rR  r  r  r  r  r  r   r  r  s   @r   r  r  	  s    &I%%).E&.O$;!JN7N
 48'+ %39%39
 239 %39 39 
39 39j : :6 
 

"

#J*:0


 " " 8<GKH
H
 5	H
 $EH
^ f'f' f' B	f' 
f'PX:MM M 	M
 M4 *H8J
 :?-<1  "

:H
xv SWZZ *Z3>ZFOZ	ZxI
& 480499 C9 &	9
 $9 9 19 .9 
9v.
9
(
1<
	
(ii i &	i
 ;i 
5iV&DO&"
0F
P#0&!
	B>> > 	>@>S1>S;R>S	>S@"'"
" (" 
!"BD"'D" (D" 	D"
 D" 
!D"L$m%^4||

 , ,\AF
     (;TC$
- OS))/)GK),,2	&J:(:14:	:
0
/b.
" 
 
F$W 
 
-? A(A0>A	A Ar   r  c            
      ~  ^  \ rS rSr% \rS\S'   \" \R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                   /5      rSU 4S jjr\SS j5       rSS jrS rSSS jjr S   SS	 jjr        SS
 jr        SS jrS rSrU =r$ )TritonSchedulingi  z	type[Any]kernel_typec                   > [         TU ]  U5        Ub  [        US5      (       d  g UR                   H+  n[	        U[
        [        45      (       d  M   [        Ul        M-     g )Nr^  )	r  r  r   r^  r   r9   r7   debug_triton_codedebug_device_str)r  rC  r`  r  s      r   r  TritonScheduling.__init__.  sN    #GIw$?$?OOD$0B CDD(9% $r   c                    [         R                  R                  (       d  [         R                  R                  (       a'  [	        / U R
                  Q[        R                  P5      $ U R
                  $ r   )r"   r   cooperative_reductionsforce_cooperative_reductionsr   backend_featuresrQ   REDUCE_TO_SINGLE_ELEMENT)r   r	  s     r   get_backend_features%TritonScheduling.get_backend_features6  sR     MM00}}99P#&&P(O(OP  ###r   c                  ^
 [         R                  R                  n[        X5      u  pEU(       a  UR	                  U5        [
        R                  (       a  SSKJnJ	m
  [        U
4S jU 5       5      (       da  U Vs/ s H%  n[        Xv5      (       d  M  UR                  5       PM'     nnUR	                  UR                   SSR                  U5       35        U(       a  [        UU5      n	UR!                  X)5        g g s  snf )Nr   )r6   ForeachKernelSchedulerNodec              3  <   >#    U  H  n[        UT5      v   M     g 7fr   )r   )r   r  r	  s     r   r  3TritonScheduling.codegen_comment.<locals>.<genexpr>M  s      CPa
1899=s   z Fused node name list: r  )rL   rZ  r  r?   make_commentr"   debug_fusiontorch._inductor.schedulerr6   r	  rz   r   get_namecommentr*  r+   write_provenance_debug_handle)r  node_schedulerQ  r  origins_detailed_originsr6   r  
node_namesdebug_handler	  s             @r   codegen_comment TritonScheduling.codegen_commentA  s    ''&&%8%P"  )
  CP   +*!!7 !AJJL*  
 $$''>tyy?T>UV BL 11+L s   <C?C?c                r   [         R                  R                  nXR                  ;   a  UR                  U   nU$ [        R
                  R                  (       a$  [        U[        R
                  R                  5      OSn[        U5      S S nSR                  SXvUR                  5       /5      n[        R                  R                  (       a  [        R                  R                   SU 3nXTR                  U'   [        R
                  R                  (       a  UOSnUR                  [        [         R"                  5      U5      nUR                  [        [         R$                  5      U5      nUR                  SS5      n['        [)        UR+                  5       5      S5      u  pn[-        5       n[.        R1                  5       (       a  [.        R                  X5        UR3                  S	U< S
35        UR5                  USS9  [         R                  R7                  5       nUR3                  SUR8                   S35        SU 3n[;        X$5      u  nnUSU-   S-   U-   -  nUR=                  X\R?                  5       U5        [@        RB                  " S5      (       a  [@        RD                  " X[U5        U$ )Nr   r   r\  r   triton_z#pragma CMT#pyzasync_compile.triton(z, '''Tr\  z''', device_str='z')z# kernel path: r   kernel_metadata)#rL   rZ  r  src_to_kernelr"   r   descriptive_namesr>   rM   r*  next_kernel_suffixaot_inductormodel_name_for_generated_filesunique_kernel_namesr   r   rA   rq  r  r(   r'   r]  rV   async_compileuse_process_poolr   r   r  rF  r?   define_kernelr   r$   is_metric_table_enabledlog_kernel_metadata)r  src_coder!	  r   r  rQ  
fused_namekernel_category	subs_name	_basenamer\  kernel_pathcompile_wrapperr  metadata_commentr"	  detailed_originss                    r   r5	  TritonScheduling.define_kernelb  s]   ''&&,,,!//9Kt m ==22 &mV]]5T5TU 
 AJ2ANO((?8R8R8TUK ""AA "(!4!4!S!S TTUVaUbc /:!!(+'-}}'H'HiI
  ''K,H,H(I;WH''K,C,C(DiPH  ''s;H(08>>;K1Ld(S%I+,.O--// $$Y9%%(=i]%&PQ""84"8WW@@BN%%(9.:M:M9Nb&QR!0>(;M(S%G%w 58H HH!!5579I ../@AA++KhOr   c                    U R                  USS9n[        R                  " U5      nU R                  XB[	        S U 5       5      S9$ )zk
Benchmark fused list of nodes and return the execution time
in milliseconds on randomly generated inputs.
T)rd  c              3  @   #    U  H  oR                  5       v   M     g 7fr   r	  r   r  s     r   r  9TritonScheduling.benchmark_fused_nodes.<locals>.<genexpr>  s     :WQVA::<<QVr  )r$	  )generate_kernel_code_from_nodesr)   rV  benchmark_codegened_moduler   )r  r^  n_spills_thresholdr8	  r  s        r   benchmark_fused_nodes&TritonScheduling.benchmark_fused_nodes  sS    
 77PT7Ux(..
:WQV:W0W / 
 	
r   c           	     f  ^^
^^^^ [        [        R                  R                  5      n[	        5          UR                  [        R                  R                  5       5         SmU4S jmUU4S jnU4S jnUb  UO[        S/5      n[        R                  SUTR                  5        U" 5       mTb   TTR                  4sSSS5        sSSS5        $ TR                  5       m
TR                  mTR                  m T" TR                  " T
6 S   5        TR(                  n[+        U5      S
:X  d   eUS   R,                  U:  a  ['        S	5      mOx[        R                  R                  5       n	[.        R0                  " U
UU4S jU	S9m[+        TR2                  5      S:  a&  T[.        R0                  " U
U4S j[5        U	5      S9-
  m[        R                  SUT5        U" 5         TTR                  4sSSS5        sSSS5        $ ! [         as  n[         R"                  R$                  (       a  e [        R                  SUU5        ['        S	5      mU" 5         TTR                  4s SnAsSSS5        sSSS5        $ SnAff = f! , (       d  f       O= f SSS5        g! , (       d  f       g= f)z$Benchmark an already compiled moduleNc                    > T R                   c   e[        R                  R                  T R                   5      S   S-   $ Nr   z.kernel_perf__file__ospathsplitextr  s   r   cache_file_pathDTritonScheduling.benchmark_codegened_module.<locals>.cache_file_path  s6    ||///ww''5a8>IIr   c                 >   > T" 5       n [        U [        T5      5        g r   r*   r   )rR	  rU	  mss    r   store_cache@TritonScheduling.benchmark_codegened_module.<locals>.store_cache  s    &(T3r7+r   c                    > T" 5       n [         R                  R                  U 5      (       a.  [        U 5       n[	        UR                  5       5      sS S S 5        $ g ! , (       d  f       g = fr   )rQ	  rR	  existsopenr1  readrR	  fdrU	  s     r   
load_cache?TritonScheduling.benchmark_codegened_module.<locals>.load_cache  sJ    &(77>>$''dr$RWWY/ $ $s   A
A*rM  %kernel src code for %s written to: %sr   z*Exception (%s) in compiling fused nodes %sinfrN   c                 4   > T" TR                   " T 6 S   5      $ r  
clone_argsr  callwrapped_jit_functions   r   r  =TritonScheduling.benchmark_codegened_module.<locals>.<lambda>      D!5!@!@$!G!JKr   r	  c                 "   > TR                   " T 6 $ r   rg	  r  rk	  s   r   r  rl	    s     4 ? ? Fr   z+The fused kernel for %s took %.3f ms to run)r   rL   rZ  r  r   r	  r  r   r  r  rP	  get_argsrj	  r)	  rh	  	Exceptionr"   r   .disallow_failing_autotune_kernels_TESTING_ONLYr1  	launchersr   n_spillsr.   	benchmarkrR  r   )r  r  rI	  r$	  device_interfacerZ	  rb	  r'  rt	  r	  r  rU	  rj	  rY	  rk	  s    `        @@@@@r   rH	  +TritonScheduling.benchmark_codegened_module  sK    4AGG4G4GH ##AGG$G$G$IJBJ, )4
*i[:Q  II7
 B~3<<'; KJ ! @ <<>D88D#&;; ()44d;A>? -66Iy>Q&&& |$$'995\<<> !**K! +==>Bk33F"6{ B
 II=
 Ms||#a KJ ! L  
(==OO		@
 5\3<<''_ KJ ! L
(K KJJ !  sh   .J""AJ	J"(J=HCJ4	J"
JAI?'J(J,	J"?JJ
J	J""
J0c                   UR                  S5      nU=(       a     [        S UR                  5        5       5      nU R                  nU(       a  SSKJn  UnU(       a  SUS'   UR                  S5      (       a
  SUS	'   SUS'   [        R                  UR                  5      (       d  UR                  S	5      (       a   eSUS	'   [        R                  R                  XaX#5      nU" U0 UD6nU R                  XU5      $ )
Nr  c              3  @   #    U  H  oR                  5       v   M     g 7fr   )is_split_scan)r   r`  s     r   r  9TritonScheduling.create_kernel_choices.<locals>.<genexpr>  s      (
-NT  -Nr  rN   )TritonSplitScanKernelFoverride_cooperative_reductionr  Toverride_persistent_reduction)contains_oprz   scheduler_nodesr	  triton_split_scanr}	  r  r  r}  r  rL   rq  triton_kernel_kwargsadd_multi_kernel_choices)	r  kernel_featureskernel_argskernel_kwargsis_scanr{	  r	  r}	  r   s	            r   create_kernel_choices&TritonScheduling.create_kernel_choices  s    "--f5 
C (
-<-L-L-N(
 %
 +/*:*:@/K>CM:; &&v..=AM9:>CM:;11/2Q2QRR$(()HIIII=BM9:		66+
 k;];,,V-PPr   c           	     $   U/n[         R                  R                  (       d  U$ UR                  =(       a    UR	                  S5      (       + nUR
                  =(       a    UR	                  S5      (       + nU(       a%  UR                  U R                  " U0 UDSS0D65        U(       a  UR                  R                  n[        R                  R                  R                  US5      (       ae  UR                  U R                  " U0 UDSS0D6=n5        U(       a7  UR                  (       a&  UR                  U R                  " U0 UDSSS.D65        [        U5      S:  a-  USS   H  n	UR                  U	l        M     UR!                  S S9  U$ )	Nr	  r~	  Fi   )r~	  r	  rN   c                    U R                   $ r   )r  )r/  s    r   r  ;TritonScheduling.add_multi_kernel_choices.<locals>.<lambda>\  s
    q'='=r   r  )r"   r   multi_kernelr  r  rS  r  r	  re  r}  rL   rZ  rR  r|  r   must_keep_buffersr  )
r  r   r	  r	  kernelsoptional_persistentoptional_cooperativer  rk  kernel2s
             r   r	  )TritonScheduling.add_multi_kernel_choices*  s    (.h}}))N$99 
-BSBS+C
 ?
  &;;  
MDUDU,E
 A
 NN   # 38  __44Fww44VUCC!--$' 8= E '5+E+ENN(((+ <A:?	 w<!"12;,2,D,D) ' LL=L>r   c                *  ^^^^^^^ U4S jmU4S jnUUU4S jnS/ pTSn[         R                  R                  n[        U5      [         R                  l        [         R                  R                  n[        U5      [         R                  l        [
        R                  S:  n	[
        R                  S:  n
U R                  USU	U
SS9nU GH  u  pnU Vs/ s H  oR                  5       PM     nnU VVs/ s H  nU  H  nUR                  5       PM     M     nnnUR                  [        [        R                  5      S5      n[        R                   " U5      m["        R%                  S	UTR&                  5        U" 5       u  mmTb'  UT-  nUT-  nUR)                  TR&                  5        M  TR+                  5       mTR,                  mTR.                  mT" TR0                  " T6 S   5        TR2                  n[5        U5      S
:X  d   eUS   R6                  S:  a  [9        S5      =mmOS[         R                  R;                  5       n[<        R>                  " UUU4S jUS9m[<        R>                  " UU4S jUS9m["        R%                  S[        S U 5       5      TT5        U" 5         UT-  nUT-  nUR)                  TR&                  5        GM     U[         R                  l        U[         R                  l        XFU4$ s  snf s  snnf )Nc                    > T R                   c   e[        R                  R                  T R                   5      S   S-   $ rN	  rO	  rT	  s   r   rU	  @TritonScheduling.benchmark_combo_kernel.<locals>.cache_file_pathd  s6    <<+++77##CLL1!4~EEr   c                   > T" 5       n [         R                  R                  U 5      (       aC  [        U 5       n[	        S UR                  5       R                  5        5       5      sS S S 5        $ g! , (       d  f       g= f)Nc              3  8   #    U  H  n[        U5      v   M     g 7fr   )r1  )r   r'  s     r   r  NTritonScheduling.benchmark_combo_kernel.<locals>.load_cache.<locals>.<genexpr>l  s      E3Daq3Drj  r	  )rQ	  rR	  r]	  r^	  r   r_	  splitr`	  s     r   rb	  ;TritonScheduling.benchmark_combo_kernel.<locals>.load_cacheh  sW    "$Dww~~d##$Z2  E2779??3D EE  Z  Zs   .A11
A?c                 \   > T" 5       n [        U [        T5      S-   [        T5      -   5        g )NrD  rX	  )rR	  rU	  rY	  ms_clones    r   rZ	  <TritonScheduling.benchmark_combo_kernel.<locals>.store_cacheo  s&    "$Ds2w}s8}<=r   r   g        T)subkernel_nodescustom_part_algorithmenable_autotunemixed_sizesonly_gen_src_coder)	  rd	  rN   re	  c                 4   > T" TR                   " T 6 S   5      $ r  rg	  ri	  s   r   r  9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>  rm	  r   rn	  c                 (   > TR                   " T 6 S   $ r  rg	  rp	  s   r   r  r	    s    0;;TB1Er   zDThe fused kernel for %s took %.3f ms to run, %.3f ms to clone inputsc              3  @   #    U  H  oR                  5       v   M     g 7fr   rD	  rE	  s     r   r  :TritonScheduling.benchmark_combo_kernel.<locals>.<genexpr>  s     <A::<<r  ) rL   rZ  rh  r   inplaced_to_remover"   combo_kernels_autotunecombo_kernel_allow_mixed_sizesgenerate_combo_kernel_code	get_nodesr	  r   r   rA   r  r)   rV  r  r  rP	  r  rq	  rj	  r)	  rh	  rt	  r   ru	  r1  r  r.   rv	  )r  	node_listrb	  rZ	  total_ms	file_listtotal_clone_msremoved_buffers_originplaced_to_remove_origr	  r	  kernel_code_listr8	  r\  
node_groupr`  fused_node_listsr^  r  namesrt	  r	  r  rU	  rj	  r  rY	  r	  rk	  s                         @@@@@@@r   benchmark_combo_kernel'TritonScheduling.benchmark_combo_kernel_  s   
	F	 	>  ) # ww66",-A"B"#''"<"<%/0G%H" 77!;;;a?::%"&+#" ; 
 (8#H=GHZT 0ZH/?O/?eAQZZ\\/?EO''K,C,C(DiPH""8,CII7
 &<LB~B(*  .<<>D88D#&;;  %00$7:;,66Iy>Q&&&|$$q( %e,X<<> !**K! '00E!
 IIV<<<	 MNHh&NS\\*k (8l #7%<"22o  IOs   L
7$Lr   )rC  zOptional[Scheduler]r   r   )r	  ztorch.devicer   )rx   )r   tuple[float, str])rx   N)r$	  zOptional[OrderedSet[str]]r   r	  )r	  rs   r	  	list[Any]r	  rx  r   list[TritonKernel])r   r  r	  r	  r	  rx  r   r	  )r   r   r   r   r  r	  r   r   rQ   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANSORTTRITON_TEMPLATESTUPLE_REDUCTIONr	  r  r   r	  r&	  r5	  rJ	  rH	  r	  r	  r	  r   r  r  s   @r   r
	  r
	    s   )K)!""$$**44++**		
: $ $MB=~	
 RVW$5NW$	W$r#Q+#Q #Q &	#Q
 
#QJ33 3 &	3
 
3j]3 ]3r   r
	  c                4   / nU R                  5       nUb!  [        U[        R                  5      (       d   eU(       a1  UR                  c$  UR                  U R                  5        S35        U$ SSKJn  U R                  5       nUc   eU R                  R                  U5      n[        U[        U45      (       d   S[        U5       35       e[        R                  R!                  U5         ["        R$                  nUR'                  U R)                  5       5      R+                  5       nU["        l        S S S 5        UR                  U R                  5        S35        UR                  [,        R.                  " WS5      5        U$ ! , (       d  f       NX= f)Nz" Unfinalized multi template bufferr   )CUDACombinedSchedulingz]Scheduling backend should be SIMD or CUDACombined when generating debug Triton strings, got: z Triton code:z    )get_template_noder   r#   MultiTemplateBuffermake_kernel_renderr  r	  0torch._inductor.codegen.cuda_combined_schedulingr	  r  rC  get_backendrf   rF  rL   rZ  set_current_devicer$   generated_kernel_countrG	  r	  r]  r   r  )r`  linesmulti_templater	  r	  backendold_generated_kernel_counttriton_codes           r   r	  r	    s_   E++-N!Z@V@V%W%WWW.;;C((JKL2 L/	
 "!!!..,,V4'N4J#KLL 	
klpqxlykz{	
L WW''/ *1)G)G&!AA eg  .HG* 0 	(67X__[&9:L 0/s   .A	F		
F)r   r  r0  )r  r   r  r9  r  r9  r   r   r	  )r  r  r   r  )r  r  r   r   )r   zUnion[CSEVariable, Any]r   r   )r   ro   r  )r   r   r   zCallable[[_T], _T])r`  r6   r   r}  )
__future__r   r  r  r  r;  r  loggingr&  r  rQ	  r   abcr   collections.abcr   r   r   r   typingr	   r
   r   r   r   r   r   sympy.printing.precedencer   r   torch._loggingtorch.utils._pytreer%   _pytreer  torch._dynamo.device_interfacer   torch._dynamo.utilsr   r   torch._prims_commonr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._tritonr   r   r   utils._sympy.symbolr   r   r   r   utils._sympy.value_rangesr    r   r"   r#   r$   r3	  r&   	codecacher'   r(   r)   r*   r  r+   ops_handlerr,   runtimer-   runtime.benchmarkingr.   runtime.hintsr/   r0   r1   r2   r3   runtime.runtime_utilsr4   r5   rC  r6   r7   r8   r9   shape_propagationr:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   virtualizedrI   r  rJ   rK   rL   wrapper_benchmarkrM   block_analysisrO   commonrP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   simdr`   ra   rb   rc   rd   re   rf   triton_utilsrg   rh   ri   rj   rk   r  rl   typesrm   r  ro   %torch.fx.experimental.symbolic_shapesrp   rq   rr   simd_kernel_featuresrs   rt   	getLoggerr   r  _logginggetArtifactLoggerperf_hint_logschedule_log
fusion_logrw  r   r   r   r   r   r   r  r  r3  r  r  r  r  r6  r  r  r  r  r  r  r  r  r  r  r  _initialize_pointwise_overridesr8  r{  r4  r  r  r   r   r  r  r  r
	  r	  r   r   r   <module>r	     s   "         	   8 8  E E  0   $ $ C < 0 / K K  Y X 4 ) ) ( F F ; ( ' .  D W W 5     C B B /    $    %  L>&8	B!00<H~~//*E^^--hA
 #$ ,	6 6  4 $ 4% %PQ, Q,h 
 
 
B ~ ~ ~B
 ?4 ? ?8 =, = =@++/+<P++>pQM pQf 	3
&8
;P 8(.bx
&k x
&v  / / 9H"O H"V$+ $+N N
 N
 N
b! !H # # #
%uS%S/-A'BBC 
 j j jZg5A:/0 g5ATk]3~ ]3@r   