
    ȅi@                    t   S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKJr  S SK	J
r
  S SKJrJrJrJrJr  S SKrS SKrS SKrSSKJr  SSKJr  SS	KJrJr  SS
KJrJr  SSKJr  SSK J!r!J"r"  SSK#J$r$  SSK"J%r%  SSK&J'r'J(r(  SSK)J*r*  SSK+J,r,J-r-  SSK.J/r/J0r0J1r1J2r2J3r3  SSK4J5r6J7r7  SSK8J9r9J:r:J;r;J<r<J=r=J>r>J?r?J@r@JArA  SSKBJCrC  SSKDJErE  SSKFJGrGJHrHJIrI  \(       a  S SKJJKrKJLrL  SSK)JMrMJNrN  SSKOJPrP  \R                  " \R5      rSS rT " S S\U5      rV " S  S!\?5      rW\W" 5       R                  rY\?" 5       R                  rZ\R                  S"\R                  S#\R                  S$\R                  S%\R                  S&\R                  S'\R                  S(\R                  S)\R                  S*\R                  S+\R                  S,\R                  S-\R                  S.0rhS/ riS0 rj " S1 S2\>5      rk\kR                  S35         " S4 S5\:5      rm\R                   " S6 S75      5       roS8 rpS9 rq " S: S;\H5      rr " S< S=\I5      rsg)>    )annotationsN)defaultdict)inf)AnycastOptionalTYPE_CHECKINGUnion   )is_integer_dtype)
OrderedSet)FloorDivModularIndexing)symbol_is_typeSymT)ValueRanges   )configir)HalideCodeCache)get_reduction_combine_fn)is_metric_table_enabledlog_kernel_metadata)AddParenHandler)HalideInputSpec
HalideMeta)get_bounds_index_exprget_kernel_metadataparallel_num_threadssympy_index_symbol
sympy_subs)_opsV   )	BackendFeatureCSEVariableDeferredLineIndentedBufferKernelArgTypeOpOverridesPythonPrinterSizeArg	TensorArg)DTYPE_TO_CPP)cexpr)constant_repr
SIMDKernelSIMDScheduling)CallableSequence)ReductionType	StoreMode)BlockShapeTypec                >   [        U [        5      (       aZ  SU s=::  a  S::  dM  O  [        R                  " [        R                  5      nXR
                  :X  a  gXR                  :X  a  gSU < S3$ [        U [        5      (       a  S[        U 5       S3$ [        U 5      $ )Ni   izhl.Int(64).min()zhl.Int(64).max()zhl.i64()zhl.f64()

isinstanceinttorchiinfoint64minmaxfloatr0   repr)valinfos     X/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/codegen/halide.pyhalide_constantrF   >   s    #s[C%E:%E{{5;;'((?%((?%q!!#us+,A..9    c                  ,   ^  \ rS rSrSU 4S jjrSrU =r$ )UnsupportedK   c                *   > [         TU ]  SU 35        g )Nz!halide backend does not support: )super__init__)selfthing	__class__s     rE   rM   Unsupported.__init__L   s    <UGDErG    returnNone)__name__
__module____qualname____firstlineno__rM   __static_attributes____classcell__rP   s   @rE   rI   rI   K   s    F FrG   rI   c                     ^  \ rS rSr\S 5       r\S 5       rS rS rS r	\	r
S r\rS rS	 rS
 rS rS rS rS rS rS rS rS rS rS rS rS rS rU 4S jrS r\rS r S r!Sr"U =r#$ )HalidePrinterP   c                D    S[         R                  R                   SU  S3$ )Nhl.cast(, r9   )r#   kernelindex_dtypeexprs    rE   
cast_indexHalidePrinter.cast_indexQ   s"    !((../r$q99rG   c                    SU  S3$ )Nhl.cast(hl.Float(32), r9   rR   re   s    rE   
cast_floatHalidePrinter.cast_floatU   s    'vQ//rG   c                    SU S3$ )Nhl.f32(r9   rR   rN   rf   s     rE   _print_FloatHalidePrinter._print_FloatY   s    a  rG   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr$   rn   r   r9   lenargs_printro   s     rE   _print_ToFloatHalidePrinter._print_ToFloat\   9    499~"""TYYq\23155rG   c                    [        UR                  5      S:X  d   eU R                  SU R                  UR                  S   5       S35      $ )Nr$   	hl.floor(r   r9   rt   ru   rg   rv   ro   s     rE   _print_floorHalidePrinter._print_floor`   B    499~"""4;;tyy|+D*EQGHHrG   c                    [        UR                  5      S:X  d   eU R                  SU R                  UR                  S   5       S35      $ )Nr$   	hl.trunc(r   r9   r|   ro   s     rE   _print_TruncHalidePrinter._print_Truncf   r   rG   c                    [        UR                  5      S:X  d   eU R                  SU R                  UR                  S   5       S35      $ )Nr$   hl.ceil(r   r9   r|   ro   s     rE   _print_ceilingHalidePrinter._print_ceilingl   sB    499~"""$++diil*C)DAFGGrG   c                J    SU R                  U R                  U5      5       S3$ Nzhl.sqrt(r9   )rk   rv   ro   s     rE   _helper_sqrtHalidePrinter._helper_sqrtp   s$    $//$++d*;<=Q??rG   c                    U R                  UR                  S   5      nU R                  UR                  S   5      nU R                  UR                  S   5      nSU SU SU S3$ )Nr   r$   r   
hl.select(rb   r9   )doprintru   )rN   rf   cpqs        rE   _print_WhereHalidePrinter._print_Wheres   s_    LL1&LL1&LL1&A3b2aS**rG   c                n   [        UR                  5      S:X  a  U R                  UR                  S   5      $ [        UR                  5      S-  nU R                  [        R                  " UR                  S U 6 5      nU R                  [        R                  " UR                  US  6 5      nSU SU S3$ )Nr$   r   r   hl.min(rb   r9   )rt   ru   rv   sympyMinrN   rf   midabs        rE   
_print_MinHalidePrinter._print_Miny   s    tyy>Q;;tyy|,,$))n!KK		499Tc?34KK		499ST?342aS""rG   c                n   [        UR                  5      S:X  a  U R                  UR                  S   5      $ [        UR                  5      S-  nU R                  [        R                  " UR                  S U 6 5      nU R                  [        R                  " UR                  US  6 5      nSU SU S3$ )Nr$   r   r   hl.max(rb   r9   )rt   ru   rv   r   Maxr   s        rE   
_print_MaxHalidePrinter._print_Max   s    tyy>Q;;tyy|,,$))n!KK		499Tc?34KK		499ST?342aS""rG   c                    [        UR                  5      S:X  d   eU R                  SU R                  UR                  S   5       S35      $ )Nr$   hl.abs(r   r9   r|   ro   s     rE   
_print_AbsHalidePrinter._print_Abs   sB    499~"""TYYq\)B(C1EFFrG   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr$   hl.cos(r   r9   rs   ro   s     rE   _print_OpaqueUnaryFn_cos&HalidePrinter._print_OpaqueUnaryFn_cos   ry   rG   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr$   hl.cosh(r   r9   rs   ro   s     rE   _print_OpaqueUnaryFn_cosh'HalidePrinter._print_OpaqueUnaryFn_cosh   9    499~"""$++diil34A66rG   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr$   hl.acos(r   r9   rs   ro   s     rE   _print_OpaqueUnaryFn_acos'HalidePrinter._print_OpaqueUnaryFn_acos   r   rG   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr$   hl.sin(r   r9   rs   ro   s     rE   _print_OpaqueUnaryFn_sin&HalidePrinter._print_OpaqueUnaryFn_sin   ry   rG   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr$   hl.sinh(r   r9   rs   ro   s     rE   _print_OpaqueUnaryFn_sinh'HalidePrinter._print_OpaqueUnaryFn_sinh   r   rG   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr$   hl.asin(r   r9   rs   ro   s     rE   _print_OpaqueUnaryFn_asin'HalidePrinter._print_OpaqueUnaryFn_asin   r   rG   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr$   hl.tan(r   r9   rs   ro   s     rE   _print_OpaqueUnaryFn_tan&HalidePrinter._print_OpaqueUnaryFn_tan   ry   rG   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr$   hl.tanh(r   r9   rs   ro   s     rE   _print_OpaqueUnaryFn_tanh'HalidePrinter._print_OpaqueUnaryFn_tanh   r   rG   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr$   hl.atan(r   r9   rs   ro   s     rE   _print_OpaqueUnaryFn_atan'HalidePrinter._print_OpaqueUnaryFn_atan   r   rG   c                    [        S5      eNlog2NotImplementedErrorro   s     rE   _print_OpaqueUnaryFn_log2'HalidePrinter._print_OpaqueUnaryFn_log2   s    !&))rG   c                  > UR                   (       a  [        TU ]	  U5      $ UR                  u  p#U R	                  U R                  U5      5      nU R	                  U R                  U5      5      nU R                  SU SU S35      $ )Nr{   z / r9   )
is_integerrL   _print_FloorDivru   rk   r   rg   )rN   rf   xdivrP   s       rE   r   HalidePrinter._print_FloorDiv   sp    ??7*400OODLLO,oodll3/01#SQ788rG   c                    [        UR                  5      S:X  d   eU R                  SU R                  UR                  S   5       S35      $ )Nr$   	hl.round(r   r9   r|   ro   s     rE   _print_RoundHalidePrinter._print_Round   r   rG   c                0    UR                   u  p#SU SU S3$ )N() / (z+hl.f32(0)))ru   )rN   rf   r   r   s       rE   _print_IntTrueDivHalidePrinter._print_IntTrueDiv   s"    yy1#U1#[))rG   c                    UR                   u  p#U R                  U5      n[        U5      nSSU* -  < SU SSU-  < S3$ )Nrn   g      $@z)*hl.round((z	)*hl.f32()))ru   rv   r;   )rN   rf   rC   ns       rE   _print_RoundDecimal!HalidePrinter._print_RoundDecimal   sJ    kk#F1"(SE47+RPPrG   rR   )$rV   rW   rX   rY   staticmethodrg   rk   rp   rw   r}   _print_FloorToIntr   _print_TruncToIntr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _print_RoundToIntr   r   rZ   r[   r\   s   @rE   r^   r^   P   s    : : 0 0!6I %I %H@+##G677677677*9I %*
Q QrG   r^   z	hl.Bool()zhl.BFloat(16)zhl.Float(16)zhl.Float(32)zhl.Float(64)z	hl.Int(8)z
hl.Int(16)z
hl.Int(32)z
hl.Int(64)z
hl.UInt(8)zhl.UInt(16)zhl.UInt(32)zhl.UInt(64)c                    [         U    $ N)_halide_typedtypes    rE   halide_typer      s    rG   c                
   [        U 5      (       a5  U R                  (       a$  U [        R                  :w  a  [        R                  n U [        R
                  [        R                  4;   a  [        R                  n [        U 5      $ r   )	r   	is_signedr<   r>   int32float16bfloat16float32r   r   s    rE   halide_acc_typer      sP    5??u7K//urG   c                  v   \ rS rSr\  SG   SHS jj5       r\SIS j5       r\S 5       r\S 5       r	\S 5       r
\S 5       r\S	 5       r\S
 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r \S 5       r!\S 5       r"\S  5       r#\S! 5       r$\S" 5       r%\S# 5       r&\S$ 5       r'\S% 5       r(\S& 5       r)\S' 5       r*\S( 5       r+\S) 5       r,\S* 5       r-\S+ 5       r.\S, 5       r/\S- 5       r0\S. 5       r1\S/ 5       r2\S0 5       r3\S1 5       r4\S2 5       r5\S3 5       r6\S4 5       r7\S5 5       r8\S6 5       r9\S7 5       r:\S8 5       r;\S9 5       r<\S: 5       r=\S; 5       r>\S< 5       r?\S= 5       r@\S> 5       rA\S? 5       rB\SJS@ j5       rC\SA 5       rD\SB 5       rE\SC 5       rF\SD 5       rG\          SKSE j5       rHSFrIg)LHalideOverrides   Nc                Z    U[         R                  :X  a  SU  S3$ S[        U5       SU  S3$ )Nr   z != 0)ra   rb   r9   )r<   boolr   )r   r   	src_dtypeuse_compute_typess       rE   to_dtypeHalideOverrides.to_dtype   s9     EJJqc= +e,-Rs!44rG   c                    U[         R                  [         R                  4;   a  S[        U5       SU  S3n S[        U5       SU  S3nU[         R                  [         R                  4;   a  SU S3nU$ )Nra   rb   r9   zhl.reinterpret(rj   )r<   r   r   r   )r   r   r   lines       rE   to_dtype_bitcast HalideOverrides.to_dtype_bitcast   ss    77;y12"QCq9A U!3 4Bqc;U]]ENN33+D63DrG   c                8    U R                  [        U5      U5      $ r   )r   rF   )clsvaluer   s      rE   constantHalideOverrides.constant  s    ||OE2E::rG   c                    SU  S3$ )Nr   r9   rR   r   s    rE   absHalideOverrides.abs      1~rG   c                \    [        U S5      (       d  SU  S3$ SU  SU R                   SU  S3$ )Nnamezhl.exp(r9   z"hl.fast_exp(hl.cast(hl.Float(32), z)) if z!.type().bits() <= 32 else hl.exp(hasattrr  r  s    rE   expHalideOverrides.exp  s@    q&!!QCq>!3A3fQVVHDefgehhijjrG   c                    SU  S3$ r   rR   r  s    rE   sqrtHalideOverrides.sqrt      !ArG   c                    [        U S5      (       d	  SU  SU S3$ SU R                   SU S3nSU  SU S	U  S
U  SU SU R                   SU  SU S3$ )Nr  r   rb   r9   ra   	.type(), hl.select((<)|hl.is_nan(), ) if z.type().is_float() else hl.min(r  r   r   s     rE   minimumHalideOverrides.minimum       q&!!QCr!A&&qvvhis!,QCq<s#aS1#U166(JijkillnopnqqrssrG   c                    [        U S5      (       d	  SU  SU S3$ SU R                   SU S3nSU  SU S	U  S
U  SU SU R                   SU  SU S3$ )Nr  r   rb   r9   ra   r  r  >r  r  r  z.type().is_float() else hl.max(r  r   s     rE   maximumHalideOverrides.maximum"  r#  rG   c                b    [        US5      (       a  SUR                   SU S3nSU  SU SU S3$ )Nr  ra   r  r9   r   rb   r  )r   r   r   s      rE   whereHalideOverrides.where*  sB    1f166()A3a0AA3b2aS**rG   c                    SU  S3$ )Nr   r9   rR   r  s    rE   cosHalideOverrides.cos0  r  rG   c                    SU  S3$ )Nr   r9   rR   r  s    rE   sinHalideOverrides.sin4  r  rG   c                    [        S5      e)NlgammarI   r  s    rE   r2  HalideOverrides.lgamma8      (##rG   c                    SU  S3$ )Nzhl.erf(r9   rR   r  s    rE   erfHalideOverrides.erf<  r  rG   c                    SU  S3$ )Nr   r9   rR   r  s    rE   coshHalideOverrides.cosh@  r  rG   c                    SU  S3$ )Nr   r9   rR   r  s    rE   sinhHalideOverrides.sinhD  r  rG   c                    SU  S3$ )Nr   r9   rR   r  s    rE   acosHalideOverrides.acosH  r  rG   c                    SU  S3$ )Nz	hl.acosh(r9   rR   r  s    rE   acoshHalideOverrides.acoshL      1#QrG   c                    SU  S3$ )Nr   r9   rR   r  s    rE   asinHalideOverrides.asinP  r  rG   c                    SU  S3$ )Nz	hl.asinh(r9   rR   r  s    rE   asinhHalideOverrides.asinhT  rE  rG   c                    SU  SU S3$ )Nz	hl.atan2(rb   r9   rR   r   ys     rE   atan2HalideOverrides.atan2X      1#Rs!$$rG   c                    SU  S3$ )Nr   r9   rR   r  s    rE   atanHalideOverrides.atan\  r  rG   c                    SU  S3$ )Nz	hl.atanh(r9   rR   r  s    rE   atanhHalideOverrides.atanh`  rE  rG   c                    [        S5      e)Ncopysignr3  rM  s     rE   rY  HalideOverrides.copysignd  s    *%%rG   c                    [        S5      e)Nerfinvr3  r  s    rE   r\  HalideOverrides.erfinvh  r5  rG   c                    SU  SU S3$ )Nz	hl.hypot(rb   r9   rR   rM  s     rE   hypotHalideOverrides.hypotl  rQ  rG   c                    [        S5      e)N	nextafterr3  rM  s     rE   rb  HalideOverrides.nextafterp  s    +&&rG   c                    U  SU 3$ Nz & rR   r   s     rE   logical_andHalideOverrides.logical_andt      Cs|rG   c                    U  S3$ )Nz == 0rR   r   s    rE   logical_notHalideOverrides.logical_notx  s    E{rG   c                    U  SU 3$ Nz | rR   r   s     rE   
logical_orHalideOverrides.logical_or|  rh  rG   c                    SU  SU S3$ )Nr    ^ r9   rR   r   s     rE   logical_xorHalideOverrides.logical_xor  s    1#S1~rG   c                    U  SU 3$ re  rR   r   s     rE   bitwise_andHalideOverrides.bitwise_and  rh  rG   c                    SU  3$ )N~rR   rj  s    rE   bitwise_notHalideOverrides.bitwise_not  s    1#wrG   c                    U  SU 3$ rn  rR   r   s     rE   
bitwise_orHalideOverrides.bitwise_or  rh  rG   c                    U  SU 3$ )Nrr  rR   r   s     rE   bitwise_xorHalideOverrides.bitwise_xor  rh  rG   c                    U  SU 3$ )Nz << rR   r   s     rE   bitwise_left_shift"HalideOverrides.bitwise_left_shift      D}rG   c                    U  SU 3$ )Nz >> rR   r   s     rE   bitwise_right_shift#HalideOverrides.bitwise_right_shift  r  rG   c                    SU  SU S3$ )Nzhalide_helpers.rand(rb   r9   rR   seedoffsets     rE   randHalideOverrides.rand  s    %dV2fXQ77rG   c                    SU  SU S3$ )Nzhalide_helpers.randn(rb   r9   rR   r  s     rE   randnHalideOverrides.randn  s    &tfBvha88rG   c           	          SU  SU SU SU S3	$ )Nzhalide_helpers.randint64(rb   r9   rR   )r  r  lowhighs       rE   	randint64HalideOverrides.randint64  s#    *4&6("SED6KKrG   c                    [         R                  " U S5       S[        R                  R                  R                  SU5       3$ )Nr    + load_seed_offset)opsloadr#   rc   ru   seed_offset)r  r  s     rE   	load_seedHalideOverrides.load_seed  s7    ((4#$C(A(ABTV\(]'^__rG   c                    SU  S3$ )Nz1./hl.sqrt(r9   rR   r  s    rE   rsqrtHalideOverrides.rsqrt  s     QCq!!rG   c                    SU  S3$ )Nr   r9   rR   r  s    rE   tanHalideOverrides.tan  r  rG   c                    SU  S3$ )Nr   r9   rR   r  s    rE   tanhHalideOverrides.tanh  r  rG   c                    SU  S3$ )Nz3(hl.reinterpret(hl.UInt(32), hl.cast(hl.Float(32), z)) >> 31) != 0rR   r  s    rE   signbitHalideOverrides.signbit  s    DQC~VVrG   c                    U  SU  SU SU 3$ )Nz - hl.trunc(/z)*rR   r   s     rE   fmodHalideOverrides.fmod  s!     L1QCr!--rG   c                    SU  SU S3$ )Nzhl.pow(rb   r9   rR   r   s     rE   powHalideOverrides.pow  s    2aS""rG   c                    [        S5      e)Nldexpr3  )r   r   s     rE   r  HalideOverrides.ldexp  s    '""rG   c                    SU  S3$ )Nzhl.log(r9   rR   r  s    rE   logHalideOverrides.log  r  rG   c                    [        S5      er   r   r  s    rE   r   HalideOverrides.log2  s    !&))rG   c                    SU  S3$ )Nz hl.is_inf(hl.cast(hl.Float(32), r   rR   r  s    rE   isinfHalideOverrides.isinf       2!B77rG   c                    SU  S3$ )Nz hl.is_nan(hl.cast(hl.Float(32), r   rR   r  s    rE   isnanHalideOverrides.isnan  r  rG   c                    SU  S3$ )Nr   r9   rR   r  s    rE   roundHalideOverrides.round  rE  rG   c                    SU  S3$ )Nr{   r9   rR   r  s    rE   floorHalideOverrides.floor  rE  rG   c                    SU  SU S3$ )Nr   r   z + hl.f32(0))rR   r   s     rE   int_truedivHalideOverrides.int_truediv  s    1#U1#]++rG   c                .    SU R                    SU  SU S3$ )Nz"hl.floor(hl.cast(hl.Float(max(32, .type().bits())), ) / r9   r  r   s     rE   floordivHalideOverrides.floordiv  s)     18J1#TRSQTTUV	
rG   c                <   [         R                  " [         R                  " SU5      [        R                  5      n[         R                  " [         R                  " US5      [        R                  5      n[         R
                  " X#5      nSUR                   SU S3$ )N0ra   r  r9   )r  r   ltr<   int8subr  )r  r   leftrightr  s        rE   signHalideOverrides.sign  se    ||CFF3NEJJ7SVVAs^UZZ8ggd"!&&3%q11rG   c                    SU  S3$ )Nr   r9   rR   r  s    rE   truncHalideOverrides.trunc  rE  rG   c                .    SU R                    SU  SU S3$ )Nz"hl.trunc(hl.cast(hl.Float(max(32, r  r  r9   r  r   s     rE   truncdivHalideOverrides.truncdiv  s)    
 18J1#TRSQTTUV	
rG   c                    SU  S3$ )Nr   r9   rR   r  s    rE   ceilHalideOverrides.ceil  r  rG   c                    SU  S3$ )Nr   z, 0)rR   r  s    rE   reluHalideOverrides.relu  s    4  rG   c                |   [         R                  R                  U5      n[         R                  R                  [         R                  R	                  U5      [         R                  R                  U5      [        U5      S9nU[        R                  [        R                  4;  a  [        R                  " XB5      $ U$ )N)bounds)r#   rc   prepare_indexinggenfuncindex_to_strused_dims_from_indexr   r<   r   r>   r  r   )r  rf   r   indexvars        rE   
index_exprHalideOverrides.index_expr
  s    ))$/hhHH!!%(HH))%0(.  

 ekk22<<++
rG   c                    [         R                  " U[        R                  5      n[         R                  " XU5      nX!l        [        [        U5      5      $ r   )r  r   r<   r   halide_clampindirect_indexing_sizer    str)r  	index_varsizecheckwrap_negs        rE   indirect_indexing!HalideOverrides.indirect_indexing  s?     LLEKK8	$$Ye<	+/(!#i.11rG   c                   [         R                  R                  [         R                  R                  U5      S-
  5      n[	        U[
        [        R                  45      (       d  SUR                   SU S3nSU SU S3$ )Nr$   ra   r  r9   z	hl.clamp(z, 0, )	r#   rc   kexprrename_indexingr:   r;   r   Integerr  )r  r  r  r  ends        rE   r  HalideOverrides.halide_clamp  sm    hhnnQXX55d;a?@$emm 455UZZL	#a8C 5'se1--rG   c                   [         R                  R                  X5       nU" 5       nS S S 5        WR                  R                  (       a  [        U5      n[         R                  R                  SUR                   S[        U5       S3/ [        R                  " U5      UR                  S9n[        R                  " WXB5      $ ! , (       d  f       N= f)Nra   r  r9   r  shape)r#   rc   
mask_loadsr  is_boolr   r  r  rF   r   wrapr  r  r)  )maskbodyothernew_maskresults        rE   maskedHalideOverrides.masked'  s    XX  -VF . ==  KE   v{{m9_U-C,DAF##E*,,	 ! 
 yy611 .-s   C
Cc                    [        S5      e)Nfrexpr   r  s    rE   r  HalideOverrides.frexp9  s    !'**rG   c                    [        S5      e)Ndevice_assert_asyncr   )condmsgs     rE   r  #HalideOverrides.device_assert_async=  s    !"788rG   c                    [         er   r   )r  reduction_typer  
extra_metas       rE   partial_accumulate"HalideOverrides.partial_accumulateA  s
     "!rG   rR   NT)r   torch.dtyper   Optional[torch.dtype])r   r  r   r  )TT)
r  r  r  r  r  r&   r  zdict[str, Any]rT   rU   )JrV   rW   rX   rY   r   r   r  classmethodr  r  r  r  r!  r&  r)  r,  r/  r2  r7  r:  r=  r@  rC  rG  rJ  rO  rS  rV  rY  r\  r_  rb  rf  rk  ro  rs  rv  rz  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r  r  rZ   rR   rG   rE   r   r      s    ,0	55 )5 5   ; ;   k k
   t t t t + +
     $ $                   % %       & & $ $ % % ' '                     8 8 9 9 L L ` ` " "     W W . . # # # #   * * 8 8 8 8         , , 
 
 2 2     
 
   ! ! 	 	 2 2 . . 2 2" + + 9 9 """ " #	"
 
" "rG   r   halidec                     ^  \ rS rSr\R
                  " S5      r  S	       S
U 4S jjjrS rS r	SS jr
S rSrU =r$ )HalideCSEVariableiO  z\b(tmp\d+)\[\?\]c                0   > [         TU ]  XX4S9  S U l        g Nr  )rL   rM   	used_dims)rN   r  r  r   r  rP   s        rE   rM   HalideCSEVariable.__init__R  s     	u:7;rG   c                n   [        U R                  =(       d    S5      n[        R                  " X#R	                  5       5       HK  n[        U[        5      (       d  M  UR                  c	   XU45       eUR                  UR                  5        MM     [        R                  R                  U5      U l        g )NrR   )r   r#  	itertoolschainvaluesr:   r  updater#   rc   sort_used_dims)rN   r  ru   kwargsusedargs         rE   update_on_args HalideCSEVariable.update_on_args\  s~    $...B/??49C#011}}0C4d2CC0CMM* : 006rG   c                    [        U5      S:X  a  U R                   S3$ U R                   SSR                  [        [        U5      5       S3$ )Nr   z[()][rb   ])rt   r  joinmapr  )rN   dimss     rE   	index_strHalideCSEVariable.index_strd  sE    t9>ii[%%))AdiiC78::rG   c                p    U R                   c  U R                   S3$ U R                  U R                   5      $ )Nz[?])r#  r  r6  )rN   s    rE   __str__HalideCSEVariable.__str__j  s0    >>!ii[$$~~dnn--rG   c           	         U R                   b!  [        S U R                    5       5      (       d   eU R                  U R                    Vs/ s H  o!R                  X"5      PM     sn5      $ s  snf )Nc              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7fr   r:   r   Expr.0r   s     rE   	<genexpr>-HalideCSEVariable.subs_str.<locals>.<genexpr>q  s!      2
/=!Jq%**%%~   '))r#  allr6  get)rN   replacementsr   s      rE   subs_strHalideCSEVariable.subs_strp  sc    ~~)c 2
/3~~2
 /
 /
 	
 
 ~~t~~N~!//5~NOONs   
A+)r#  )NN)r  zValueRanges[Any]r   r  r  r7   rT   rU   )rT   r  )rV   rW   rX   rY   recompileundefined_rerM   r.  r6  r9  rG  rZ   r[   r\   s   @rE   r  r  O  sg    ::12L (, $< !< %	<
 < 
< <7;.P PrG   r  c                  V   ^  \ rS rSr% S\S'   S\S'   S\S'   S
U 4S jjrSS jrS	rU =r$ )DimensionInfoiw  zOptional[sympy.Expr]rf   
sympy.Exprr  stridec                   > [         TU ]  5         [        R                  R                  R                  US5      (       a  U* nU* nXl        X l        X0l        g Nr   )	rL   rM   r#   graphsizevarsstatically_known_ltrf   r  rO  )rN   rf   r  rO  rP   s       rE   rM   DimensionInfo.__init__}  sH    77//::WF5D		rG   c                    U R                   c   eU R                   nU(       a  US:X  a  gU(       a  0 UEnUR                   H  n[        U[        R                  5      (       d  M$  [        U[        R                  5      (       d   e[        R                  R                  UR                  5      n[        U[        5      (       d   e[        UR                  U5      5      X'   M     [        X15      n[        R                  R!                  U5      $ )Nr   hl.Var())rf   free_symbolsr   r   TMPr:   r   Symbolr#   rc   lookup_cse_varr  r  r    rG  r!   r  )rN   rF  	zero_varsrf   symr  s         rE   r6  DimensionInfo.index_str  s    yy$$$yy+l+L((!#txx00%c5<<8888((11#((;C%c+<====(:3<<;U(VL% ) d1Dxx$$T**rG   )rf   r  rO  rS   NF)	rV   rW   rX   rY   __annotations__rM   r6  rZ   r[   r\   s   @rE   rM  rM  w  s$    

+ +rG   rM  c                   [         R                  R                  R                  X5      (       a  g [         R                  R                  R	                  U 5      n[         R                  R                  R	                  U5      nX#:X  a)  [         R                  R                  R                  X5        X#:H  $ ! [
         a     gf = fNTF)r#   rR  rS  statically_known_equalssize_hint_or_throw	TypeErrorcheck_equals)r  r  r   r   s       rE   eqrg    s    ww//<<GG//5GG//6 	v	%%d26M	  s   AB5 5
CCc                   [         R                  R                  R                  X5      (       a  g [         R                  R                  R	                  U 5      n[         R                  R                  R	                  U5      nX#:  a)  [         R                  R                  R                  X5        X#:  $ ! [
         a$    [        R                  " X5      nX@:X  a  X:g  s $  gf = frb  )	r#   rR  rS  rT  rd  re  r   gcdcheck_lt)r  r  r   r   ri  s        rE   r  r    s    ww++D88GG//5GG//6 	u	!!$.5L  ii$;= 	s   AB5 5)C#"C#c                    ^  \ rS rSr% \r\rS\S'       S&U 4S jjr	S'S jr
S(S jrS)U 4S jjrS	 rS
 r  S*U 4S jjrS rS+S jrS rS rS*S jrS rS,S jrS-S jrS.S jr S/         S0S jjr          S1S jrS r        S2S jr\R<                  " 5       SS.   S3S jjrSS.S3S jjr S.S jr!S r"S4S jr#S/S  jr$\%S! 5       r&S5S6S" jjr'S# r(        S7S$ jr)S%r*U =r+$ )8HalideKerneli  zCallable[[sympy.Expr], str]r  c                z  > [         TU ]  " U40 UD6  U R                  U l        U R                  U l        U R                  U l        [        5       U l        U R                  U l	        U R                  U l
        0 U l        0 U l        0 U l        0 U l        0 U l        0 U l        [#        [$        5      U l        SU l        g r_  )rL   rM   r  computeloadsstoresr(   indexing_code_dominside_reductionneeds_dom_indexinghas_reductionbuffer_dimensionsbuffer_offsetshalide_varsindex_replacementsreduction_renamesdom_renamesr   listbuffer_aliaseshas_indirect_indexing)rN   tilingr+  rP   s      rE   rM   HalideKernel.__init__  s    
 	*6*yyYY
ii!/!1"&"7"7!22AC57;=@BCEHJ4?4E%*"rG   c                    [        U5      $ r   )r   )rN   r   s     rE   dtype_to_strHalideKernel.dtype_to_str  s    5!!rG   Nc                ^    U R                   R                  U SU< S35        [        XX45      $ )Nz = hl.Func(r9   )r  	writeliner  )rN   r  r  r   r  s        rE   create_cse_varHalideKernel.create_cse_var  s.    		tfKxq9: u<<rG   c           
     J  >^^^ U R                   (       d"  U R                  (       d  U R                  (       a   e[        R                  " [
        R                  R                  R                  [        S9m[        R                  [        [        T U ]8  U5      5      n[        [            " 5       m["        R$                  R'                  U R(                   Vs/ s H  o"R*                  R-                  5       PM     sn5       Vs0 s H  nUR/                  5       U_M     snmS nUU4S jnUU4S jnU H  nUR1                  [2        5      (       aY  UR5                  [3        [6        R8                  " S5      [6        R8                  " S5      [6        R8                  " S5      5      U5        UR1                  [:        5      (       aD  UR5                  [;        [6        R8                  " S5      [6        R8                  " S5      5      U5        TR=                  [        T U ]9  U5      R>                  5        M     [A        S T 5       5      U l!        S	n[E        U R(                  5       GH  nUR*                  R-                  5        Vs/ s H  o3R/                  5       T;   d  M  UPM     n	nU	RG                  U4S
 jS9  U	(       d+  U	RI                  URK                  SURL                  5      5        Sn
[6        RN                  RP                  n/ nU
[S        U	5      :  Ga  [U        URL                  U5      (       Gd  U	 Vs/ s H2  n[U        URV                  U5      (       d  M   U" URX                  5      PM4     nnU
[S        U5      -  n
U(       d   U	5       eU[        RZ                  " [
        R                  R                  R\                  U5      -  nUR_                  U	 Vs/ s HQ  n[a        XRV                  5      (       d  M  [a        URV                  U5      (       d  M<  U" URV                  U-  5      PMS     sn5        U(       Ga  [        RZ                  " [6        Rb                  U5      n[U        US5      (       a7  U" URL                  U-  5      n[U        US5      (       a   e/ n[S        U	5      n
Sn[e        S[S        U R                  5       35      nURf                  (       a.  [e        S[S        U R                  5       35      U R                  U'   XR                  U'   URI                  UU45        X-  nU	 Vs/ s H,  n[U        URV                  U5      (       d  M   URX                  PM.     nnU
[S        U5      -  n
[S        U5      nU Vs/ s H/  n[U        UU5      (       a  M  [6        Rh                  " UU-  5      PM1     nn[S        U5      U:  d  US:X  d   eUR_                  U5        U(       a  GM  U
[S        U	5      :  a  [U        URL                  U5      (       d  GM  U	 H  n SnSn[U        URV                  U5      (       d/  UU   u  nnUS-  nUU-  n[U        URV                  U5      (       d  M/  Sn[6        RN                  Rj                  n[U        URX                  U5      (       d7  UU   u  nnUS-  nUUU-  -  nUU-  n[U        URX                  U5      (       d  M7  UU R                   UR/                  5       '   M     GM     U R                   H/  nU Rp                  Rs                  U SURt                  < S35        M1     U R                  (       aO  U Rw                  SU R                  Ry                  5        VVs0 s H  u  nnUU R                  U   _M     snn5        ggs  snf s  snf s  snf s  snf s  snf s  snf s  snf ! [l         a    U(       d   e[6        RN                  Rj                  n[6        RN                  RP                  nU H  u  nnUUU-  -  nUU-  nM     [
        R                  R                  Ro                  [3        UURV                  URX                  5      U R                  5      U R                   UR/                  5       '    GM{  f = fs  snnf )a  
Hook called right before codegen with every index that will be
used in the fused kernel.

This populates self.halide_vars/index_replacements/reduction_renames which is an alternate indexing
scheme that avoids using divide and modulus.  Instead of xindex/yindex/rindex
we base indexing on a larger number of vars whose product combines to those.

This function populates self.halide_vars, self.index_replacements, and self.reduction_renames
fallbackc                |    [         R                  " [        R                  R                  R                  U 5      5      $ r   )r   simplifyr#   rR  rS  remove_precomputed_replacementsre   s    rE   r  0HalideKernel.finalize_indexing.<locals>.simplify  s+    >>  @@F rG   c                "  > U T;   a  TU    nTR                  UR                  R                  UR                  U-  [        R
                  R                  R                  U[        UR                  U5      5      5      R                  5       5        g g r   )addrootlookupdivisorr#   rR  rS  evaluate_minr   lengthsymbol)baser  modulusnodeall_used_symbolssym_to_nodes       rE   visit_modular_indexing>HalideKernel.finalize_indexing.<locals>.visit_modular_indexing  sw    {""4( $$II$$w.((55#Xdkk7%C
 fh #rG   c           	        > U T;   a`  TU    nTR                  UR                  R                  UR                  U-  [	        UR
                  U5      5      R                  5       5        g g r   )r  r  r  r  r   r  r  )r  r  r  r  r  s      rE   visit_floor_div7HalideKernel.finalize_indexing.<locals>.visit_floor_div  s]    {""4( $$II$$w. g6 fh	 #rG   r  r  r  c              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7fr   )r   r   INDIRECT)r@  r]  s     rE   rA  1HalideKernel.finalize_indexing.<locals>.<genexpr>&  s"      )
:J3N3..:JrC  Fc                (   > T" U R                   5      $ r   )r  )r   	size_hints    rE   <lambda>0HalideKernel.finalize_indexing.<locals>.<lambda>-  s    Yqyy%9rG   keyr$   r   Thhrz
 = hl.Var(r9   rdomN)=rx  rw  ry  	functoolspartialr#   rR  rS  r  r   dictfromkeysr4  rL   r  r   r   r&  r'  from_iterablerange_treesnodesr(  r  hasr   replacer   Wildr   r)  rX  anyr}  reversedsortappendr  numelSOnert   rg  r  r  reduceevaluate_maxextendr  ri  r    is_reductionr  Zero
IndexErrorsimplify_with_rangesindexing_coder  r  codegen_rdomitems)!rN   indicestreer   r  r  r  r  had_fallbackr  handled_countr  added_sym_sizesizes_to_addr  	next_sizer]  	new_sizes	prior_lensr  idxr  r  rf   
full_indexrO  vrvr  r  r  rP   s!                                @@@rE   finalize_indexingHalideKernel.finalize_indexing  s&    ##t'7'74;Q;Q	
 
 %%agg&6&6&@&@3O	--EG$<g FG%c?, __22151A1AB1A""$1AB
 HHJM
	

		 Eyy))#

6*

9-

9-
 + yy""

6*

9- $ ##EG$<U$C$P$PQ% ( &) )
:J)
 &
" T--.D $

 1 1 3V 31xxzEU7UQ 3EVJJ9J:T[[DJJ78MggkkGN#e*,R

G5L5L05 051AIIw9O&HQXX&    \!22#*U*|	 0 0GG$$11<!  ## "'!&Agyy1 668C6H 6W!45!& # ) 0 0L II)Q'' %-TZZ'-A$B	#%i#3#333')(+E
'+,qT5E5E1F0G-HIC((6H T%5%5!6 787..s3 -6$$S)"))3	*:;(G38 S5aBqyy'<R5I S!S^3M #L 1I ".$!-A!!Y/ 6q9}5!- ! $
 |,y8INJJ ''	29 #l#  #e*,R

G5L5L` CG w77$23$7	Tq4 !w77 F 77<<D f55$23$7	Tq,$	 !f55
 >BD++DKKM: s /p ##C((C5
388,a)HI $!!6:6L6L6R6R6TU6TUQT%%a((6TU "g C
z W 4 !T$2 " ''<!&J"WW[[F%3	T"fsl2
$ &4 ((==+JdkkR ,, ++DKKM:( Vsu   #^&^+ ^0^06^5^5^:
:^:
^:
5^?^?_ _$A_	2A,_	 _	?b	Cbbc           
     "   U R                   (       a  SOSnXR                  ;   a  U R                  U   $ 0 nU R                   Hp  nU R                   (       d  X0R                  ;   a  M%  [        R
                  " SUR                  5      nU(       d   e[        SU UR                  S5       35      X#'   Mr     U R                  U S3UR                  5        VVs0 s H  u  pVX`R                  U   _M     snn5        X R                  U'   U$ s  snnf )zCRDom based indexing uses explicit iteration ranges for Func updatesioz^h(\d+)$r  r$   dom)rr  rz  rw  ry  rI  matchr  r    groupr  r  )rN   prefixrenamesr  mr  r  s          rE   setup_dom_indexingHalideKernel.setup_dom_indexing  s    --3%%%##F++##C((S4J4J-Jchh/AH1-&!''!*.FGGL $ 	hcN'--/R/R!1!1!!44/R	
 $+ 	 Ss   Dc           	     ~   UR                  5        Vs/ s H'  nSU R                  U R                  U5      5       S3PM)     nnU R                  R	                  U SSR                  U5       S35        [        UR                  5       5       H)  u  pVU R                  R	                  U SU SU S35        M+     g s  snf )	Nhl.Range(0, r9   z = hl.RDom([rb   ]) = r1  r2  )r(  r  r  r  r  r3  	enumeratekeys)rN   r  varsr  rsizesr  rsyms          rE   r  HalideKernel.codegen_rdom  s     
% 4::d&:&:4&@AB!D% 	 
 	$$v\$))F:K9LB%OP -GA((D6TF!A3a)@A .
s   .B:c                   > [         TU ]  U5      n[        XR                  5      n[        R
                  R                  R                  XR                  5      $ r   )	rL   r  r!   rx  r#   rR  rS  r  rw  )rN   r  rP   s     rE   r  HalideKernel.prepare_indexing  sE     (/5"9"9:ww44U<L<LMMrG   c                    [        U[        R                  5      (       a%  U R                  UR                  5      R
                  $ U R                  U   $ )zThe size of an index symbol)r   r   rY  r[  r  r  rw  )rN   r]  s     rE   sym_sizeHalideKernel.sym_size  s?    #txx((&&sxx0GGG$$rG   c           	     	  ^ ^^ / m[        UR                  S S9 H  n[        U[        R                  [        R
                  45      (       a  TR                  U5        ME  [        U[        R                  [        R                  [        R                  45      (       a  M   U5       e   [        R                  R                  n[        R                  T[        R                  R                  5      n/ n[        R                  " T R!                  U5      5      n[#        U[        R$                  5      (       a  UR&                  OU/ H  nUR                   V	s/ s H  oU;   d  M
  U	PM     n
n	[)        U
5      S:X  a  XX-  nM;  [)        U
5      S:X  a  XjS   ==   U-  ss'   M[  / n[+        [)        U5      5       Hm  nX|   c   eX|   u  p[-        U5      [-        U
5      -  (       a/  U
R/                  U V	s/ s H  oU
;  d  M
  U	PM     sn	5        X-  nM[  UR                  X45        Mo     / UQX4PnM     UU U4S jn/ nU H;  u  nnU H  n	UUR1                  U	5      -  nM     UR                  U" UU5      5        M=     UR3                  5        H  u  nnUR                  U" UU/5      5        M!     UR5                  S S9  U(       dF  T R6                  (       a4  UR                  [9        [        R                  R                  SS5      5        O[:        R<                  R>                  RA                  US   RB                  S5      (       dK  URE                  S[9        [        R                  R                  T(       a  SOUS   RB                  S5      5        U(       a  T(       d  UT RF                  ;   an  [:        R<                  R>                  RI                  UT RF                  U   5      (       a2  T RK                  UUT RF                  U   -
  5        T RF                  U   nOC[:        R<                  R>                  RM                  US5      (       a  T RK                  UU5        SnUn[N        RP                  " 5        He  nT RS                  UUUT5      (       a  UU4s  $ T(       a   eU SU 3nUT RT                  U   ;  d  MG  T RT                  U   R                  U5        Mg     gs  sn	f s  sn	f )	zEConvert address-based indexing into dimensions using self.halide_varsc                    U R                   $ r   r  r  s    rE   r  5HalideKernel.indexing_to_dimensions.<locals>.<lambda>  s    AFFrG   r  r   r$   Nc                ,  > [         R                  " U 5      n [        U5      S:X  aV  [         R                  " ST
S9nU R	                  X!S   -  5      nU(       a$  [        US   T	R                  US   5      X2   5      $ T(       a   U 5       e[         R                  " [        X Vs0 s H  oDT	R                  U5      S-
  _M     sn5      S-   5      n[         R                  R                  n[        U [         R                  5      (       a|  U R                   Hl  n[        U[         R                  5      (       d  M$  Xg-  n[         R                  " X-  5      n [         R                  " [         R                  " XW-  5      5      nMn     [        XU5      $ s  snf )Nr$   wild)excluder   )r   factorrt   r  r  rM  r  r  r!   r  r  r:   Mulru   r  ceiling)rf   symsstride_wildr  r]  r  rO  termis_storerN   symbolss           rE   expr_to_dimension>HalideKernel.indexing_to_dimensions.<locals>.expr_to_dimension  s<   <<%D4yA~#jjAJJ{!W45(QtAw!7   %%<^^4!N#t}}S'9A'="=!NORSSF WW[[F$		** IID!$66$~~dk:!&fm0L!M	 &
 !v66 "Os   $Fc                n    [         R                  R                  R                  U R                  [
        S9$ )Nr  )r#   rR  rS  r  rO  r   )ds    rE   r  r    s"     0 0 : :188c : RrG   _view)+sortedrX  r   r   HALIDErY  r  UNBACKED_INTSIZEPRECOMPUTED_SIZEr   r  r  r  r  expandr  r:   Addru   rt   ranger   r  popr  r  r}  rM  r#   rR  rS  rc  rO  insertrv  statically_known_geqapply_offset_to_dimensionstatically_known_gtr&  countinstall_dimsr|  )rN   r  r  r  r]  r  
split_exprsplit_failedpartr  	part_varsnew_split_failedr  
other_vars
other_partr  r5  r  rf   orig_varr  s   `  `                @rE   indexing_to_dimensions#HalideKernel.indexing_to_dimensions  s   %,,2BCCcDKK#:;;s#%))		--    	 D ]]7EGGLL9
DFT11%89",UEII">">EJJUGKD$($5$5I$5qj$5II9~"Y1$Q<(D0(#% s<01A'?666-9_*J!*-
90EE!((Z)VZICU!Z)VW*(//0HI 2  F!1EI3DE! L$	7. &JD$
q)) KK)$56 ' $))+ICKK)$67 ,		R	S))M%'',,1=>!!99$q'..!LLKK=Hq$q'..RST d)))agg.>.>.S.S++C0/ / ..tVd>Q>QRU>V5VW,,S1!!55  ..tV<"A  dFH==Dy <JeA3'C$--h77##H-44S9 #U J *Ws   	S%$S%2	S*?S*c                |   XR                   ;  a  X R                   U'   X0R                  U'   gU R                  U   U:w  d%  [        U R                   U   5      [        U5      :w  a  gU(       a  U R                   U   U:H  $ [        U R                   U   U5       H  u  pVUR                  UR                  :w  a    gUR
                  UR
                  :w  d  UR                  UR                  :w  d  MW  [        R                  R                  R                  UR
                  UR
                  5      Ul        SUl        M     g)z>Try to set self.buffer_dimensions[var], return True on successTFN)ru  rv  rt   ziprO  r  rf   r#   rR  rS  r  )rN   r  r5  r  r  oldnews          rE   r  HalideKernel.install_dims  s   ,,,*.""3''-$s#v-""3'2
Y2 ))#.$66D2237>HCzzSZZ'xx388#sxx388';77++88388L ? rG   c                   US:X  a  g [        [        [        U5      5      5       H  nX   R                  S:X  d=  [        R
                  R                  R                  X!U   R                  5      (       d  MR  [        X!U   R                  5      nX$X   R                  -  -  nX   =R                  U-  sl	        M     US:X  d   eg )Nr   r$   )
r  r  rt   rO  r#   rR  rS  r  r   rf   )rN   r5  r  r  r  s        rE   r  &HalideKernel.apply_offset_to_dimension3  s    Q;%D	*+Aw~~"agg&6&6&K&KQ' '  Q7//$ , {{rG   c                   [         [        R                     " 5       nUR                   GH-  n[	        U[        R                  5      (       d   e[        U[        R                  5      (       a\  U R                  UR                  5      n[	        U[        5      (       a  UR                  c   eUR                  UR                  5        M  [        U[        R                  5      (       a  UR                  U5        M  [        U[        R                  [        R                   [        R"                  [        R$                  45      (       a  GM"  ['        SU 35      e   U R)                  U5      $ )zIDetect which range trees are used to populate HalideCSEVariable.used_dimszunhandled symbol )r   r   rZ  rX  r:   r   r   rY  r[  r  r  r#  r)  r  r  r  r  r  INDEXr   r*  )rN   r  r#  r]  cse_vars        rE   r  !HalideKernel.used_dims_from_index?  s   u||,.	%%Cc5<<0000c488,,--chh7w(9::))56   !2!23T[[11c"d''D4I4I4::V  ),=cU*CDD# &$ ""9--rG   c                   [        S U 5       5      (       d   e[        R                  " U R                  U R                  R                  5       5       Vs/ s H  nX!;   d  M
  UPM     nn[        U5      [        U5      :X  d   eU$ s  snf )Nc              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7fr   r=  r?  s     rE   rA  .HalideKernel.sort_used_dims.<locals>.<genexpr>W  s     @i:a,,irC  )rD  r&  r'  rw  ry  r(  rt   )rN   r#  r]  ordereds       rE   r*  HalideKernel.sort_used_dimsV  s    @i@@@@@ !  $"8"8"?"?"A
 	  	 
 7|s9~---
s   	B"Bc                   ^^ SR                  UU4S jU 5       5      n[        U5      S:X  a  SnU$ [        U5      S:X  a  U S3nU$ )Nrb   c              3  F   >#    U  H  oR                  TT5      v   M     g 7fr   )r6  )r@  r  rF  r\  s     rE   rA  .HalideKernel.make_index_str.<locals>.<genexpr>c  s     QDqkk,	BBDs   !r   ()r$   ,)r3  rt   )rN   r5  rF  r\  r6  s     `` rE   make_index_strHalideKernel.make_index_strb  sM    IIQDQQ	t9>I  Y!^$+QIrG   c                J   U R                   R                  U5      nU R                  U5      nU R                  X2S5      u  p4U SU R	                  U5       S3n[
        R                  R                  U5      nU[        R                  [        R                  4;   a  [        R                  nSU S3nU R                  (       Ga  [        U R                  [        5      (       a  U R                  R                  c   e[!        / U R#                  U5      QU R                  R                  Q75      nU R%                  U R'                  U5      5      nUR                  (       a  U R(                  R+                  UR,                   S35        U R(                  R+                  UR,                   SU R                   S35        U R/                  U R0                  =(       d    S5      n	U R(                  R+                  U S	[3        U5       S
U	 S35        U R(                  R+                  U SU S[3        U5       S
UR,                   S35        U$ U R(                  R+                  U SU R                   S
U S[3        U5       S35        U$ U R5                  XPR#                  U5      5      $ )z"Codegen a load from an InputBufferFr1  r2  rj   r9   z!_mask = hl.RDom([hl.Range(0, 1)])z_mask.where(r   z = hl.cast(rb   r  z + hl.cast(z_mask)z = hl.select(z
, hl.cast(z, 0)))ru   inputr  r  r2  r#   rR  	get_dtyper<   r   r   r   
_load_maskr:   r  r#  r   r  newfuncr*  r  r  r  r  _load_otherr   r  )
rN   r  r  r  r5  r  r   r#  r	  r  s
             rE   r  HalideKernel.loadk  sK   iiood#%%e,//EB	a++D12!4!!$'U]]ENN33MME+D63D???4??,=>>OO--9: #O$++E2OT__5N5NOI \\$"5"5i"@AF		##v{{m3T$UV		##v{{m<?PPQ$RS

4#3#3#8q9		##hk+e*<)=RwaH 		##hc${;u3E2FbU[\ M 		##hmDOO+<BtfJ{[`OaNbbgh M<<&?&?&FGGrG   c                `    U R                   R                  [        R                  " SSU5         $ )Nz\[.* )csevarname_maprI  r  rN   r  s     rE   r[  HalideKernel.lookup_cse_var  s$    xx##BFF7B$=>>rG   c                <   [        U[        5      (       d   eU R                  R                  U5      nU R	                  U5      nU R                  XRS5      u  pVU R                  U5      (       d  Ub  U R                  5       nU R                  Xg5      nUR                  U5      n	SR                  S/[        U5      -  5      =(       d    Sn
U R                  R                  [        X SU
 SU S35      5        OU R                  USS	9n[        U5      n	[         R"                  R%                  U5      nUc  U SU S
['        U5       SU	 S3nO,US:X  a  U SU S['        U5       SU	 S3nO[)        SU 35      eU R                  R                  [        X5      5        g)z"Codegen a store to an OutputBufferTNrb   rW  r0  r1  z] = hl.undef(z.type()))r\  z] = hl.cast(r9   
atomic_addz] += hl.cast(zstore mode=)r:   r  ru   outputr  r  is_indirect_indexingr  r2  rG  r3  rt   r  r  r'   r  r#   rR  r6  r   r   )rN   r  r  r  moder  r5  rF  r6  	value_str
undef_dimsr   r  s                rE   storeHalideKernel.store  s    %!23333iit$%%e,//DA	$$U++t/?224L++D?I|4I))ZL3t9$<=F$JIITU!J<}SE#RS ++DD+AIE
I!!$'<U!I;l;u3E2FbSTUD\!U!I;mK4F3Gr)TUVD%D6&:;;		L45rG   c           	        U R                   (       d   eU R                  (       a   eX#U4nXPR                  R                  ;   a  U R                  R                  U   $ [	        U[
        5      (       a2  US:X  d   eU R                  " U6 =U R                  R                  U'   nU$ [	        U[        5      (       a  UR                  c   e[        U R                  5      nU R                  UR                   Vs/ s H  oU;  d  M
  UPM     sn5      n	U[        UR                  5      -
  (       aC  U R                  U U R                  [        / UR                  QUQ75      5      UR                  S9nUR                  U R                  5      n
[         R"                  R%                  X25      n['        U5      nUS;   a  U	R(                   SU 3nU R*                  R-                  U SU SU
 S35        / nS	n[/        U R                  5       HD  u  nnUR1                  U S
U S35        US	:w  a  US==   SU 3-  ss'   XR2                  U   -  nMF     U R*                  R-                  U	 SSR5                  U5       35        OUS:X  a  U R7                  X5      n	O[9        X<5      n[:        R<                  " [?        [A        5       5      5         U" X5      nSSS5        SU S[C        U5       S3nU R*                  R-                  U	 SU 35        U R*                  R-                  U	 SW 35        XR                  R                  U'   U	$ s  snf ! , (       d  f       N= f)zCodegen a reduction operationwelford_combineNr"  )argmaxargmin_z = hl.z(rdom, r9   r$   r1  r2  *r  r  welford_reducera   rb   )"rr  r7  r=  reduction_cacher:   tuplewelford_combine_implr  r#  r   ry  r8  r  r*  r  rG  r   	Reductiondefault_accumulatorr   r  r  r  r  r  rw  r3  welford_reduce_fallbackr   r#   set_ops_handlerr   r   rF   )rN   r   r   r  r  	cache_keyresult_tuplereduction_varsr  
result_varrF  defaultacc_typer  partsrO  r  r]  
combine_fncombine_strdefault_strs                        rE   	reductionHalideKernel.reduction  s    $$$$??""6	00088++I66eU##!%6666))51DHH$$Y/,  %!2338SSS#D$:$:;\\C1N+BQC

 Ju77LL'##J/R/R>/R$STkk ! E
 NN4#9#9:	,,22>M"5)11!'q(89EII5'/?wykQR STEF#D$:$:;3was!_-Q;"I1VH-I**3// < II:,c%**U2C1D EF//55eCJ1.KJ""??3D#EF(? G$XJb1I0J!LKII:,c+ ?@II:,c+ ?@.8  +M D> GFs   6	MM	M
Mc                   [        U[        5      (       a  UR                  c   e[        U[        5      (       a  UR                  c   e[        U[        5      (       a  UR                  c   e[        / UR                  QUR                  QUR                  Q7=(       d    U R                  5      nU[        U R
                  5      -  nU R                  U R                  U5      5      nXU4 Vs/ s H  nSUR                   S3PM     nnUR                  nU R                  R                  U SSR                  U5       S35        U R                  R                  U SU S35        U R                  R                  U SU S	35        U R                  R                  U S
U S35        U R                  R                  U SUR                  U R
                  5       35        U R                  R                  U SUR                  U R
                  5       35        U R                  R                  U SUR                  U R
                  5       35        U R                  R                  U SU SU S35        U R                  R                  U SU SU S35        U R                  R                  U SU SU SU S35        U SU SU S3U SU SU SU SU SU S3U S3/n	U R                  R                  U SSR                  U	5       S35        / n
[        S 5       HT  nU
R                  U R                  UR                  5      5        U R                  R                  U
S!    S"U S#U S$35        MV     [        U
5      $ s  snf )%Nra   z.type(), 0)z = hl.Tuple([rb   r  z
_mean_1 = z[0]z_m2_1 = z[1]z_weight_1 = z[2]z
_mean_2 = z_m2_2 = z_weight_2 = z	_delta = z
_mean_2 - _mean_1z_new_weight = z_weight_1 + 	_weight_2z_w2_over_w = hl.select(z_new_weight == 0.0, 0.0, z_weight_2 / z_new_weight)z
_mean_1 + z	_delta * 
_w2_over_wz_m2_1 + z_m2_2 + z_weight_1 * _new_weightr   rO  r  r1  r2  )r:   r  r#  r   rw  ry  r8  r*  r  r  r  r3  rG  r  r  rS  )rN   meanm2weightr#  r\  r   r]  pfxr)  unpackedr  s               rE   rT  !HalideKernel.welford_combine_impl  sh   $ 122t~~7QQQ"/00R\\5MMM&"3449I9I9UUU?dnn?r||?f.>.>?S4CSCS
	 	Z 6 677	\\$"5"5i"@A
<@f;MN;MaXaffX[1;MNoo		zl-		'8J7K2NO		se:j\=>		se8J<s;<		se<
|3?@		se:dmmD<R<R.S-TUV		se8BKK8N8N,O+PQR		e<0F0F GHI	
 			se9SEC5HI		se>#l3%yQR		e*3%/H\Z]Y^^jk	
 e:cU)C5
;e8C5Yse9SEVYUZZdee;

 			zl-		&8I7J"MNqAOODLL)=)=>?II8B<.J<q1 EF  X7 Os   .Nc           
        U R                   (       d   e[        U5      [        U5      :X  d   e/ n[        [        R                     " 5       nU H  n[        U[        5      (       a  UR                  c   e[        UR                  5      [        U R                  5      -  (       a  UR                  U5        OHUR                  U R                  U / UR                  Q/ U R                  QS S PUR                  S95        UR                  UR                  5        M     U R                  U R                  U5      5      nUR                  (       a0  [        UR                  5      [        U R                  5      -  (       d   e[        X5       VVs/ s H  u  pS[!        U5       SU S3PM     n	nnU R#                  U R%                  U R&                  S   R(                  5      5      n
UR*                   S3nU S3nU R,                  R/                  U S	U
 S
35        [        U R                  5      S:X  d   S5       e/ U R                  Qu  nU[1        U5      0nU[1        U5      S-
  0n[        U5      S:X  a(  S nUR3                  U5      /nUR3                  U5      /nOwS n[5        [        U5      5       Vs/ s H  nUR3                  U5      SU S3-   PM     nn[5        [        U5      5       Vs/ s H  nUR3                  U5      SU S3-   PM     nnU R,                  R/                  U SU" U	5       35        [6        R8                  " [;        [=        5       5      5         U" UU5      nS S S 5        U R,                  R/                  UR3                  U5       SU" W5       35        [        U5      S:X  a  U4$ U Vs/ s H#  nU R                  U R                  U5      5      PM%     nn[?        U5       H*  u  nnU R,                  R/                  U SU SU S35        M,     [A        U5      $ s  snnf s  snf s  snf ! , (       d  f       N= fs  snf )Nr$   r"  ra   rb   r9   rO  _rdomz.xz = hl.RDom([hl.Range(1, z)])z&multi-dimensional scan not implementedc                    U S   $ rQ  rR   r  s    rE   maybe_tuple&HalideKernel.scan.<locals>.maybe_tupleI  s    trG   c                ,    SSR                  U 5       S3$ )Nz
hl.Tuple([rb   r  )r3  r  s    rE   rs  rt  P  s    #DIIaL>44rG   r1  r2  r  )!rr  rt   r   r   rZ  r:   r  r#  ry  r  r  r  r)  r8  r*  r  r   r  r  r  r  r  r  r  r    rG  r  r#   rX  r   r   r  rS  )rN   dtypesr`  values_origr(  all_used_dimsr  r\  r   initialr  scan_domscanscan_varscan_renames_curscan_renames_prirs  	read_left
read_rightr  ra  rN  unpack_varsr  s                           rE   r{  HalideKernel.scan  s1    $$$$6{c+....*,"5<<02 Ee%677EOO<WWW%//*Z8N8N-OOe$LL 'I%//I+DT-C-C+DRa+HI#kk !    1 ! \\$"5"5m"DE
##
:3G3G(H:""L
 )
 	
 

 !$F 3
 3 u-.bq9 3 	 

 D001A1A"1E1K1KLM oo&e,2		xj(@LM4))*a/ 	
4	
/ 0../$&8&>?$&8&>&BCv;! $,,-=>?I$--.>?@J5
 s6{++A ##$45!A3a@+   s6{++A ##$45!A3a@+  
 			zl#k'.B-CDE /@AB$Y
;K C		""#345S[9Q8RS	
 v;!= QWXQWAt||D$7$7$FGQWXk*DAqII1#SAaS :; +[!!k
: CB Ys$   P19"P73"P<$
Q*Q
Qr   c                   U R                   R                  U R                  XUS9n[        U[        5      (       d   eX%l        U$ )Nr   )r=  generater  r:   r  r#  )rN   r  r#  r  r  r  s         rE   r  HalideKernel.genfuncm  sA     hh		4eL#01111!
rG   r"  c               r    U R                   R                  US9n[        U[        5      (       d   eXl        U$ r!  )r=  newvarr:   r  r#  )rN   r#  r  r  s       rE   r8  HalideKernel.newfuncz  s4    hhooEo*#01111!
rG   c                x    [         R                  R                  U5      R                  5       R	                  5       $ )z
We map all tensors to 1D buffers in Halide since Halide has trouble representing some strides that PyTorch
supports.  If there are gaps in the underlying layout the numel we pass to Halide includes the gaps while
PyTorch's numel excludes them.
)r#   rR  
get_buffer
get_layoutstorage_sizer?  s     rE   halide_buffer_numel HalideKernel.halide_buffer_numel  s+     ww!!$'224AACCrG   c                  ^ S n/ nU R                   R                  5       u  p4pS[        [        XE5      US9 H  u  nmUR	                  UT45        [        T[        5      (       d  M0  TR                  S:X  a  TR                  b   eUR                  U4S jU R                  R                  TR                  S5       5       5        M     U$ )zH
Halide requires scalar inputs before outputs, so need to reorder args.
c                ~    U u  p[        U[        5      (       a  gSUR                  ;   a  gSUR                  ;   d   eg)Nr$   out_ptrr   in_ptrr   )r:   r,   r  )	arg_tuple	_call_strr-  s      rE   	arg_order.HalideKernel.halide_argdefs.<locals>.arg_order  s=    &NI#w''chh&388+++rG   r  r   c           
   3     >#    U  H<  nS [        UTR                  TR                  TR                  TR                  S94v   M>     g 7f)N)alias_of)r-   bufferr   r  r  )r@  aliasr-  s     rE   rA  .HalideKernel.halide_argdefs.<locals>.<genexpr>  sJ       "G !!JJIIJJ%(XX	 "Gs   AArR   )ru   python_argdefsr  r  r  r:   r-   r  r  r  r|  rE  r  )rN   r  r	  rN  r   r   call_strr-  s          @rE   halide_argdefsHalideKernel.halide_argdefs  s    
	 =?YY--/
a#CI9=MHcMM8S/*#y))zzQ3<<+???  "&!4!4!8!82!F 	 >" rG   c                .   / nU R                  5        GH=  u  p#[        U[        5      (       a	  SnSnSnSnOU R                  UR                      Vs/ s H'  n[        U R                  UR                  5      5      PM)     nnU R                  UR                      Vs/ s H'  n[        U R                  UR                  5      5      PM)     nn[        U5      [        U5      :X  d   e[        U R                  UR                     5      n[        UR                      S3nUR                  [        UUR                  UUUUR                  S95        GM@     [         R"                  R%                  5       n	U	R&                  S:X  aE  [(        R*                  R,                  /n
[(        R*                  R.                  nS[1        5       0nSnGOU	R&                  S:X  d   S5       eU	R2                  S	::  d   S
5       e[(        R*                  R4                  /n
[(        R*                  R6                  n[8        R:                  R=                  U	5      nSU
S	   ;  aF  S H@  u  nnUR>                  U:  d  M  UR@                  U:  d  M*  U
R                  SU U 35          O   U
R                  S5        SURB                  0n[E        S	U	R2                  5      nU
R                  S5        U
R                  S5        [(        R*                  RF                  (       d  U
R                  S5        [(        R*                  RH                  (       a  U
R                  S5        SU RJ                  ;   a  U
R                  S5        [M        USRO                  U
5      UUUS9$ s  snf s  snf )z)Compute metadata required by codecache.pyNlongrP  )r  rO  r  r  cpuparallelismcudazonly cpu/cuda supportedr   zonly default device supportedcuda_capability))      )r  r   )      )r  r   )r  r$   cuda_capability_user_contextstrict_float
no_runtime
no_assertsdebug64large_buffers-)target	schedulerscheduler_flagscuda_device)(r  r:   r,   ru  r  r/   r  r  rO  rt   rv  r.   r   r  r   r  r#   rR  get_current_device_or_throwtyper   r  
cpu_targetscheduler_cpur   r  
gpu_targetscheduler_cudar<   r  get_device_propertiesmajorminormulti_processor_countr@   assertsr  rd   r   r3  )rN   argtypesrN  r-  r  rO  r  r   r   current_devicer  r  r  r  
capabilityr  r  s                    rE   halide_kernel_metaHalideKernel.halide_kernel_meta  s   ))+FA#w'' "33CHH== $..qvv67=   "33CHH== $..qxx89=   5zS[000t22388<='		2315OOHH!! \\	% ,: <<>%'mm../F33I35O K!&&&0K2KK0!''1,M.MM,mm../F44I99.IJ q	1$LLE5!''50Z5E5E5N(8w&GH %M MM.)z??O
 a!5!56K 	n% 	l#}}$$MM,'==MM'"4###MM/*88F#+#
 	
Cs   .N.Nc                
  ^  T R                   R                  (       a  [        S5      eT R                  5       n[	        5       nUR                  SSS9  UR                  5         T R                  5        H  u  pE[        U[        5      (       a-  UR                  UR                   ST R                   S35        MG  UR                  (       d   U5       eSUR                  ;   a  SOS	n[        UR                  5      n[!        T R"                  UR                     5      nUR                  UR                   S
U SU SU S35        M     UR                  S5        UR                  5         T R                  5        H/  u  pEUR                  UR                   SUR                   35        M1     T R                   R%                  5        H  u  pUR                  U	 S
U
 35        M     UR                  T R&                  5        U 4S jnT R(                  R*                   HH  n[        U[,        5      (       a  [.        R0                  R3                  X5      nUR                  U5        MJ     UR                  S5        UR                  S5        T R                  5        GH  u  pE[        U[        5      (       aU  [4        R6                  R8                  R;                  UR<                  SS9nUR                  UR                   SU S35        Mp  T R"                  UR                     n/ n[?        U5       GH  u  nnT RA                  [4        R6                  R8                  R;                  URB                  SS9U5      nURE                  SU S35        SUR                  ;  d  Mp  UR                  UR                   SU S35         UR                  UR                   SU S[G        URH                  5       S35         UR                  UR                   SU S[G        URB                  5       S35        GM     UR                  UR                   SSRM                  U5       S35        GM     URO                  S5        UR                  SRQ                  5       5        URR                  (       am  UR                  S[T        RV                  " URR                  5      < SURX                  < S URR                  < SURZ                  < S!3	SS9  UR]                  5       $ UR                  S"URX                  < S#3SS9  UR]                  5       $ ! [J         a     GN\f = f! [J         a     GM7  f = f)$z3Called at the end to generate a final kernel stringinplace_buffersz
            import halide as hl
            from torch._inductor.runtime import halide_helpers
            from math import inf, nan

            @hl.generator(name="kernel")
            class Kernel:
        Tstripz = hl.InputScalar(r9   outzhl.OutputBufferzhl.InputBufferr  r   rb   z&
            def generate(g):
        z = g.c                   > [        [        TR                  R                  U R	                  S5         5      nUR
                  c   U5       e[        U5      $ )Nr$   )r   r  r=  r>  r  r#  r  )r  r  rN   s     rE   update_index1HalideKernel.codegen_kernel.<locals>.update_index&  sE    ($((*>*>qwwqz*JKC==,1c1,s8OrG   r<  zassert g.using_autoscheduler()r$   r  z.set_estimate(r  z.dim(z).set_min(0)z).set_stride(z).set_extent(z.set_estimates([r  r   zN
            if __name__ == "__main__":
                hl.main()
            z:
                else:
                    hl.load_plugin(z))
                    target = hl.Target(z=)
                    autoscheduler = hl.AutoschedulerParams(a  )
                    with hl.GeneratorContext(target, autoscheduler):
                        gen = Kernel()
                        pipeline = gen._build_pipeline()
                        # gen.compile_to_callable() does not run the autoscheduler
                        pipeline.apply_autoscheduler(target, autoscheduler)
                        kernel = pipeline.compile_to_callable([
                                gen._get_input_parameter(a.name)._to_argument()
                                for a in gen._get_arginfos()
                                if a.dir == hl.ArgInfoDirection.Input
                            ], target)
                zR
                  else:
                      with hl.GeneratorContext(hl.Target(zX)):
                          kernel = Kernel().compile_to_callable()
                  )/ru   r  rI   r  r(   splice	do_indentr  r:   r,   r  r  rd   r  r   r   rt   ru  aliasesr  r  _linesr  r  rK  r  r#   rR  rS  r  rf   r  _autoscheduler_workaroundsr  r  r;   rO  re  r3  do_unindentrstripr  r   find_libautoscheduler  r  getvalue)rN   r  metacoderN  r-  argclsargtypendimr  r   r  r  hintr5  range_hintsr  dims   `                 rE   codegen_kernelHalideKernel.codegen_kernel  s   99$$/00&&(  	 
	
 	))+FA#w''#((+=d>N>N=OqQRzz&3&z.3sxx.?*EU%cii0411#((;<#((3vhay4&JK , 		

 	))+FANNchhZuSXXJ78 ,		))+HCNNcU#cU+, ,D&&'	
 II$$D$$$(5599,MNN4 	 %
 	r78))+FA #w''ww''11#((Q1G#((>$qAB--chh7 'oFAs::((22388a2H$D  &&dV1'=>CHH,#((5<'HI! NN#&88*E!M#cjj/ARRS T
! NN#&88*E!M#chh-PQ R .( #((+;DIIk<R;SSUVW= ,@ 	 		
 >>KK$$3$H$H$X#[ \((, 7<<@NN;MRPTPdPdOg h	  #  8 }} KK::>++ I
    }}]  ) ! !  ) ! !s$   7U97U
UU
U'&U'c                    [        U5      S:X  aV  [        R                  R                  S:X  a8  [        R
                  R                  5       R                  S:X  a  [        SU 5      n U $ )Nr$   Anderson2021r  r   )	rt   r   r  r  r#   rR  r  r  r@   )r   r5  s     rE   r  'HalideKernel._autoscheduler_workaroundsy  sN     IN,,>335::fD Aq	ArG   c                   [         R                  R                  nU R                  5        VVs/ s H  u  pVUR                  b  M  U PM     nnn[         R                  R                  5       nUR                  S:X  aE  UR                  UR                  [         R                  R                  5      n	UR                  U	5        UR                  UUUSS9  gs  snnf )zCodegen a call to this kernelNr  F)devicetriton)r#   rR  wrapper_coder  r  r  r  write_get_raw_streamr  r  r  generate_kernel_call)
rN   r  r  deallocate_wswrapperr   r-  	call_argsr  stream_names
             rE   call_kernelHalideKernel.call_kernel  s    ''&&*.*=*=*?X*?3<<VsV*?	X<<>&(!66$$aggllK [)$$!	 	% 	
 Ys   CCc                    gr_  rR   )rN   r  s     rE   generate_assertHalideKernel.generate_assert  s    rG   c                    g r   rR   )rN   rf   r  loweruppers        rE   check_boundsHalideKernel.check_bounds  s     	rG   )r|  ru  rv  rn  rz  rw  r}  rt  rx  rq  ro  rs  ry  rp  )r~  zdict[str, sympy.Expr]rT   rU   )r   r  rT   r  )NNN)r  zSequence[sympy.Expr])r  rN  )r  r  r  rN  r  r   r_  )r  r  r  rN  )r  r  r   )
r  r  r  rN  r  r&   rE  r6   rT   rU   )
r   r  r   r  r  r5   r  +Union[CSEVariable, tuple[CSEVariable, ...]]rT   r  )rv  ztuple[torch.dtype, ...]r`  zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]rw  tuple[CSEVariable, ...]rT   r  )r  r7   rT   r  )rT   r   r  )r  r  r  r   )rf   rN  r  rN  r  r   r  r   ),rV   rW   rX   rY   r   	overridestexprr  r`  rM   r  r  r  r  r  r  r  r  r  r  r  r*  r2  r  r[  rH  rc  rT  r{  r   unknownr  r8  r  r  r  r  r   r  r  r  r  rZ   r[   r\   s   @rE   rl  rl    s   I).E&.+%+ 
	+6"=
l\*BNN%f:P(
..
%HN? SW66 *63>6FO6	6:>> > &	>
 ;> 
5>@$LS"'S"
S" -S" 
!S"t ""$ $  
 =A D"HQ
fxt  
"&09=FJ rG   rl  c                  2    \ rS rSr\r\SS j5       rS rSr	g)HalideSchedulingi  c                    [        [        R                  [        R                  [        R                  /5      n[
        R                  R                  (       a  UR                  [        R                  5        U$ r   )
r   r%   TUPLE_REDUCTIONPREFER_STORE_LOOP_ORDERREDUCE_TO_SINGLE_ELEMENTr   r  scan_kernelsr  SCAN)r  r  r	  s      rE   get_backend_features%HalideScheduling.get_backend_features  sR    ..6677
 ==%%JJ~**+rG   c                   [         R                  R                  nXR                  ;   a  UR                  U   nU$ SUR	                  5        3nXTR                  U'   UR                  S5        [        5       nUR                  SUR                  5       < S35        UR                  USS9  UR                  S5        [        X$5      u  pxU SU 3n	UR                  XVR                  5       U	5        [        S	5      (       a  [        US
U5        U$ )z6Codegen kernel definition to go in output wrapper codehalide_kernel_zEfrom torch._inductor.runtime.hints import HalideMeta, HalideInputSpeczasync_compile.halide(z, '''Tr  z''')
kernel_metadatar<  )r#   rR  r  src_to_kernelnext_kernel_suffixadd_import_oncer(   r  r  r  r   define_kernelr  r   r   )
rN   src_codenode_schedulerc   r  kernel_namecompile_wrapperoriginsdetailed_originsmetadata_comments
             rE   r  HalideScheduling.define_kernel  s   ''&&,,,!//9K. + +7+E+E+G*HIK.9!!(+##W -.O%%'(A(A(C'FeL ""84"8%%f-(;M(S%G")"-=,>?!!5579I ''899#KX>rG   rR   N)r  ztorch.devicerT   zOrderedSet[BackendFeature])
rV   rW   rX   rY   rl  kernel_typer  r  r  rZ   rR   rG   rE   r  r    s    K
 
rG   r  )t
__future__r   dataclassesr  r&  loggingrI  collectionsr   mathr   typingr   r   r   r	   r
   r   r<   torch._logging_prims_commonr   utils._ordered_setr   utils._sympy.functionsr   r   utils._sympy.symbolr   r   utils._sympy.value_rangesr   r<  r   r   	codecacher   r   metricsr   r   ops_handlerr   runtime.hintsr   r   utilsr   r   r   r    r!   virtualizedr"   r  r#   commonr%   r&   r'   r(   r)   r*   r+   r,   r-   cppr.   	cpp_utilsr/   simdr0   r1   r2   collections.abcr3   r4   r5   r6   shape_propagationr7   	getLoggerrV   r  rF   RuntimeErrorrI   r^   r   r  pexprr   r   r   r   float64r  int16r   r>   uint8uint16uint32uint64r   r   r   r   _initialize_pointwise_overridesr  	dataclassrM  rg  r  rl  r  rR   rG   rE   <module>r<     s   "     	 #  < <    - , ? 7 4  ' ) B ) 7  )
 
 
   ; ; 262!
F, F
QM QD 	 
JJ	NNO	MM>	MM>	MM>	JJ	KK	KK	KK	KK	LL-	LL-	LL-"V"k V"r
  / / 9%P %PP + + +@
 f: fR+~ +rG   