
    ȅi                       S SK Jr  S SKrS SKrS SKrS SKrS SKJr  S SKJ	r	J
r
Jr  S SKrS SKJr  S SKrS SKJr  S SKJr  S SKJrJr  S S	KJr  S
SKJrJrJr  S
SKJrJ r J!r!  SSK"J#r#J$r$J%r%J&r&J'r'J(r(  SSK)J*r*J+r+J,r,  \(       a  S SKJ-r-  S
SK.J/r/J0r0  S
SK1J2r2J3r3  SSK"J4r4  \Rj                  " \65      r7\Rp                  S\Rr                  S\Rt                  S\Rv                  S\Rx                  S\Rz                  S\R|                  S\R~                  S\R                  S0	rAS'S jrB " S S\5      rC " S  S!\'5      rD\DR                  S"5        \DR                  5          " S# S$\+5      rG " S% S&\,5      rHg)(    )annotationsN)Path)AnyOptionalTYPE_CHECKING)
PRECEDENCE)_embed_headers)
OrderedSet)
CppPrinterExprPrinter)ValueRanges   )ceildivget_bounds_index_exprget_kernel_metadata)ops
OpsWrapperV   )CSEVariableDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferOpOverridesPythonPrinter)IterationRangesEntry
SIMDKernelSIMDScheduling)Union)ReductionType	StoreMode)	SchedulerSchedulerNode)OpVarTboolcharshortintlongucharfloathalfbfloatc                    [        U [        5      (       a<  U [        R                  :X  a  gU [        R                  * :X  a  gX :w  a  g[	        U 5      $ [        U [
        5      (       a  U (       a  S$ S$ [	        U 5      $ )N	HUGE_VALFz
-HUGE_VALFNANtruefalse)
isinstancer+   torchinfstrr%   )vals    U/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/codegen/mps.pyvalue_to_metalr9   8   sc    #u%))UYYJZ3x	C		v)')s8O    c                      \ rS rSrSrSS jrSS jrSS jrSS jrSS jr	SS jr
SS	 jrSS
 jrSS jrSS jrSS jrSS jr\rSS jrSS jrSS jrSrg)MetalExprPrinterF   z/Converts sympy expression to Metal code snippetc                    UR                   u  p#U R                  U5      nU R                  U5      nUR                  (       a	  SU SU S3$ SU SU S3$ )Nc10::metal::floor_divide(, )metal::floor() / (argsdoprint
is_integer)selfexprxdivs       r8   _print_FloorDiv MetalExprPrinter._print_FloorDivI   sY    LLOll3??.qcC5::qcse1--r:   c                    UR                   u  p#nU R                  U5      nUS:w  a5  U R                  U5      nUR                  (       a
  SU SU S3nO	SU SU S3nU R                  U5      nSU SU S3$ )Nr   (rC   rA   rB   z) % (rD   )rH   rI   rJ   rK   mods        r8   _print_ModularIndexing'MetalExprPrinter._print_ModularIndexingQ   s    iiLLO!8,,s#Cs%uA&#A3eC52ll31#U3%q!!r:   c                    [        UR                  5      S:w  a  [        S5      e[        U R                  UR                  5      u  p#SU SU SU S3nSU SU SU S3nSU SU S3$ )	Nr   z$metal::min only supported for 2 argsstatic_cast<decltype(+)>(rA   zmetal::min(r@   lenrE   RuntimeErrormap_printrH   rI   ab
typecast_a
typecast_bs         r8   
_print_MinMetalExprPrinter._print_Min]   }    tyy>QEFF4;;		*,QCq3qc;
,QCq3qc;
ZL:,a88r:   c                    [        UR                  5      S:w  a  [        S5      e[        U R                  UR                  5      u  p#SU SU SU S3nSU SU SU S3nSU SU S3$ )	Nr   z$metal::max only supported for 2 argsrT   rU   rV   rA   zmetal::max(r@   rW   r\   s         r8   
_print_MaxMetalExprPrinter._print_Maxe   rc   r:   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr   metal::abs(r   rA   rX   rE   r[   rH   rI   s     r8   
_print_AbsMetalExprPrinter._print_Absm   s9    499~"""T[[167q99r:   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr   zstatic_cast<long>(metal::rint(r   ))ri   rj   s     r8   _print_RoundToInt"MetalExprPrinter._print_RoundToIntq   s9    499~"""/DIIaL0I/J"MMr:   c                    [        UR                  5      S:X  d   eUR                  u  p#UR                  (       a  US:  d   e[        SU S35      eU R	                  U[
        S   5      nSU SU SU*  S	3$ )
Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulz!static_cast<float>(metal::rint(1e * z) * 1erA   )rX   rE   rG   
ValueErrorparenthesizer   )rH   rI   numberndigits
number_strs        r8   _print_RoundDecimal$MetalExprPrinter._print_RoundDecimalu   s    499~"""))Q;;abiajjkl  &&vz%/@A
27)3zl&RYQYPZZ[\\r:   c                l    UR                   u  p#SU R                  U5       SU R                  U5       S3$ )Nstatic_cast<float>(z) / static_cast<float>(rA   )rE   r[   )rH   rI   lhsrhss       r8   _print_IntTrueDiv"MetalExprPrinter._print_IntTrueDiv   s9    99$T[[%5$66MdkkZ]N^M__`aar:   c                    [        UR                  5      S:X  d   e[        U R                  UR                  5      u  p#SU SU S3$ )Nr   zmetal::pow(static_cast<float>(z), static_cast<float>(rn   )rX   rE   rZ   rF   )rH   rI   rJ   ys       r8   _print_PowByNatural$MetalExprPrinter._print_PowByNatural   sD    499~"""4<<+/s2H2NNr:   c                    [        UR                  5      S:X  d   eU R                  UR                  S   5      nSU S3$ )Nr   r   r}   rA   rX   rE   rF   rH   rI   rJ   s      r8   _print_ToFloatMetalExprPrinter._print_ToFloat   s=    499~"""LL1&$QCq))r:   c                b    UR                   (       a  [        [        U5      5      $ [        U5      $ N)rG   r6   r(   rj   s     r8   _print_FloatMetalExprPrinter._print_Float   s#    ?? s4y>!t9r:   c                    [        UR                  5      S:X  d   eU R                  UR                  S   5      nSU S3$ )Nr   r   z1static_cast<int>(metal::floor(static_cast<float>(z)))r   r   s      r8   _print_FloorToInt"MetalExprPrinter._print_FloorToInt   s=    499~"""LL1&B1#SIIr:   c                    [        UR                  5      S:X  d   eU R                  UR                  S   5      nSU S3$ )Nr   r   zstatic_cast<int>(metal::trunc(rn   r   r   s      r8   _print_TruncToInt"MetalExprPrinter._print_TruncToInt   s=    499~"""LL1&/s"55r:   c                    [        UR                  5      S:X  d   eU R                  UR                  S   5      nSU S3$ )Nr   r   zmetal::log2(rA   r   r   s      r8   _print_OpaqueUnaryFn_log2*MetalExprPrinter._print_OpaqueUnaryFn_log2   s=    499~"""LL1&aS""r:   c                J   ^  U 4S jUR                    5       u  p#nU SU SU 3$ )Nc              3  \   >#    U  H!  nTR                  U[        S    S-
  5      v   M#     g7f)Atomg      ?N)rv   r   ).0argrH   s     r8   	<genexpr>0MetalExprPrinter._print_Where.<locals>.<genexpr>   s.      
HQDc:f#5#;<<	s   ), ?  : )rE   )rH   rI   cpqs   `    r8   _print_WhereMetalExprPrinter._print_Where   s3    
HL		
a Cs#aS!!r:    N)rI   
sympy.Exprreturnr6   )__name__
__module____qualname____firstlineno____doc__rL   rQ   ra   re   rk   ro   rz   r   r   r   r   r   _print_floorr   r   r   __static_attributes__r   r:   r8   r<   r<   F   s\    9.
"99:N
]b
O
*
J
 %L6
#
"r:   r<   c                     \ rS rSrSr\  S2         S3S jj5       r\        S4S j5       r\S5S j5       r\S6S j5       r	\S7S j5       r
\S8S	 j5       r\S9S
 j5       r\S:S j5       r\S:S j5       r\S:S j5       r\S:S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S<S j5       r\S;S j5       r\S;S j5       r \S;S j5       r!\S;S  j5       r"\S;S! j5       r#\S:S" j5       r$\S;S# j5       r%\S;S$ j5       r&\S:S% j5       r'\S;S& j5       r(\S:S' j5       r)\S;S( j5       r*\S=S) j5       r+\S=S* j5       r,\          S>S+ j5       r-\S;S, j5       r.\S:S- j5       r/S?S. jr0S@S/ jr1\2SAS0 j5       r3S1r4g)BMetalOverrides   zXImplements Metal-specific overrides for ops. Base class emits Python-friendly overrides.Nc                    U[         R                  :X  a  [        R                  S5        SU  S3$ S[        U    SU  S3$ )Nz>float64 cast requested, probably from tensorify_python_scalarsr}   rA   static_cast<>()r4   doublelogwarningDTYPE_TO_METAL)rJ   dtype	src_dtypeuse_compute_typess       r8   to_dtypeMetalOverrides.to_dtype   sK     ELL KKP )1--nU34Bqc;;r:   c                6    S[         U    S[         U    SU  S3$ )Nzas_type<z>(static_cast<r   rn   r   )rJ   r   r   s      r8   to_dtype_bitcastMetalOverrides.to_dtype_bitcast   s/     ./0~i?X>YY[\][^^`aar:   c                    [        U 5      $ r   r9   )r7   r   s     r8   constantMetalOverrides.constant   s    c""r:   c                :   [         R                  R                  [         R                  R                  U 5      5      n[         R                  R                  R                  [         R                  R                  U[        U 5      S9n[        R                  " X15      $ )N)bounds)
r   kernelindex_to_strprepare_indexingcsegeneratecomputer   r   r   )rI   r   idx_strvars       r8   
index_exprMetalOverrides.index_expr   sj    ((''(A(A$(GHhhll##HHg.CD.I $ 
 ||C''r:   c                    [         R                  R                  X5       nU" 5       nS S S 5        WR                  R                  (       a  [        U5      n[        R                  " WXB5      $ ! , (       d  f       NK= fr   )r   r   
mask_loadsr   is_boolr%   r   where)maskbodyothernew_maskresults        r8   maskedMetalOverrides.masked   sV     XX  -VF . ==  KEyy611 .-s   A--
A;c                (    U  SU S[        U5       3$ )Nr   r   r   )r]   r^   r   s      r8   r   MetalOverrides.where   s    Cs#nQ/011r:   c                    SU  SU S3$ )Nzc10::metal::remainder(r@   rA   r   r]   r^   s     r8   	remainderMetalOverrides.remainder   s    's"QCq11r:   c                D    SU  SU SU  S3nSU  SU SU S3nSU SU S3$ )NrT   rU   rV   rA   zc10::metal::max(r@   r   r]   r^   r_   r`   s       r8   maximumMetalOverrides.maximum   K    ,QCq3qc;
,QCq3qc;
!*R
|1==r:   c                D    SU  SU SU  S3nSU  SU SU S3nSU SU S3$ )NrT   rU   rV   rA   zc10::metal::min(r@   r   r   s       r8   minimumMetalOverrides.minimum   r   r:   c                    U  SU 3$ )Nz || r   r   s     r8   
logical_orMetalOverrides.logical_or       D}r:   c                    U  SU 3$ )Nz && r   r   s     r8   logical_andMetalOverrides.logical_and   r   r:   c                    SU  S3$ )Nzmetal::isnan(rA   r   rJ   s    r8   isnanMetalOverrides.isnan       qc##r:   c                    SU  S3$ )Nzmetal::isinf(rA   r   r   s    r8   isinfMetalOverrides.isinf   r   r:   c                    SU  S3$ )Nzmetal::log(rA   r   r   s    r8   r   MetalOverrides.log      QCq!!r:   c                    SU  S3$ )Nzmetal::exp(rA   r   r   s    r8   expMetalOverrides.exp  r   r:   c                    SU  S3$ )Nrh   rA   r   r   s    r8   absMetalOverrides.abs  r   r:   c                    SU  S3$ )Nzmetal::signbit(rA   r   r   s    r8   signbitMetalOverrides.signbit  s     1%%r:   c                    SU  S3$ )Nzmetal::precise::sin(rA   r   r   s    r8   sinMetalOverrides.sin      %aS**r:   c                    SU  S3$ )Nzc10::metal::sinc(rA   r   r   s    r8   sincMetalOverrides.sinc  s    "1#Q''r:   c                    SU  S3$ )Nzmetal::precise::cos(rA   r   r   s    r8   cosMetalOverrides.cos  r
  r:   c                    SU  S3$ )Nzmetal::tan(rA   r   r   s    r8   tanMetalOverrides.tan  r   r:   c                    SU  S3$ )Nzmetal::asin(rA   r   r   s    r8   asinMetalOverrides.asin#      aS""r:   c                    SU  S3$ )Nzmetal::acos(rA   r   r   s    r8   acosMetalOverrides.acos'  r  r:   c                    SU  S3$ )Nzmetal::atan(rA   r   r   s    r8   atanMetalOverrides.atan+  r  r:   c                    SU  SU S3$ )Nz::metal::atan2(r@   rA   r   )rJ   r   s     r8   atan2MetalOverrides.atan2/  s     2aS**r:   c                    SU  S3$ )Nzmetal::sqrt(rA   r   r   s    r8   sqrtMetalOverrides.sqrt3  r  r:   c                    SU  SU  S3$ )NrT   z)>(-rA   r   r   s    r8   negMetalOverrides.neg7  s     'qcaS22r:   c                    SU  S3$ )Nzmetal::rsqrt(rA   r   r   s    r8   rsqrtMetalOverrides.rsqrt=  r   r:   c                    SU  S3$ )Nzmetal::tanh(rA   r   r   s    r8   tanhMetalOverrides.tanhA  r  r:   c                    SU  S3$ )Nzmetal::atanh(rA   r   r   s    r8   atanhMetalOverrides.atanhE  r   r:   c                    SU  SU S3$ )Nr?   r@   rA   r   r   s     r8   floordivMetalOverrides.floordivI  s     +1#Rs!44r:   c                    SU  S3$ )NrB   rA   r   r   s    r8   floorMetalOverrides.floorN  r   r:   c                    SU  S3$ )Nzmetal::sign(rA   r   r   s    r8   signMetalOverrides.signR  r  r:   c                D    SU  SU SU  S3nSU  SU SU S3nSU SU S3$ )NrT   rU   rV   rA   zmetal::fmod(r@   r   r   s       r8   fmodMetalOverrides.fmodV  sK    ,QCq3qc;
,QCq3qc;
j\J<q99r:   c                    SU  S3$ )Nmetal::trunc(rA   r   r   s    r8   truncMetalOverrides.trunc\  r   r:   c                    U  SU 3nU R                   b  U R                   R                  (       d(  UR                   b!  UR                   R                  (       a  SU S3$ U$ )Nz / r=  rA   )r   is_floating_point)r]   r^   quots      r8   truncdivMetalOverrides.truncdiv`  sQ    Cs|GGAGG$=$=GGAGG$=$="4&**r:   c                    SU  S3$ )Nzmetal::ceil(rA   r   r   s    r8   ceilMetalOverrides.ceili  r  r:   c                f    [         R                  R                  R                  S5        SU  SU S3$ )Nrandomzc10::metal::rand(r@   rA   r   r   headersaddseedoffsets     r8   randMetalOverrides.randm  s/    	X&"4&6(!44r:   c                f    [         R                  R                  R                  S5        SU  SU S3$ )NrI  zc10::metal::randn(r@   rA   rJ  rM  s     r8   randnMetalOverrides.randnr  s/    	X&#D6F8155r:   c           	     r    [         R                  R                  R                  S5        SU  SU SU SU S3	$ )NrI  zc10::metal::randint64(r@   rA   rJ  )rN  rO  lowhighs       r8   	randint64MetalOverrides.randint64w  s=     	
X&'vRxr#baHHr:   c                    SU  S3$ )Nzmetal::rint(rA   r   r   s    r8   roundMetalOverrides.round~  r  r:   c                D    SU  SU SU  S3nSU  SU SU S3nSU SU S3$ )NrT   rU   rV   rA   zmetal::pow(r@   r   )r]   r^   cast_acast_bs       r8   powMetalOverrides.pow  sK    (1QCs1#Q7(1QCs1#Q7VHBvha00r:   c                f    [         R                  R                  R                  S5        SU SU S3$ )Nspecial_mathc10::metal::rO   rA   rJ  )rH   r]   names      r8   _special_unaryMetalOverrides._special_unary  s/    	^,dV1QCq))r:   c                l    [         R                  R                  R                  S5        SU SU SU S3$ )Nrc  rd  rO   r@   rA   rJ  )rH   r]   r^   re  s       r8   _special_binaryMetalOverrides._special_binary  s5    	^,dV1QCr!A..r:   c           
        S H,  n[        X[        R                  " U R                  US95        M.     [        R                  " U R                  SS9U l        S H0  n[        U U[        R                  " U R                  US-   S95        M2     S H,  n[        X[        R                  " U R
                  US95        M.     S H0  n[        U U[        R                  " U R
                  US-   S95        M2     g )N)erferfinvi0i0ei1i1edigammaspherical_bessel_j0)re  	log_gamma)
	bessel_j0	bessel_j1	bessel_y0	bessel_y1modified_bessel_i0modified_bessel_i1modified_bessel_k0modified_bessel_k1scaled_modified_bessel_k0scaled_modified_bessel_k1_forward)	polygammaigammaigammaczeta)
chebyshev_polynomial_tchebyshev_polynomial_uchebyshev_polynomial_vchebyshev_polynomial_whermite_polynomial_hhermite_polynomial_heshifted_chebyshev_polynomial_tshifted_chebyshev_polynomial_ushifted_chebyshev_polynomial_vshifted_chebyshev_polynomial_w)setattr	functoolspartialmethodrf  lgammari  )clsre  s     r8   _initialize_special_ops&MetalOverrides._initialize_special_ops  s    	
D Cy66s7I7IPTUV	
 ,,S-?-?kR

D ''(:(:
ARS
&
D Cy66s7J7JQUVW

D ''(;(;$BST
r:   r   NT)
rJ   r   r   torch.dtyper   zOptional[torch.dtype]r   r%   r   r6   )rJ   r   r   r  r   r  r   r6   )r7   zUnion[bool, float, int]r   r  r   r6   )rI   r   r   r  r   r6   )r   r   r   r   r   r   r   r6   )r]   r$   r^   r$   r   r$   r   r6   )r]   r$   r^   r$   r   r6   )r]   r   r^   r   r   r6   )rJ   r   r   r6   )rJ   r   r   r   r   r6   )rN  r   rO  r   r   r6   )
rN  r   rO  r   rV  r   rW  r   r   r6   )r]   r   re  r6   r   r6   )r]   r   r^   r   re  r6   r   r6   r   None)5r   r   r   r   r   staticmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r  r  r"  r%  r(  r+  r.  r1  r4  r7  r:  r>  rC  rF  rP  rS  rX  r[  r`  rf  ri  classmethodr  r   r   r:   r8   r   r      s   b ,0"&	<<< )<  	<
 
< < bb*b7Bb	b b
 # # ( ( 2 2 2 2 2 2 > >
 > >
     $ $ $ $ " " " " " " & & + + ( ( + + " " # # # # # # + + # # 3 3
 $ $ # # $ $ 5 5 $ $ # # : :
 $ $   # # 5 5 6 6 II#.I5@IHSI	I I # # 1 1
*/ = =r:   r   mpsc                  
  ^  \ rS rSr% Sr\rSrSrSr	Sr
\" 5       R                  r\" 5       R                  r\" 5       R                  r\r\" S/5      rS\S	'   / rS
\S'         SU 4S jjrSS jrSS jr S         S S jjrS!S jrSSS\R:                  " 5       4           S"S jjr          S#S jr          S#S jr S$S jr!S%S jr"SS&S jjr# S'       S(S jjr$          S)S jr%Sr&U =r'$ )*MetalKerneli  z;Implement Metal codegen based on the SIMDKernel abstraction;auto i       utilszOrderedSet[str]rK  zlist[IterationRangesEntry]multistage_reduction_entryc                \   > [         TU ]  " U40 UD6  [        R                  " 5       U l        g r   )super__init__	itertoolscountacc_var_ids)rH   tilingkwargs	__class__s      r8   r  MetalKernel.__init__  s&    
 	*6*$??,r:   c                    [         U   $ r   r   )rH   r   s     r8   dtype_to_strMetalKernel.dtype_to_str  s    e$$r:   c                   U R                   R                  U5      nU R                  U5      n[        R                  R                  U5      nU SU R                  U5       S3nU[        R                  [        R                  4;   a  SU S3n[        R                  nU R                  R                  U R                  XTS9$ )z"Codegen a load from an InputBuffer[]r}   rA   r   )rE   inputr   r   graph	get_dtyper   r4   float16bfloat16float32r   r   loads)rH   re  indexr   r   lines         r8   loadMetalKernel.load  s    iiood#%%e,!!$'a))%013U]]ENN33 )a0DMMExx  T ??r:   Nc                r   U R                   R                  U5      nU R                  U5      nU R                  [        R
                  R                  U5      5      nSU SU S3nUc  U SU R                  U5       SU S3nO\US:X  aH  U R                  R                  S5        S	U S
3n	SU	 SU S3n
U	 SU
 SU R                  U5       SU S3nO[        SU 35      eU R                  (       a%  U R                  R                  [        X5      5        g U R                  R                  [        X5      5        g )Nr   r   rA   r  ] = r  
atomic_addatomiczc10::metal::AtomicType<>zreinterpret_cast<device z
::type *>(z::atomic_add(r@   );zUnimplemented store mode )rE   outputr   r  r   r  r  r   rK  rL  rY   inside_reductionr   	writeliner   stores)rH   re  r  valuemoder   	dtype_strcast_valr  atomic_typecast_vars              r8   storeMetalKernel.store  s8    iit$%%e,%%agg&7&7&=>	!)BugQ7<U!D--e45T(1ED\!LLX&3I;a@K1+jQOH!]-zD<M<Me<T;UUWX`WaacdD!:4&ABB  LL""<#;<KK!!,t":;r:   c                   U R                   R                  U5      nU R                  U5      nU R                  [        R
                  R                  U5      5      n[        S U R                   5       5      nU SU R                  U5       SU SU S3nSUR                   SU 3nU R                  R                  [        X5      5        g )Nc              3  J   #    U  H  oR                   (       d  M  Uv   M     g 7fr   is_reductionr   ts     r8   r   .MetalKernel.store_reduction.<locals>.<genexpr>  s     K(81NNQQ(8   #	#r  z] = static_cast<r   r  if (z == 0) )rE   r  r   r  r   r  r  nextrange_treesr   re  r  r  r   )rH   re  r  r  r   r  reduction_dimr  s           r8   store_reductionMetalKernel.store_reduction  s    iit$%%e,%%agg&7&7&=>	K(8(8KKa))%011A)BugUWXm(()7l467r:   Tc                   [        U[        R                  5      (       a  U R                  U5      nS[	        U R
                  5       3n[        R                  R                  XeU5      nU(       a  SOSnX SU 3-  nU(       a  USU R                  U5       S3-  nUb  U(       a   S5       eUSU 3-  nU R                  R                  XR                  -   5        U$ )	Ntmp_acc_zthreadgroup   r  r  z+Thread group var can not have default value = )r3   r4   r   r  r  r  r   r   create_cse_varsexprindexing_coder  suffix)	rH   r   
elem_countdefault_valueis_threadgroupr   var_namer   var_defs	            r8   _new_idxvarMetalKernel._new_idxvar  s     eU[[))%%e,Ed4#3#3456hh%%h>$2.WAhZ((4::j12!44G$%T'TT%]O,,G$$W{{%:;
r:   c                    X#U4nXPR                   R                  ;   a  U R                   R                  U   $ U R                  XX45      nX`R                   R                  U'   U$ )z)Caching wrapper around _reduction_nocache)r   reduction_cache_reduction_nocache)rH   r   r   reduction_typer  	cache_keyr   s          r8   	reductionMetalKernel.reduction3  s\     6	00088++I66((>Q.4  +r:   c                   U R                   (       d   eU R                  (       a   eSAS jnSnSnU R                   H  nUR                  (       d  M  U(       a  US-  nXhR                   SU 3-  n[        UR                  [        R                  5      (       a  XxR                  -  nMn  U[        R                  " UR                   S3SSS9-  nM     [        R                  " XpR                  5      nU R                  U5      n	[        U[        R                  5      (       a  [        XpR                  5      OU R                  n
US	:X  a  U R!                  U5      nU R"                  R%                  U S
35        U R"                  R%                  S5        U R&                  R)                  SU SU S35        U R*                  R%                  S5        U$ U R,                  R/                  S5        US;   a  [0        U   nU R!                  X5      nU R2                  (       d  UnOAUS:X  a  SOSu  nnU R!                  XSS9nU R&                  R)                  U SU SU S35        U R4                  R7                  U R*                  SU SU SU SU SU	 S3[0        U   S9$ US;   a  U R!                  X*5      n[8        U   nSU S U S3nU R2                  (       d  UnO[UR;                  S!5      (       a  S"OS!nS#U S$U S%3nU R!                  UUSS9nU R&                  R)                  U S&U SU SU S'35        U R4                  R7                  U R*                  SU SU SU SU SU	 S3[0        U   S9$ US(;   Gab  U R!                  X*5      nU R!                  X5      n[8        U   nSU S U S3nU R2                  (       d  UnS[8        U    S U S3nOUR;                  S!5      (       a  S"OS!nS#U S$U S%3nU R!                  UUSS9nU R!                  US)SS9n[=        S* U R>                  RA                  5        5       5      nUS+:X  a  S,OS-nURB                  (       a  S.U S/3OSnU R&                  R)                  SU SU SU U SU S0U S1U S0UR                   S235        U R4                  R7                  U R*                  SU SU SU SU SU SU SU	 S3US9$ US3:X  Ga)  U R2                  (       dz  U R!                  X'5      nU R&                  R)                  U S4U S5U S35        U R4                  R7                  U R&                  SU SU SU	 S3[D        RF                  S9nU" U5      $ U R!                  S6U5      nU S4U S73nU R"                  R)                  U S835        U R&                  R%                  U S9U S:U S;35        U R4                  R7                  U R*                  S<U SU S3[D        RF                  S9nU" U5      $ US=:X  Ga@  [        U[H        5      (       d   S>5       eU R!                  S6U5      nU S4U S73nS?US)    SUS    SUS@    S3nU R"                  R)                  U S835        U R2                  (       aC  U R"                  R)                  U S835        U R&                  R%                  U S9U SU S'35        O!U R&                  R%                  U S0U S35        U R4                  R7                  U R2                  (       a  U R*                  OU R&                  SU SU SU	 S3[D        RF                  S9nU" U5      $ [K        U5      e)Bz]Codegen a reduction operation.
Only sum and prod operations are somewhat reasonable optimizedc           
         [         R                  " S Vs/ s H)  n[        U  SU 3U R                  U R                  5      PM+     sn5      $ s  snf )Nxyzrr   )r   _unwrapr   r   r   )res3r  s     r8   _unwrap_helper6MetalKernel._reduction_nocache.<locals>._unwrap_helperO  sE    %%NSTevQqc]DKKDeT Ts   0Ar  r    + rt   numelTintegerpositiveanyz	 = false;z7threadgroup_barrier(metal::mem_flags::mem_threadgroup);z
                if (z) {
                    z' = true;
                }
            reduction_utils)prodsumr  )r   rU   )r   *F)r  r  r  z= r  zc10::metal::threadgroup_rO   r@   rA   r  )maxminr   r   r  lowestz::metal::numeric_limits<z>::z()z = ::c10::metal::r  )argminargmaxr   c              3  J   #    U  H  oR                   (       d  M  Uv   M     g 7fr   r  r  s     r8   r   1MetalKernel._reduction_nocache.<locals>.<genexpr>  s      =!AA=r  r  r  <z || ::metal::isnan(z) r  z;
                    z$;
                }
                welford_reducer  r  float3r  z = 0.0;z! = ::c10::metal::welford_combine(z	, float3(z, 0.0, 1.0));z(c10::metal::threadgroup_welford_combine(welford_combinez&Input to welford combine must be tuplezfloat3(r   )r  r   r   ztuple[CSEVariable, ...])&r  
_load_maskr  r  re  r3   r  sympyIntegerSymbolprefixMinmax_threadgroup_sizer  r   simd_group_sizer  r  r  r   splicer  rK  rL  r   r  r   r   r   endswithr  range_tree_nodesvaluesrA  r4   r  tupleNotImplementedError)rH   r   r   r  r  r  reduction_idxacc_buf_sizerdacc_buf_size_strshmem_buf_sizeacc	acc_dtypeacc_bufr7   default_valreduction_opsrc_metal_type
cast_valuelim_fn	limit_valdata_acc_bufidx_acc_bufidx_validx_varcmp_op
nan_suffixwf_resacc_thread_var	inp_values                                 r8   r  MetalKernel._reduction_nocacheC  sT    $$$$??""	 ""B??&yL>::M"((EMM22(yyk'!  # yy/H/HI::l3 ,66 L"6"67%% 	 U"""5)C((C5	):;((I LLG E  KK!!I J*+_,29=I&&yAG22 !/% 7HX *\ && '  ##se1\N"UG1$EF88$$*>*:!G9Bse2m_\^_o^ppqr07 %  
 ^+&&yAG+I6N''7r%BJ22 %3%<%<U%C%C6~6Fc&QST	&&Yu '  ##e,^,<AcU"ZLPRS 88$$*>*:!G9Bse2m_\^_o^ppqr07 %  
 11++IFL**5AK+I6N''7r%BJ22 ()>(?r-PQR%3%<%<U%C%C6~6Fc&QST	&&Yu '  **5RW*X #44;;=  !/( : !22 *%3 
 ## )G1VHAcU:, 7EUG $IS /%  88$$*>*:!L>K=XZ%r'"]O26F5GqJ	 %   --22**9C##wiqtE7!$LM**LL.~.>ayK[J\\]^-- + 
 &f--&&x>G 'y-:N%%(8&@ALL""!""CNCSS\]b\ccpq XX&&:7)2l^STUmm ' F
 "&))..eU++U-UU+&&x>G 'y-:N!%(2eAhZr%(1EI%%(8&@A..""))^,<G*DE&&%&&GGWWYZcYddfg &&.)9YKq'IJXX&&#>>DLL*>*:!G9BGWFXXYZmm ' F
 "&))!.11r:   c                	  ^ U R                  TR                  5      nU R                  U5      nTR                  (       aW  [	        TR
                  R                  [        R                  5      (       a]  TR
                  R                  U R                  ::  a9  U R                  R                  U R                   STR                   SU S35        g [	        TR
                  R                  [        R                  5      (       a  TR
                  R                  O,[        R                  " TR
                  R                   S3SSS9n[!        U4S jU R"                   5       5      nTR
                  R                   S3nU(       Gdf  U R"                  R%                  T5        U['        U R                  S	-
  5      -   ['        U R                  5      -  nU R                  U5      nTR
                  R                  n	U R(                  R                  S
TR
                  R                   STR
                  R                   SU STR
                  R                   S3	5        U R(                  R+                  5          [	        U[        R                  5      (       aS  U R(                  R                  U R                   SU SU R                   STR
                  R                   SU	 S3
5        OHU R(                  R                  U R                   SU SU SU	 STR
                  R                   S3
5        [	        U[        R                  5      (       d  XpR                  -  U:w  a"  U R(                  R                  SU SU S35        UR-                  TR
                  R                  U5      n
U R(                  R                  U R                   STR                   SU
 S35        S S S 5        g U R(                  R+                  5          UR-                  TR
                  R                  U5      n
U R(                  R                  U R                   STR                   SU
 S35        S S S 5        g ! , (       d  f       g = f! , (       d  f       g = f)Nr  r  r  r  Tr  c              3  R   >#    U  H  oR                   TR                   L v   M     g 7fr   )root)r   eentrys     r8   r   =MetalKernel.codegen_iteration_ranges_entry.<locals>.<genexpr>  s       %
*IQFFejj *Is   $'_linear_idxr   z	for(auto z
_cnt = 0; z_cnt < z; ++z_cnt) {rt   z_cnt + r  z_cnt;r  z >= z) break;)rename_indexingrI   r  r  r3   r:  r  r  r  r  r  r  index_dtypere  r  r  r  r  appendr+   r   indentreplace)rH   r<  r   	index_stracc_sizeroot_already_processedlinear_idx_name	loop_sizeloop_size_str	root_namesub_index_strs    `         r8   codegen_iteration_ranges_entry*MetalKernel.codegen_iteration_ranges_entry  s   ))%**5
JJz*	!!uzz''77

  D$=$==((##$Aejj\YKqA  %****EMM:: JJ!2!2 3594RVW 	 "% %
*.*I*I%
 "
 #ZZ../{;%++2259 "E$*C*Ca*G$HHU))N I !JJy1M

IIIEJJ--.j9J9J8K7S`Raaefkfpfpfwfwex  yA  B !!#h55II''++,Ao->c445S9J9J8K7S\R]]^`
 II''++,Ao->c-PST]S^^abgblblbsbsattyz x66 #<#<<HII''$.?tH:X(VW !* 1 1%**//? S		##''(%**SqI) $#2 !!# ) 1 1%**//? S		##''(%**SqI $#3 $#2 $#s   6E,Q-AQ>-
Q;>
Rc                6   U R                   (       Ga8  U R                  R                  5          U R                  R                  U R                  5        U R                  R                  U R
                  5        SSS5        U R                  R                  S[        U R                   5      -  5        U R                  R                  [        S U R                  R                  R                  5        5       5      5        U R                   (       a;  U R                   R                  5       R                  5         U R                   (       a  M;  OJU R                  R                  U R                  5        U R                  R                  U R
                  5        U R                  R                  U R                  5        U R                  R!                  5         U R
                  R!                  5         U R                  R!                  5         g! , (       d  f       GN= f)z
Concat output code from index_code, loads, compute, stores,
suffix into self.body.

For pointwise kernels, this is called just once at the end.

For reduction kernels, this generates a loop over the reduction
axis.
N}c              3  j   #    U  H)  n[        U[        5      (       a  UOU4  H  nUv   M	     M+     g 7fr   )r3   r  )r   itemvs      r8   r   +MetalKernel.codegen_body.<locals>.<genexpr>V  s7       A&0u&=&=dD7J J  As   13)r  r   rB  r  r  r   r  rX   r   
invalidater
   r  r  popcache_clearr  clear)rH   s    r8   codegen_bodyMetalKernel.codegen_bodyB  sc    ***!!#		  ,		  . $ IIc$*I*I&J JK
 HH  $ 8 8 ? ? A  11//335AAC 111 IITZZ(IIT\\*		%

1 $#s   AH		
Hc                   U R                  5         [        5       n[        R                  R                  (       a  UR                  S5        OUR                  S5        U R                  5       nUR                  5          [        R                  R                  (       d)  U R                   H  nUR                  SU S35        M     OuU R                   Vs/ s H	  nSU S3PM     nn[        U[        [        5      R                  R                  R                  S-  /[        5       5      nUR                  U5        U R                  (       a|  [        R                   " S U R"                   5       5      n[%        U[&        R(                  5      (       a  [+        XpR,                  5      OU R,                  nUR                  SU S35        UR                  S	5        UR                  5          U R.                  R0                  R3                  5        H\  u  pXR4                  ;   a  M  U R7                  [        R                  R9                  U	5      5      nUR                  S
U SU
 S35        M^     U R.                  R:                  R3                  5        H  u  p[        R                  R9                  U	5      nU[<        R>                  :X  aD  [        R                  RA                  U	5      nUb  URC                  5       / :w  a  [E        S5      eSnOU R7                  U5      nUR                  SU SU
 S35        M     U R.                  RF                  RI                  5        H  n
UR                  SU
 S35        M     U HM  n[%        URJ                  [&        R(                  5      (       a  M.  UR                  SURL                   S35        MO     [O        U5      S:  d   S5       e[O        U5      S:  a  S[O        U5       3OSn[O        U5      S:X  a  US   RP                  OSnU R                  (       a  SOSnUR                  U SU SU 35        U R                  (       a  UR                  U S35        SSS5        UR                  S5        UR                  5          [O        U5      S:  aC  [S        U5       H4  u  nnUR                  SURP                   S[U        S U-   5       S!35        M6     URW                  U RX                  5        URW                  U RZ                  5        SSS5        UR                  S"5        SSS5        [        R                  R                  (       a!  UR                  S#5        UR]                  5       $ UR                  S$5        UR]                  5       $ s  snf ! , (       d  f       GN@= f! , (       d  f       N= f! , (       d  f       N= f)%z3Called at the end to generate a final kernel stringz(R"MTL(zcompile_mps_shader('''z#include <c10/metal/z.h>includec              3  ^   #    U  H#  oR                   (       d  M  UR                  v   M%     g 7fr   )r  r  r  s     r8   r   -MetalKernel.codegen_kernel.<locals>.<genexpr>  s      1%5GAGG%5s   --z$[[max_total_threads_per_threadgroup(z)]]zkernel void generated_kernel(zdevice z* ,Nzfloat64 is not supported by MPSr+   z	constant zconstant long& znumel,   z%Up to 3 index variables are supportedr   uintr   
thread_posr  r  z [[thread_position_in_grid]]z- group_pos [[thread_position_in_threadgroup]]z) {r  z = thread_pos.x   r  rO  z)MTL");z'''))/rX  r   r   r  cpp_wrapperr  active_range_treesrB  rK  r	   r   __file__parentr
   r  mathr  r  r3   r  r  r  r  rE   output_buffersitemsremoved_buffersr  r  input_buffersr4   float64try_get_bufferget_sizerY   sizevarsr  r  r  rX   re  	enumeratechrr  r  r   getvalue)rH   re  codeidx_varsheaderrK  header_contentstotal_reduction_sizethreadgroup_sizeouterinnerr  r   	outer_bufr1  thread_pos_dtypethread_pos_var_namethread_pos_suffixidxr   s                       r8   codegen_kernelMetalKernel.codegen_kernelg  s   77NN9%NN34**,[[]77&&"llFNN%9&#EF + FJ\\EQ6*6(#6\   #1(^**11889DEL#
 /$$'+yy 1%)%5%51 ($ ""6FF ,.G.GH22 !
 :;K:LCP NN:;$(II$<$<$B$B$DLE 4 44  $ 1 1!''2C2CE2J KINNWYKr%#BC	 %E
 %)II$;$;$A$A$CLEGG--e4E-$%GG$:$:5$A	$,	0B0B0D0J"./P"QQ$+	$($5$5e$<	NNYykE7!#DE %D "YY//668ENN_UG1#=> 9  (G!'--??8H'OP	  ( 8}q(Q*QQ(.1(ma.?d3x=/*V ! ),H(:HQK$$ $ ,0+@+@Cb!'(*=)>>Z[lZmn ((NN+,,YZO T NN5!x=1$$-h$7S#CHH:^Cc	N;K1M %8 D../DII&  NN3c f 77NN9% }} NN6"}}g2 V S ]sR   =AWV*$DW)JV/6)WB	W(W*W/
V>	9W
W	W
W c           	        [         R                  R                  nU R                  R                   H  nUR                  U5        M     U R                  R                  5       u  pgph[        Xx5       V	V
s0 s H  u  p[        U	5      U
_M     nn	n
/ U R                  R                  R                  5       QU R                  R                  R                  5       QnU Vs/ s H  oU R                  ;  d  M  UPM     nnXR                  R                   Vs/ s H  n[        U5      PM     sn-  nU Vs/ s H  oU   PM	     nnU R                   H  n[        UR                  [         R"                  [$        45      (       a  M4  [        UR                  [         R&                  5      (       a  UR                  nO3[         R                  R                  R)                  X5      R*                  nUR,                  (       a  U R.                  (       d  M  UR1                  [        U5      5        UR1                  [$        5        M     [         R                  R2                  (       a  U R4                  OU R6                  nSS jn[9        U R;                  5       5      S:  a  U R;                  5        Vs/ s HQ  nU" UR,                  (       a+  [         R<                  " UR                  U R>                  5      OUR                  5      PMS     nnUR1                  U" US5      5        UR1                  [@        5        O*[         R                  R2                  (       a  [C        S5      eU R.                  (       a  U R;                  5        Vs/ s HG  nUR,                  (       a1  U" [         R<                  " UR                  U R>                  5      5      OSPMI     nnUR1                  U" US5      5        UR1                  [@        5        O6[         R                  R2                  (       a  US/-  nUR1                  S5        URE                  UU[F        RH                  " S5      S	US
9  gs  sn
n	f s  snf s  snf s  snf s  snf s  snf )z 
Codegens a call to this kernel
threadsc                    [         R                  R                  (       a+  U  Vs/ s H	  nSU S3PM     n nSSR                  U 5       S3$ U SSR                  U 5       S3$ s  snf )Nzstatic_cast<uint64_t>(rA   {r@   rO  z=[r  )r   r  rc  join)r  kwargr  s      r8   format_threads/MetalKernel.call_kernel.<locals>.format_threads  si    ww""BIJ'Q3A3a8'JDIIg./r22499W#5"6a88 Ks   A"r   zWe should always have threads?1
group_sizeNr  F)devicetriton	arg_types)r  z	list[str]r  r6   r   r6   )%r   r  wrapper_coderE   ro  ensure_size_computedpython_argdefszipr6   rh  keysrk  rj  r  r3   r  r  r  r(   r  generate_numel_exprrz  r  r  rA  rc  cexprpexprrX   rd  r  r  listrY   generate_kernel_callr4   r  )rH   re  nodedeallocate_wswrapperrR  _	call_argsr  call_argarg_typearg_name_to_typerE   r   treerI   expr_printerr  r  s                      r8   call_kernelMetalKernel.call_kernel  sw    ''&&##A((+ $ &*YY%=%=%?"a>A)>W
>W(:CM8#>W 	 
 S))..0R4993J3J3O3O3QR#Gt$2F2F'FtG!3!34!3AQ!3446:;dsc*d	; $$D$**u}}c&:;;DJJ55zzww++??KQQ$$(=(=(=CI&  % % &'WW%8%8tzzdjj	9 t&&()A- 002 3A ~~ IIaggt'@'@A
 3   KKw	:;T"ww"""#CDD  
 002	 3A >> UYYqww0I0IJK 3	   KKw=>T"ww""   &$$<<& 	% 	
E

 H4;8 s,   4QQ$4Q$Q)2Q.-AQ3AQ8c                (   U(       d  U(       d  g U R                  U5      nU(       a  U S3OSnU(       a  U SU R                  U5       3OSnU(       a  U(       a
  SU SU S3nOSU U S3nU R                  R                  U R                  US	S
9  g )Nz < 0r  z > zif ((z) && (z	)) returnr  z) returnF)
assignment)r   r   r   r   )	rH   rI   sizelowerupperexpr_str
lower_expr
upper_exprr  s	            r8   check_boundsMetalKernel.check_bounds"  s      $$T**/z&R
BGzT%6%6t%<$=>R
U:,fZL	BD*j\:D$,,?r:   )r  )r  zdict[str, sympy.Expr]r  r   r   r  )r   r  r   r6   )re  r6   r  r   r   r   r   )
re  r6   r  r   r  r   r  r!   r   r  )re  r6   r  r   r  r   r   r  )r   zUnion[str | torch.dtype]r  zOptional[int]r  zOptional[Any]r  r%   r   zValueRanges[Any]r   r   )
r   r  r   r  r  r    r  +Union[CSEVariable, tuple[CSEVariable, ...]]r   r  )r<  r   r   r  r  )re  zOptional[str]r   r6   r  )re  r6   r  r   r  r%   r   r  )
rI   r   r  r   r  r%   r  r%   r   r  )(r   r   r   r   r   r   	overridesr  newvar_prefixr  r  r   rF   r  r   r  r<   r  kexprr
   rK  __annotations__r  r  r  r  r  r  r   unknownr  r  r  rL  rX  r  r  r  r   __classcell__r  s   @r8   r  r    s   EIFMOO##EL  E&&EE)7)4G_4=? :?-%- - 
	-%@ SW<< *<3><FO<	<*	8 %)'+##.#6#6#8' " %	
  ! 
,  &	
 ; 
5 s2s2 s2 &	s2
 ;s2 
5s2jHT#JcL BFT
T
"T
:>T
	T
l@@&0@9=@FJ@	@ @r:   r  c                  J   ^  \ rS rSr\rSU 4S jjr        SS jrSrU =r	$ )MetalSchedulingi4  c                   > [         TU ]  U5        [        R                  R                  nUb<  [        R                  R
                  (       d  UR                  R                  S5        g g g )NzDfrom torch._inductor.runtime.runtime_utils import compile_mps_shader)r  r  r   r  r  rc  ru  r  )rH   	schedulerr  r  s      r8   r  MetalScheduling.__init__7  sQ    #''&&77&&%%Z ' r:   c                \   [         R                  R                  nXR                  ;   a  UR                  U   nU$ SUR	                  5        3nU nXTR                  U'   [         R                  R
                  (       a	  SU S3U-   n[        X$5      u  pxU SU 3n	UR                  XaU	SS9  U$ )Nmps_lib_zconst char* z
_source = 
F)gpu)r   r  r  src_to_kernelnext_kernel_suffixrc  r   define_kernel)
rH   src_codenode_scheduler   r  kernel_namemps_lib_nameoriginsdetailed_originsmetadata_comments
             r8   r  MetalScheduling.define_kernel@  s     ''&&,,,!//9K"  &g&@&@&B%CDL)NK.9!!(+ww""),zBXM(;M(S%G")"-=,>?!!,:JPU!Vr:   r   )r  zOptional[Scheduler]r   r  )r  r6   r  zlist[SchedulerNode]r   r  r   r6   )
r   r   r   r   r  kernel_typer  r  r   r  r  s   @r8   r  r  4  s7    K,?IT	 r:   r  )r7   z)Union[float, int, bool, str, CSEVariable]r   r6   )I
__future__r   r  r  loggingrg  pathlibr   typingr   r   r   r  sympy.printing.precedencer   r4   torch.utils._cpp_embed_headersr	   torch.utils._ordered_setr
   torch.utils._sympy.printersr   r   ExprPrinter_torch.utils._sympy.value_rangesr   r  r   r   r   virtualizedr   r   r   commonr   r   r   r   r   r   simdr   r   r   r   ops_handlerr    r!   r  r"   r#   r$   	getLoggerr   r   r%   int8int16int32int64uint8r+   r,   r  r   r9   r<   r   _initialize_pointwise_overridesr  r  r  r   r:   r8   <module>r     s*   #      / /  0  9 / O 7 G G , ,  C B 64! 
JJ	JJ	KK	KK	KK	KK	KK	JJ	NNH
h"| h"V][ ]@	  . .u 5  & & (\	@* \	@~"n "r:   