
    ȅiy=                      S SK Jr  S SKrS SKrS SKrS SKJrJrJrJ	r	  S SK
r
S SKrS SKJr  S SKJr  S SKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJ r J!r!J"r"J#r#J$r$  SSK%J&r&J'r'   " S S\$5      r(\(" 5       RR                  r*\(       a  S SK+J,r,J-r-  SSKJ.r.  SSK/J0r0  SSK1J2r2  Sr3Sr4S%S jr5\Rl                  Ro                  \8S5      r9 " S S5      r: " S S\;5      r< " S S \#5      r= " S! S"\&5      r> " S# S$\'5      r?g)&    )annotationsN)AnyOptionalTYPE_CHECKINGUnion)
OrderedSet)has_tpu_pallas)ModularIndexing   )config)ComputedBuffertorch_dtype_to_jax)get_fused_kernel_nameget_kernel_metadata)V   )BlockPatternMatcher)BackendFeatureCSEVariableIndentedBufferOpOverridesPythonPrinter)
SIMDKernelSIMDSchedulingc                  6    \ rS rSrSrSS jrSS jrSS jrSrg)	PallasPrinter   zG
Custom sympy printer for Pallas that handles JAX-specific constructs.
c                    U R                  UR                  S   5      nU R                  UR                  S   5      nU R                  UR                  S   5      nSU SU SU S3$ )z!Convert sympy Where to jnp.where.r   r   r   
jnp.where(, ))doprintargs)selfexprcpqs        X/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/codegen/pallas.py_print_WherePallasPrinter._print_Where$   s_    LL1&LL1&LL1&A3b2aS**    c                    UR                    Vs/ s H  o R                  U5      PM     nnUS   nUSS  H  nSU SU S3nM     U$ s  snf )z7Convert sympy Min to jnp.minimum for JAX compatibility.r   r   Njnp.minimum(r!   r"   r$   r#   r%   r&   argr$   results        r*   
_print_MinPallasPrinter._print_Min+   X    -1YY7YcS!Y7a8C#F82cU!4F 	 8   Ac                    UR                    Vs/ s H  o R                  U5      PM     nnUS   nUSS  H  nSU SU S3nM     U$ s  snf )z7Convert sympy Max to jnp.maximum for JAX compatibility.r   r   Njnp.maximum(r!   r"   r0   r1   s        r*   
_print_MaxPallasPrinter._print_Max3   r6   r7    N)r&   
sympy.Exprreturnstr)	__name__
__module____qualname____firstlineno____doc__r+   r4   r:   __static_attributes__r<   r-   r*   r   r      s    +r-   r   )CallableSequence)IRNode)ReductionType)BaseSchedulerNodemain   c                6    U [         -   S-
  [         -  [         -  $ )z@Align size to WARPGROUP_SIZE (128) for Mosaic GPU compatibility.r   )WARPGROUP_SIZE)sizes    r*   _align_to_warpgrouprP   Q   s    N"Q&>9^KKr-   kernel_codec                  <    \ rS rSrSr S   S	S jjrSS.S jrSrg)
PallasKernelWrapperZ   z6Wrapper to provide .run() interface for Pallas kernelsNc                H    Xl         X l        [        R                  SU5        g )NzPallas kernel path: %s)	kernel_fnkernel_pathkernel_code_loginfo)r%   rV   rW   s      r*   __init__PallasKernelWrapper.__init__]   s      #&5{Cr-   )streamc               *    U R                   " USU0UD6$ )z
Execute the Pallas kernel.

Args:
    *args: Arguments to pass to the kernel function
    stream: CUDA stream to pass to the kernel function
    **kwargs: Additional keyword arguments for the kernel

Returns:
    Result of the kernel execution
r\   )rV   )r%   r\   r$   kwargss       r*   runPallasKernelWrapper.rund   s     ~~t=F=f==r-   )rV   rW   N)rV   zCallable[..., Any]rW   Optional[str])r@   rA   rB   rC   rD   rZ   r_   rE   r<   r-   r*   rS   rS   Z   s3    @ KOD+D:GD !% > >r-   rS   c                      \ rS rSrSrSrg)Unsupporteds   zJException raised when an operation is not supported by the Pallas backend.r<   N)r@   rA   rB   rC   rD   rE   r<   r-   r*   rd   rd   s   s    Tr-   rd   c                  V	   \ rS rSrSr\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r	\SzS j5       r
\SzS j5       r\SzS	 j5       r\SzS
 j5       r\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r\SzS j5       r\S{S j5       r \S{S j5       r!\S{S j5       r"\S|S  j5       r#\S}S! j5       r$\  S~         SS# jj5       r%\SS$ j5       r&\SS% j5       r'\SS& j5       r(\SzS' j5       r)\SzS( j5       r*\SzS) j5       r+\SzS* j5       r,\SzS+ j5       r-\SzS, j5       r.\S{S- j5       r/\S{S. j5       r0\S{S/ j5       r1\S{S0 j5       r2\S{S1 j5       r3\SzS2 j5       r4\SzS3 j5       r5\SzS4 j5       r6\S{S5 j5       r7\S{S6 j5       r8\S{S7 j5       r9\SzS8 j5       r:\S{S9 j5       r;\S{S: j5       r<\S{S; j5       r=\S{S< j5       r>\S{S= j5       r?\S{S> j5       r@\S{S? j5       rA\SS@ j5       rB\SSA j5       rC\SzSB j5       rD\SzSC j5       rE\SzSD j5       rF\SzSE j5       rG\SzSF j5       rH\SzSG j5       rI\SzSH j5       rJ\SzSI j5       rK\SzSJ j5       rL\SzSK j5       rM\SzSL j5       rN\SzSM j5       rO\SzSN j5       rP\SzSO j5       rQ\SzSP j5       rR\SzSQ j5       rS\SSR j5       rT\SSS j5       rU\SST j5       rV\SSU j5       rW\SSV j5       rX\SzSW j5       rY\SSX j5       rZ\SSY j5       r[\SSZ j5       r\\SS[ j5       r]\SS\ j5       r^\SS] j5       r_\SS^ j5       r`\SS_ j5       ra\SS` j5       rb\SSa j5       rc\SSb j5       rd\SSc j5       re\SSd j5       rf\SSe j5       rg\SSf j5       rh\SzSg j5       ri\SzSh j5       rj\SSi j5       rk\S{Sj j5       rl\S{Sk j5       rm\S{Sl j5       rn\SzSm j5       ro\SzSn j5       rp\S{So j5       rq\S{Sp j5       rr\S{Sq j5       rs\SzSr j5       rt\S{Ss j5       ru\S{St j5       rv\SSu j5       rw\SSv j5       rx\SSw j5       ry\SSx j5       rzSyr{g")PallasKernelOverridesw   z
Map element-wise ops to JAX/Pallas operations.

For now, we use the default Python operators which are compatible
with JAX numpy broadcasting semantics.
c                    SU  S3$ )Nzjnp.sin(r"   r<   xs    r*   sinPallasKernelOverrides.sin       !Ar-   c                    SU  S3$ )Nzjnp.cos(r"   r<   rj   s    r*   cosPallasKernelOverrides.cos   rn   r-   c                    SU  S3$ )Nzjnp.tan(r"   r<   rj   s    r*   tanPallasKernelOverrides.tan   rn   r-   c                    SU  S3$ )Nz	jnp.sinh(r"   r<   rj   s    r*   sinhPallasKernelOverrides.sinh       1#Qr-   c                    SU  S3$ )Nz	jnp.cosh(r"   r<   rj   s    r*   coshPallasKernelOverrides.cosh   rx   r-   c                    SU  S3$ )Nz	jnp.tanh(r"   r<   rj   s    r*   tanhPallasKernelOverrides.tanh   rx   r-   c                    SU  S3$ )Nzjnp.arcsin(r"   r<   rj   s    r*   asinPallasKernelOverrides.asin       QCq!!r-   c                    SU  S3$ )Nzjnp.arccos(r"   r<   rj   s    r*   acosPallasKernelOverrides.acos   r   r-   c                    SU  S3$ )Nzjnp.arctan(r"   r<   rj   s    r*   atanPallasKernelOverrides.atan   r   r-   c                    SU  S3$ )Nzjnp.exp(r"   r<   rj   s    r*   expPallasKernelOverrides.exp   rn   r-   c                    SU  S3$ )Nz	jnp.exp2(r"   r<   rj   s    r*   exp2PallasKernelOverrides.exp2   rx   r-   c                    SU  S3$ )Nz
jnp.expm1(r"   r<   rj   s    r*   expm1PallasKernelOverrides.expm1       A3a  r-   c                    SU  S3$ )Nzjnp.log(r"   r<   rj   s    r*   logPallasKernelOverrides.log   rn   r-   c                    SU  S3$ )Nz
jnp.log10(r"   r<   rj   s    r*   log10PallasKernelOverrides.log10   r   r-   c                    SU  S3$ )Nz	jnp.log2(r"   r<   rj   s    r*   log2PallasKernelOverrides.log2   rx   r-   c                    SU  S3$ )Nz
jnp.log1p(r"   r<   rj   s    r*   log1pPallasKernelOverrides.log1p   r   r-   c                    SU  S3$ )Nz	jnp.sqrt(r"   r<   rj   s    r*   sqrtPallasKernelOverrides.sqrt   rx   r-   c                    SU  S3$ )Nz(1.0 / jnp.sqrt())r<   rj   s    r*   rsqrtPallasKernelOverrides.rsqrt   s    !!B''r-   c                    SU  S3$ )Nzjnp.abs(r"   r<   rj   s    r*   absPallasKernelOverrides.abs   rn   r-   c                    SU  S3$ )Nz(-r"   r<   rj   s    r*   negPallasKernelOverrides.neg   s    A3ayr-   c                    SU  S3$ )Nz
jnp.floor(r"   r<   rj   s    r*   floorPallasKernelOverrides.floor   r   r-   c                    SU  S3$ )Nz	jnp.ceil(r"   r<   rj   s    r*   ceilPallasKernelOverrides.ceil   rx   r-   c                    SU  S3$ )Nz
jnp.trunc(r"   r<   rj   s    r*   truncPallasKernelOverrides.trunc   r   r-   c                    SU  S3$ )Nz
jnp.round(r"   r<   rj   s    r*   roundPallasKernelOverrides.round   r   r-   c                    SU  S3$ )Nz(1.0 / (1.0 + jnp.exp(-)))r<   rj   s    r*   sigmoidPallasKernelOverrides.sigmoid   s    (3//r-   c                    SU  S3$ )Nr9   z, 0)r<   rj   s    r*   reluPallasKernelOverrides.relu   s    aS%%r-   c                    SU  SU S3$ )Nz
jnp.power(r!   r"   r<   abs     r*   powPallasKernelOverrides.pow       A3b1%%r-   c                    SU  SU S3$ )Nr9   r!   r"   r<   r   s     r*   maximumPallasKernelOverrides.maximum       aS1#Q''r-   c                    SU  SU S3$ )Nr/   r!   r"   r<   r   s     r*   minimumPallasKernelOverrides.minimum   r   r-   c                    SU  SU SU S3$ )Nr    r!   r"   r<   )condr   r   s      r*   wherePallasKernelOverrides.where   s    D6A3b1--r-   c                   U" 5       n[        U[        5      (       aP  [        R                  " U5      (       a  SnO=[        R                  " U5      (       a  US:  a  SOSnO[        U5      nO[        U5      nSU  SU SU S3$ )zr
Computes body, but only uses the result where mask is true.
Where mask is false, uses the 'other' value instead.
jnp.nanr   jnp.inf-jnp.infr    r!   r"   )
isinstancefloatmathisnanisinfrepr)maskbodyotherr3   	other_strs        r*   maskedPallasKernelOverrides.masked   su     eU##zz%  %	E"").I
	 K	UID6F82i[::r-   Nc                *    [        U5      nSU  SU S3$ )Nzjnp.asarray(	).astype(r"   r   )rk   dtype	src_dtypeuse_compute_types	jax_dtypes        r*   to_dtypePallasKernelOverrides.to_dtype  s#     'u-	aS	)A66r-   c                F    [        U5      n[        U5      nSU  SU SU S3$ )z=Bitcast a value from one dtype to another with the same size.z)jax.lax.bitcast_convert_type(jnp.asarray(r   ), r"   r   )rk   r   r   r   jax_src_dtypes        r*   to_dtype_bitcast&PallasKernelOverrides.to_dtype_bitcast  s7     'u-	*95:1#Y}oUXYbXccdeer-   c                ~   SSK Jn  [        R                  R	                  U 5      n[        R                  R                  U5      n[        R                  R                  U5      n[        R                  R                  R                  [        R                  R                  XR" U 5      S9n[        R                  Xa5      $ )z>Convert a sympy expression to a JAX array indexing expression.r   )get_bounds_index_expr)bounds)utilsr   r   kernelprepare_indexingrename_indexingkexprcsegeneratecomputerg   r   )r&   r   r   preparedrenamedidx_strvars          r*   
index_expr PallasKernelOverrides.index_expr  s     	2 88,,T2((**84((..)hhll##HHg.CD.I $ 
 %--c99r-   c                   [        U5      nU[        R                  :X  a  U (       a  S$ S$ [        U [        5      (       aA  [
        R                  " U 5      (       a  g[
        R                  " U 5      (       a
  U S:  a  S$ S$ SU  SU S	3$ )
z/Convert a constant value to JAX representation.TrueFalser   r   r   r   z
jnp.array(z, dtype=r"   )r   torchboolr   r   r   r   r   )valr   r   s      r*   constantPallasKernelOverrides.constant,  sw     'u-	EJJ 6-g-c5!!zz# zz#$'!Gy;;C5155r-   c                    SU  S3$ )Nz	jnp.real(r"   r<   rj   s    r*   realPallasKernelOverrides.real:  rx   r-   c                    SU  S3$ )Nz	jnp.imag(r"   r<   rj   s    r*   imagPallasKernelOverrides.imag>  rx   r-   c                    SU  S3$ )Nz	jnp.conj(r"   r<   rj   s    r*   conjPallasKernelOverrides.conjB  rx   r-   c                    SU  S3$ )Nz
jnp.angle(r"   r<   rj   s    r*   anglePallasKernelOverrides.angleF  r   r-   c                    SU  SU  S3$ )z8View complex tensor as real tensor with extra dimension.zjnp.stack([jnp.real(z), jnp.imag(z)], axis=-1)r<   rj   s    r*   view_as_real"PallasKernelOverrides.view_as_realJ  s     &aSQC|DDr-   c                    SU  SU  S3$ )z#View real tensor as complex tensor.(z[..., 0] + 1j * z	[..., 1])r<   rj   s    r*   view_as_complex%PallasKernelOverrides.view_as_complexO  s     1#%aS	22r-   c                    SU  SU S3$ )Nr  z == r"   r<   r   s     r*   eqPallasKernelOverrides.eqU      1#T!Ar-   c                    SU  SU S3$ )Nr  z != r"   r<   r   s     r*   nePallasKernelOverrides.neY  r  r-   c                    SU  SU S3$ )Nr  z < r"   r<   r   s     r*   ltPallasKernelOverrides.lt]      1#S1~r-   c                    SU  SU S3$ )Nr  z <= r"   r<   r   s     r*   lePallasKernelOverrides.lea  r  r-   c                    SU  SU S3$ )Nr  z > r"   r<   r   s     r*   gtPallasKernelOverrides.gte  r$  r-   c                    SU  S3$ )Nz
jnp.isnan(r"   r<   rj   s    r*   r   PallasKernelOverrides.isnani  r   r-   c                    SU  S3$ )Nz
jnp.isinf(r"   r<   rj   s    r*   r   PallasKernelOverrides.isinfm  r   r-   c                    SU  S3$ )Nzjnp.isfinite(r"   r<   rj   s    r*   isfinitePallasKernelOverrides.isfiniteq  s    qc##r-   c                    SU  SU S3$ )Nr  z >= r"   r<   r   s     r*   gePallasKernelOverrides.geu  r  r-   c                    SU  SU S3$ )Nzjnp.logical_and(r!   r"   r<   r   s     r*   logical_and!PallasKernelOverrides.logical_andz      !!Bqc++r-   c                    SU  SU S3$ )Nzjnp.logical_or(r!   r"   r<   r   s     r*   
logical_or PallasKernelOverrides.logical_or~       2aS**r-   c                    SU  S3$ )Nzjnp.logical_not(r"   r<   rj   s    r*   logical_not!PallasKernelOverrides.logical_not      !!A&&r-   c                    SU  SU S3$ )Nzjnp.logical_xor(r!   r"   r<   r   s     r*   logical_xor!PallasKernelOverrides.logical_xor  r8  r-   c                    SU  SU S3$ )Nzjnp.arctan2(r!   r"   r<   r   s     r*   atan2PallasKernelOverrides.atan2  r   r-   c                    SU  SU S3$ )Nz
jnp.hypot(r!   r"   r<   r   s     r*   hypotPallasKernelOverrides.hypot  r   r-   c                    SU  SU S3$ )Nz	jnp.fmod(r!   r"   r<   r   s     r*   fmodPallasKernelOverrides.fmod  s    1#Rs!$$r-   c                    SU  SU S3$ )Nzjnp.remainder(r!   r"   r<   r   s     r*   	remainderPallasKernelOverrides.remainder      s"QCq))r-   c                &    SU  SU SU  SU SU  S3$ )Nz
(jnp.sign(z) * jnp.sign(z) * (jnp.abs(z) // jnp.abs(z))).astype(.dtype)r<   r   s     r*   truncdivPallasKernelOverrides.truncdiv  s.     A3mA3mA3mA3kZ[Y\\cddr-   c                    SU  SU S3$ )Nr  z // r"   r<   r   s     r*   floordivPallasKernelOverrides.floordiv  r  r-   c                    SU  SU SU S3$ Nz	jnp.clip(r!   r"   r<   rk   min_valmax_vals      r*   clampPallasKernelOverrides.clamp      1#Ry7)155r-   c                    SU  SU SU S3$ rY  r<   rZ  s      r*   clipPallasKernelOverrides.clip  r_  r-   c                    SU  S3$ )Nz	jnp.sign(r"   r<   rj   s    r*   signPallasKernelOverrides.sign  rx   r-   c                    SU  S3$ )Nzjnp.signbit(r"   r<   rj   s    r*   signbitPallasKernelOverrides.signbit  s    aS""r-   c                    SU  S3$ )Nzjax.scipy.special.erf(r"   r<   rj   s    r*   erfPallasKernelOverrides.erf  s    's!,,r-   c                    SU  S3$ )Nzjax.scipy.special.erfc(r"   r<   rj   s    r*   erfcPallasKernelOverrides.erfc  s    (1--r-   c                    SU  S3$ )Nzjax.scipy.special.erfinv(r"   r<   rj   s    r*   erfinvPallasKernelOverrides.erfinv  s    *1#Q//r-   c                    SU  S3$ )Nzjax.scipy.special.gammaln(r"   r<   rj   s    r*   lgammaPallasKernelOverrides.lgamma      +A3a00r-   c                    SU  S3$ )Nzjax.scipy.special.digamma(r"   r<   rj   s    r*   digammaPallasKernelOverrides.digamma  ru  r-   c                    SU  SU  SU  S3$ )Nr    z>.astype(jnp.float64) == 0.0, 1.0, jax.scipy.special.bessel_jn(z&.astype(jnp.float64), v=0)[0]).astype(rR  r<   rj   s    r*   	bessel_j0PallasKernelOverrides.bessel_j0  +      ++,# .c"	
r-   c                    SU  SU  SU  S3$ )Nr    z>.astype(jnp.float64) == 0.0, 0.0, jax.scipy.special.bessel_jn(z&.astype(jnp.float64), v=1)[1]).astype(rR  r<   rj   s    r*   	bessel_j1PallasKernelOverrides.bessel_j1  r|  r-   c                    SU  SU  S3$ Njax.lax.bessel_i0e() * jnp.exp(jnp.abs(r   r<   rj   s    r*   modified_bessel_i0(PallasKernelOverrides.modified_bessel_i0       %QC';A3bAAr-   c                    SU  SU  S3$ Njax.lax.bessel_i1e(r  r   r<   rj   s    r*   modified_bessel_i1(PallasKernelOverrides.modified_bessel_i1  r  r-   c                    SU  SU  SU  S3$ )Nr    z == 0.0, 1.0, jnp.sin(z) / r"   r<   rj   s    r*   spherical_bessel_j0)PallasKernelOverrides.spherical_bessel_j0  s      A34QCtA3a@@r-   c                    SU  SU  S3$ r  r<   rj   s    r*   i0PallasKernelOverrides.i0       %QC';A3bAAr-   c                    SU  S3$ )Nr  r"   r<   rj   s    r*   i0ePallasKernelOverrides.i0e       %QCq))r-   c                    SU  SU  S3$ r  r<   rj   s    r*   i1PallasKernelOverrides.i1  r  r-   c                    SU  S3$ )Nr  r"   r<   rj   s    r*   i1ePallasKernelOverrides.i1e  r  r-   c                    SU  SU S3$ Nzjax.scipy.special.gammainc(r!   r"   r<   rk   ys     r*   gammaincPallasKernelOverrides.gammainc	  s     -QCr!A66r-   c                    SU  SU S3$ Nzjax.scipy.special.gammaincc(r!   r"   r<   r  s     r*   	gammainccPallasKernelOverrides.gammaincc       .aS1#Q77r-   c                    SU  SU S3$ r  r<   r  s     r*   igammaPallasKernelOverrides.igamma  s     -QCr!A66r-   c                    SU  SU S3$ r  r<   r  s     r*   igammacPallasKernelOverrides.igammac  r  r-   c                    SU  SU S3$ )Nzjax.scipy.special.polygamma(z.astype(jnp.int32), r"   r<   r  s     r*   	polygammaPallasKernelOverrides.polygamma  s     .aS0DQCqIIr-   c                    SU  S3$ )Nzjax.scipy.special.ndtri(r"   r<   rj   s    r*   ndtriPallasKernelOverrides.ndtri$  s     *!A..r-   c                    SU  SU S3$ )Nzjax.scipy.special.zeta(r!   r"   r<   r  s     r*   zetaPallasKernelOverrides.zeta)  s     )2aS22r-   c                    SU  SU S3$ )Nzjax.scipy.special.xlogy(r!   r"   r<   r  s     r*   xlogyPallasKernelOverrides.xlogy.  s     *!Bqc33r-   c                    SU  SU S3$ )Nzjax.scipy.special.xlog1py(r!   r"   r<   r  s     r*   xlog1pyPallasKernelOverrides.xlog1py3  s     ,A3b155r-   c                >    SU  SU SU  SU  SU SU  SU SU S	U  S
3$ )Njnp.where(jnp.abs() <= 1, jnp.cos( * jnp.arccos(jnp.clip(, -1, 1))), jnp.where( > 1, jnp.cosh( * jnp.arccosh(jnp.maximum(, 1.0))), ((-1.0) ** ) * jnp.cosh( * jnp.arccosh(jnp.maximum(-
, 1.0)))))r<   rk   ns     r*   chebyshev_polynomial_t,PallasKernelOverrides.chebyshev_polynomial_t8  s[     ! $c0 4 s5aS 9M!,H:	W	
r-   c                    SR                  / SPU  PSPU PSPU  PSPU  PSPU  PSPU  PSPU PS	PU PS
PU  PSPU  PSPU  PSPU PSPU PSPU PSPU PSPU  PSPU  PSP5      $ )N r  ) < 1, jnp.sin(( + 1) * jnp.arccos(jnp.clip(z&, -1, 1))) / jnp.sqrt(jnp.maximum(1 - z**2, 1e-10)), jnp.where( >= 1, jnp.where( == 1,  + 1.0, jnp.sinh((  + 1) * jnp.arccosh(jnp.maximum( , 1.0))) / jnp.sqrt(jnp.maximum(**2 - 1, 1e-10))), jnp.where( == -1, ((-1.0) ** ) * ( + 1.0), ((-1.0) ** ) * jnp.sinh((! + 1) * jnp.arccosh(jnp.maximum(-**2 - 1, 1e-10)))))joinr  s     r*   chebyshev_polynomial_u,PallasKernelOverrides.chebyshev_polynomial_uF  s   	; 	;  	; 	; $ 	;s	;6	;78c	;:(	;()s	;+	; 	;	; 		; #		; $%#		;&		;
 	;
 <	;
 =>3	;
?$	; %&3	;'	; 	; /	; 01c	; 27	; 89c	;:	; 	; +	; ,-#	; .O	; PQc	;R$	; %&3	; ':	;	
r-   c                   SR                  / SPU PSPU  PSPU PSPU  PSPU PSPU  PSPU  PSPU PS	PU  PS
PU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSPU  PSPU  PSP5      $ )Nr  r     == 0, jnp.ones_like(), jnp.where(	 == 1, 2* - 1, jnp.where(	 == 2, 4***2 - 2*	 == 3, 8***3 - 4***2 - 4* + 1, jnp.where(
 == 4, 16***4 - 8*	**3 - 12***2 + 4*
 == 5, 32*	**5 - 16*	**4 - 32*	**3 + 12***2 + 6* - 1, jnp.zeros_like()))))))r  r  s     r*   chebyshev_polynomial_v,PallasKernelOverrides.chebyshev_polynomial_v[  
   ) )j ) )0 ) ) 4 ))$)%&C)())$)%&C)'/)01s)3) ) %) &'C) (0) 12s) 3;) <=#)>) 	) &	) '(S	) )1	) 23	) 4=	) >?C	) @H	) IJs	)K	)
 )
 &)
 '(S)
 )2)
 34)
 5>)
 ?@S)
 AJ)
 KL)
 MU)
 VWTW)
X)  S) !()	
r-   c                   SR                  / SPU PSPU  PSPU PSPU  PSPU PSPU  PSPU  PS	PU PS
PU  PSPU  PSPU  PS	PU PSPU  PSPU  PSPU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSPU  PSPU  PSP5      $ )Nr  r    r  r  r  r  r  **2 + 2*r  r  **3 + 4*r  r  **4 + 8*r  r  	**5 + 16*r  r   + 1, jnp.zeros_like(r  r  r  s     r*   chebyshev_polynomial_w,PallasKernelOverrides.chebyshev_polynomial_wk  r  r-   c                J    SU  S3nSU SU SU SU SU SU S	U S
U SU S3$ )N(2 *  - 1)r  r  r  r  r  r  r  r  r  r  r<   rk   r  r  s      r*   shifted_chebyshev_polynomial_t4PallasKernelOverrides.shifted_chebyshev_polynomial_t{  si    
 A3e  $c0 4 s5aS 9M!,H:	W	
r-   c                    SU  S3nSR                  / SPU PSPU PSPU PSPU PSPU PS	PU PS
PU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSP5      $ )Nr  r  r  r  r  r  z', -1, 1))) / jnp.sqrt(jnp.maximum(1 - (z)**2, 1e-10)), jnp.where(r  r  r  r  r  r  r  r  r  r  r  r  r  r  s      r*   shifted_chebyshev_polynomial_u4PallasKernelOverrides.shifted_chebyshev_polynomial_u  s    A3e	; 	;  	; 	; $ 	;s	;6	;78c	;:)	;)*	;,	; 	;	; 		; #		; $%#		;&		;
 	;
 <	;
 =>3	;
?$	; %&3	;'	; 	; /	; 01c	; 27	; 89c	;:	; 	; +	; ,-#	; .O	; PQc	;R$	; %&3	; ':	;	
r-   c                   SU  S3nSR                  / SPU PSPU  PSPU PSPU PSPU PS	PU PS
PU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU  PSP5      $ )Nr  r  r  r    r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  s      r*   shifted_chebyshev_polynomial_v4PallasKernelOverrides.shifted_chebyshev_polynomial_v      A3e) )j ) )0 ) ) 4 ))$)%&C)())$)%&C)'/)01s)3) ) %) &'C) (0) 12s) 3;) <=#)>) 	) &	) '(S	) )1	) 23	) 4=	) >?C	) @H	) IJs	)K	)
 )
 &)
 '(S)
 )2)
 34)
 5>)
 ?@S)
 AJ)
 KL)
 MU)
 VWTW)
X)  S) !()	
r-   c                   SU  S3nSR                  / SPU PSPU  PSPU PSPU PSPU PS	PU PS
PU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU PSPU  PSP5      $ )Nr  r  r  r    r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  s      r*   shifted_chebyshev_polynomial_w4PallasKernelOverrides.shifted_chebyshev_polynomial_w  r  r-   c                    SR                  / SPU PSPU  PSPU PSPU  PSPU PSPU  PSPU PS	PU  PS
PU  PSPU PSPU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSP5      $ )Nr  r    r  r  z == 1, 2 * , jnp.where(z == 2, 4 * z**2 - 2, jnp.where(z == 3, 8 * z**3 - 12 * z == 4, 16 * z**4 - 48 * z**2 + 12, jnp.where(z == 5, 32 * z**5 - 160 * z**3 + 120 * , jnp.zeros_like(r  r  r  s     r*   hermite_polynomial_h*PallasKernelOverrides.hermite_polynomial_h  s~   ) )j ) )0 ) ) 4 ))&)'(c)*))&)'(c)*) ) ') ()c) *5) 67C)8) 	) (	) )*s	) +6	) 78S	)9	)
 )
 ()
 )*s)
 +7)
 89c)
 :F)
 GHS)
I)  S) !()	
r-   c                    SR                  / SPU PSPU  PSPU PSPU  PSPU PSPU  PSPU PS	PU  PS
PU  PSPU PSPU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSP5      $ )Nr  r    r  r  r  r  z == 2, z**2 - 1, jnp.where(z == 3, 
**3 - 3 * z == 4, z
**4 - 6 * z**2 + 3, jnp.where(z == 5, z**5 - 10 * **3 + 15 * r  r  r  r  s     r*   hermite_polynomial_he+PallasKernelOverrides.hermite_polynomial_he  s}   
) )j ) )0 ) ) 4 ))")#$#)&))")#$#)&) ) #) $%#) &0) 12s)3) 	) #	) $%#	) &0	) 12s	)3	)
 )
 #)
 $%#)
 &1)
 23)
 4?)
 @Ac)
B)  S) !()	
r-   c                   SR                  / SPU PSPU  PSPU PSPU  PSPU PSPU  PSPU  PS	PU PS
PU  PSPU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSPU  PSPU  PSP5      $ )Nr  r    r  r  z == 1, 1 - r  z == 2, (r  z + 2) / 2, jnp.where(z	 == 3, (-z**3 + 9*z	**2 - 18*z + 6) / 6, jnp.where(z == 4, (z	**4 - 16*z	**3 + 72*z	**2 - 96*z + 24) / 24, jnp.where(z	 == 5, (-z	**5 + 25*z
**4 - 200*z
**3 + 600*z
**2 - 600*z + 120) / 120, jnp.zeros_like(r  r  r  s     r*   laguerre_polynomial_l+PallasKernelOverrides.laguerre_polynomial_l  s	   
) )j ) )0 ) ) 4 ))&)'(c)*))#)$%3)&.)/0c)2) ) %) &'C) (0) 12s) 3<) =>3)?) 	) $	) %&3	) '0	) 12s	) 3<	) =>3	) ?H	) IJs	)K	)
 )
 %)
 &'C)
 (1)
 23)
 4>)
 ?@S)
 AK)
 LM#)
 NX)
 YZWZ)
[)  S) !()	
r-   c                    SR                  / SPU PSPU  PSPU PSPU  PSPU PSPU  PSPU PS	PU  PS
PU  PSPU PSPU  PSPU  PSPU PSPU  PSPU  PSPU  PSPU  PSP5      $ )Nr  r    r  r  r  r  z == 2, (3 * z**2 - 1) / 2, jnp.where(z == 3, (5 * r  z) / 2, jnp.where(z == 4, (35 * z**4 - 30 * z**2 + 3) / 8, jnp.where(z == 5, (63 * z**5 - 70 * r  z) / 8, jnp.zeros_like(r  r  r  s     r*   legendre_polynomial_p+PallasKernelOverrides.legendre_polynomial_p  s~   
) )j ) )0 ) ) 4 ))")#$#)&))')()s)+) ) () )*s) +5) 67C)8) 	) )	) *+	) ,7	) 89c	):	)
 )
 ))
 *+)
 ,7)
 89c)
 :E)
 FGC)
H)  S) !()	
r-   c                    SU  S3$ )Nzjnp.reciprocal(r"   r<   rj   s    r*   
reciprocal PallasKernelOverrides.reciprocal  s     1%%r-   c                    SU  S3$ )Nzjnp.square(r"   r<   rj   s    r*   squarePallasKernelOverrides.square  r   r-   c                    SU  SU SU S3$ )zFused multiply-add: a * b + c

JAX doesn't have jnp.fma, so we use the unfused version.
The compiler may still fuse this on supported hardware.
z((r  z) + (r   r<   )r   r   r'   s      r*   fmaPallasKernelOverrides.fma  s     A3eA3eA3b))r-   c                    SU  SU S3$ )Nzjnp.copysign(r!   r"   r<   r   s     r*   copysignPallasKernelOverrides.copysign  s    qcA3a((r-   c                    SU  SU S3$ )Nzjnp.nextafter(r!   r"   r<   r   s     r*   	nextafterPallasKernelOverrides.nextafter
  rP  r-   c                    SU  SU S3$ )Nz
jnp.ldexp(r!   r"   r<   r   s     r*   ldexpPallasKernelOverrides.ldexp  r   r-   c                    SU  S3$ )Nz
jnp.frexp(r"   r<   rj   s    r*   frexpPallasKernelOverrides.frexp  r   r-   c                    SU  S3$ )Nz	jnp.modf(r"   r<   rj   s    r*   modfPallasKernelOverrides.modf  rx   r-   c                    SU  SU S3$ )Nzjnp.bitwise_and(r!   r"   r<   r   s     r*   bitwise_and!PallasKernelOverrides.bitwise_and  r8  r-   c                    SU  SU S3$ )Nzjnp.bitwise_or(r!   r"   r<   r   s     r*   
bitwise_or PallasKernelOverrides.bitwise_or  r<  r-   c                    SU  SU S3$ )Nzjnp.bitwise_xor(r!   r"   r<   r   s     r*   bitwise_xor!PallasKernelOverrides.bitwise_xor#  r8  r-   c                    SU  S3$ )Nzjnp.bitwise_not(r"   r<   rj   s    r*   bitwise_not!PallasKernelOverrides.bitwise_not'  r@  r-   c                    SU  SU S3$ )Nzjnp.left_shift(r!   r"   r<   r   s     r*   
left_shift PallasKernelOverrides.left_shift+  r<  r-   c                    SU  SU S3$ )Nzjnp.right_shift(r!   r"   r<   r   s     r*   right_shift!PallasKernelOverrides.right_shift/  r8  r-   c                    [         R                  R                  R                  SU5      nS[         R                  R                  R	                  U 5       SU S3$ )z)Load the random seed value from a buffer.load_seed_offsetr  z[0] + r"   )r   r   r$   seed_offsetinput)nameoffsetrM  s      r*   	load_seedPallasKernelOverrides.load_seed4  sH     hhmm//0BFK188==&&t,-VK=BBr-   c                    SU  SU SU S3$ )zGenerate uniform random numbers in [0, 1).

Uses JAX's threefry2x32 PRNG directly for vectorized random generation.
The seed provides the base key, offset provides per-element uniqueness.
zWjax.vmap(lambda o: jax.random.uniform(jax.random.fold_in(jax.random.PRNGKey(jnp.uint32(8)), jnp.uint32(o)), (), dtype=jnp.float32))(jnp.asarray(!).flatten()).reshape(jnp.asarray().shape)r<   seedrP  s     r*   randPallasKernelOverrides.rand;  s,    @@Dv F"8#DVHHV	
r-   c                    SU  SU SU S3$ )zGenerate standard normal random numbers.

Uses JAX's threefry2x32 PRNG directly for vectorized random generation.
The seed provides the base key, offset provides per-element uniqueness.
zVjax.vmap(lambda o: jax.random.normal(jax.random.fold_in(jax.random.PRNGKey(jnp.uint32(rT  rU  rV  r<   rW  s     r*   randnPallasKernelOverrides.randnK  s,    @@Dv F"8#DVHHV	
r-   c                &    SU  SU SU SU SU S3$ )z,Generate random int64 values in [low, high).zWjax.vmap(lambda o: jax.random.randint(jax.random.fold_in(jax.random.PRNGKey(jnp.uint32(z)), jnp.uint32(o)), (), r!   z , dtype=jnp.int64))(jnp.asarray(rU  rV  r<   )rX  rP  lowhighs       r*   	randint64PallasKernelOverrides.randint64Z  s>    
@@DvE]^a]bbdeidj k"8#DVHHV	
r-   r<   )rk   r?   r>   r?   )r   r?   r   r?   r>   r?   )r   r?   r   r?   r   r?   r>   r?   )r   r?   r   zCallable[[], str]r   r   r>   r?   )NT)
rk   r?   r   torch.dtyper   zOptional[torch.dtype]r   r  r>   r?   )rk   r?   r   rc  r   rc  r>   r?   )r&   r=   r   rc  r>   r?   )r   rc  r>   r?   )rk   r?   r[  r?   r\  r?   r>   r?   )rk   r?   r  r?   r>   r?   )rk   r?   r  r?   r>   r?   )r   r?   r   r?   r'   r?   r>   r?   )rO  r?   rP  r?   r>   r?   )rX  r?   rP  r?   r>   r?   )
rX  r?   rP  r?   r_  r?   r`  r?   r>   r?   )|r@   rA   rB   rC   rD   staticmethodrl   rp   rs   rv   rz   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r  r"  r&  r)  r   r   r0  r3  r6  r:  r>  rB  rE  rH  rK  rN  rS  rV  r]  ra  rd  rg  rj  rm  rp  rs  rw  rz  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r"  r%  r(  r+  r.  r1  r4  r7  r:  r=  r@  rC  rF  rI  rQ  rY  r\  ra  rE   r<   r-   r*   rg   rg   w   su	                      " " " " " "       ! !   ! !     ! !     ( (     ! !     ! ! ! ! 0 0 & & & & ( ( ( ( . . ; ;&  ,0"&	777 )7  	7
 
7 7 f f : : 6 6             ! ! E E 3 3
           ! ! ! ! $ $   , , + + ' ' , , ( ( & & % % * * e e
   6 6 6 6     # # - - . . 0 0 1 1 1 1 	
 	
 	
 	
 B B
 B B
 A A
 B B * * B B * * 7 7
 8 8 7 7 8 8 J J
 / / 3 3 4 4 6 6 
 
 
 
( 
 
 
 
 
 
 
 
" 
 
 
 
 
 
" 
 
 
 
 
 
 & & " " * * ) ) * * & & ! !     , , + + , , ' ' + + , , C C 
 
 
 
 
 
r-   rg   c                    ^  \ rS rSr% Sr\r\rS\	S'   U 4S jr
          S5S jrS6S jrS6S jrS6S	 jrS6S
 jrS7S jrS8S jrS9S jrS:S jrS9S jrS;S jrS<S jrS=S jrS>S jrS=S jrS?S jrS@S jr\SAS j5       rSBS jrSCS jrSDS jr     SES jr!      SFS jr"          SGS jr#          SHS jr$          SHS jr%              SIS  jr&SJS! jr'        SKS" jr(        SLS# jr)        SMS$ jr*SNS% jr+        SOS& jr, SP                 SQS' jjr-            SRS( jr.\/R`                  SSS) j5       r1S6S* jr2\/R`                   SP         STS+ jj5       r3SUS, jr4 SV     SWS- jjr5        SXS. jr6        SYS/ jr7          SZS0 jr8\S[S1 j5       r9SPS\S2 jjr:SPS]S3 jjr;S4r<U =r=$ )^PallasKernelie  a<  
Pallas kernel for elementwise operations with support for strided/scatter access.

Strategy:
- Convert index expressions to JAX-compatible array slicing
- Load/store using indexed access: "in_ptrX[slice]" or full-array "in_ptrX[...]"
- Compute expression with Python operators (compatible with jax.numpy broadcasting)
- Generate Python code that defines a Pallas kernel and a host entrypoint.
- Use async_compile.pallas path to compile and load Python code.

For GPU (Mosaic backend):
- Use masked loads/stores with power-of-2 block sizes to handle non-power-of-2 shapes
zCallable[[sympy.Expr], str]r   c                  > [         TU ]  " U0 UD6  [        R                  R	                  5       nUR
                  S:H  U l        S U l        U R                  U l        0 U l	        / U l
        0 U l        [        5       U l        SU l        g )NcudaF)superrZ   r   graphget_current_device_or_throwtypeis_gpuuse_masked_opsuse_warpgroup_paddingtensor_masksstore_with_outputload_index_exprsr   outputs_need_readhas_transposed_load)r%   r$   r^   device	__class__s       r*   rZ   PallasKernel.__init__w  sy    $)&)446kkV++/ &*[["8:792<, $) r-   c                    g)z)Check array bounds for indirect indexing.Nr<   )r%   r&   rO   loweruppers        r*   check_boundsPallasKernel.check_bounds  s    r-   c                    U R                  U5      n[        U[        R                  5      (       a  gUR                  (       a  [        U5      $ U R                  U5      $ )a  
Convert an index expression to a string suitable for Pallas indexing.

Pallas operates on full arrays, so we need to convert index expressions
to JAX array slicing. For example:
- x0 -> "..." (contiguous access, full array)
- 2*x0 -> "::2" (strided access with stride 2)
- 2*x0 + 1 -> "1::2" (strided access with offset 1, stride 2)

Args:
    index: The indexing expression to convert

Returns:
    The indexing string to use in generated code
...)r   r   sympySymbol
is_Integerr?   _convert_to_jax_slice)r%   indexprepared_indexs      r*   _get_index_strPallasKernel._get_index_str  sQ    " ..u5 nell33&&~&& --n==r-   c                   U R                   (       d  gU R                  U5      nUR                  [        5      (       a  U R	                  U5      $ [
        R                  R                  R                  U5      nU R                  U5      n[        U5      S:X  a  [        U5      $ [        U5      S:X  Ga  [        [        U5      5      n[        R                  " X5      n[        R                   " XC5      nUb  X-
  n[
        R                  R                  R                  U5      nUS:  a  U R	                  U5      $ US:X  a  gUS:w  a  U R	                  U5      $  [#        U5      nUS:  a  U R	                  U5      $  U R	                  U5       S3$ X-
  n[
        R                  R                  R                  U5      nUS:X  a  XC:X  a  gg[        U5      S:  aJ  SnU H9  n[        R                  " X5      n[        R                   " XC5      nUS:w  d  M7  Sn  O   U(       a  ggg! [$        [&        4 a    U R	                  U5      s $ f = f)a  
Convert a sympy index expression to JAX slice notation.

Handles common patterns like:
- stride*var -> ::stride
- stride*var + offset -> offset::stride

For more complex patterns, falls back to explicit indexing.
Uses BlockPatternMatcher for robust pattern matching.
r~  r   r   z::1TF)range_treesr   hasr
   r   r   rj  sizevarssimplify_get_used_iter_varslenr?   nextiterr   get_subexpr_involving_symbolmatch_affine_block_exprint	TypeError
ValueError)	r%   r  	used_varsr   var_exprstriderP  
offset_valall_unit_strides	            r*   r  "PallasKernel._convert_to_jax_slice  s#     $$U+ 99_%% ::e$$   ))%0,,U3	y>Qu:^q tI'C +GGSH )@@OF!)))226:A:::e,,Q;  Q;::e,,-!$VJ!A~#zz%00 &
 **V,-S11 )))226:Q;8? , + ^a #O .KKEW,DDXSQ;&+O !   A ":. -::e,,-s   !H$ $!IIc                    UR                   nU R                  5       nX#-  nXB:w  a  [        SU 35      eU R                  U5      nU$ )a~  
Generate JAX code to compute an index array for strided/complex indexing patterns.

For expressions like `2 * x3 + 32 * x2 + 256 * x1 + 1024 * x0`, we generate
code that computes the flattened index array using broadcasting.

The iteration variables (x0, x1, x2, x3) are already defined as jnp.arange arrays
in the kernel. We just need to convert the sympy expression to JAX code.
z9Pallas backend does not yet support mixed index pattern: )free_symbols_get_iter_varsrd   r   )r%   r  r  	iter_varsr  	index_strs         r*   _generate_strided_index$PallasKernel._generate_strided_index  s\     ))'')	 !,	$KE7S  JJu%	 r-   c                $    U R                  U5      $ )zr
Generate JAX code to compute an index array for complex indexing patterns.
Delegates to _generate_strided_index.
)r  r%   r  s     r*   _generate_index_array"PallasKernel._generate_index_array/  s    
 ++E22r-   c                H    [        U R                  R                  5       5      $ )z*Get the set of iteration variable symbols.)r   range_tree_nodeskeysr%   s    r*   r  PallasKernel._get_iter_vars6  s    $//44677r-   c                <    UR                   U R                  5       -  $ )z4Get iteration variables used in an index expression.)r  r  r  s     r*   r   PallasKernel._get_used_iter_vars:  s    !!D$7$7$999r-   c                6    [        U R                  U5      5      $ )z7Check if index expression contains iteration variables.)r  r  r  s     r*   _has_iteration_vars PallasKernel._has_iteration_vars>  s    D,,U344r-   c                    UR                    Vs/ s H&  n[        U5      R                  S5      (       d  M$  UPM(     sn$ s  snf )zDGet list of indirect variable symbols (tmp*) in an index expression.tmp)r  r?   
startswith)r%   r  ss      r*   _get_indirect_varsPallasKernel._get_indirect_varsB  s3     --J-aQ1B1B51I-JJJs   #??c                <    [        U R                  U5      5      S:  $ )z6Check if index expression contains indirect variables.r   )r  r  r  s     r*   _has_indirect_varsPallasKernel._has_indirect_varsF  s    4**512Q66r-   c                   [        U R                  R                  5       5      n/ nU H6  u  p4U R                  UR                  5      nUc  M%  UR                  U5        M8     [        U5      S::  a  U$ [        [        U5      5      $ )a  Get the expected output shape from iteration variables.

Iteration variables are shaped for broadcasting. For 2D outputs:
- First var (e.g., y0) gets shape (1, N) - innermost dimension
- Second var (e.g., x1) gets shape (M, 1) - outermost dimension
The broadcast result is (M, N).
r   )listr  items	_safe_intlengthappendr  reversed)r%   	var_itemsbroadcast_varsvar_symentryr  s         r*   _get_expected_output_shape'PallasKernel._get_expected_output_shapeJ  s{     ..4467	'NG^^ELL1F!%%f- (
 ~!#!!
 H^,--r-   c                  ^ [         R                  R                  U5      nUc  gUR                  5       n[	        U5      S:w  a  g[        USS 5      " 5       nUc  g[        USS5      nUb  [	        U5      S:w  a  gU R                  US   5      nU R                  US   5      nUb  Ub  US::  d  US::  a  gU R                  US   5      n	U R                  US   5      n
U	b  U
c  g[        U R                  R                  5       5      n[	        U5      S:  a  g[        S	 U 5       5      (       a  gUS   S   nUS   S   n[         R                  R                  R                  U5      nU4S
 jmT" X,5      nT" X-5      nUbt  Ubq  X:H  =(       a    U
S:H  nU(       d  gU R                  5       nU(       d  g[        X-
  5      [        X-
  5      :  n[        X-
  5      [        X-
  5      :  nU=(       a    U$ g)a  Check if buffer access needs transpose.

Transpose on load is needed when:
1. Non-square buffers: dimensions are swapped relative to iteration vars
2. Square buffers: index coefficient pattern indicates transposed access
   (first iteration var has larger coefficient than second)
NFr   
get_layoutc                     g ra   r<   r<   r-   r*   <lambda>4PallasKernel._is_transposed_access.<locals>.<lambda>t      r-   r  r   r   c              3  >   #    U  H  u  pUR                   v   M     g 7fra   is_reduction).0_r  s      r*   	<genexpr>5PallasKernel._is_transposed_access.<locals>.<genexpr>  s     <)hau!!)s   c                .  > X:X  a  gU R                   (       a"  U R                   H  nT" X!5      nUc  M  Us  $    U R                  (       aJ  SnSnU R                   H-  nXQ:X  a  SnM  UR                  (       d  M  U[	        U5      -  nM/     U(       a  U$ g)z+Extract coefficient of var from expression.r   NFT)is_Addr$   is_Mul	is_numberr  )r&   r   termcoeffhas_varfactorget_coefficients         r*   r  ;PallasKernel._is_transposed_access.<locals>.get_coefficient  s    {{{ IID+D6E($ & {{"iiF}"&)))V,	 (
  Lr-   )r   rj  
get_bufferget_sizer  getattrr  r  r  r  anyr  r  _has_column_major_outputr   )r%   rO  r  buf_objbuf_sizelayout
buf_stridesize0size1s0s1r  	inner_var	outer_varinner_coeffouter_coeffis_standard_row_majoroutput_is_column_majorinner_matches_s0outer_matches_s1r  s                       @r*   _is_transposed_access"PallasKernel._is_transposed_accessb  s    ''$$T*?##% x=A,=?>VXt4
ZA!5x{+x{+=EMUaZ5A: ^^JqM*^^JqM*: ..4467	y>A <)<<< aLO	aLO	  ))%0	* &e7%e7"{'>$&K$;B!G!( &*%B%B%D")  #;#34s;;K7LL";#34s;;K7LL#8(88r-   c                   [        U R                  S0 5      nU H  n[        R                  R	                  U5      nUc  M'  [        USS 5      " 5       nUc  M?  [        USS5      nUb  [        U5      S:  a  M`  U R                  US   5      nU R                  US   5      nUc  M  Uc  M  Xg:  d  M    g	   [        R                  R                   H  n[        R                  R	                  U5      nUb  [        U[        5      (       d  M<  [        USS
 5      " 5       nUc  MT  [        USS5      nUb  [        U5      S:  a  Mu  U R                  US   5      nU R                  US   5      nUc  M  Uc  M  Xg:  d  M    g	   g)z:Check if any output buffer has column-major stride layout.output_buffersNr  c                     g ra   r<   r<   r-   r*   r  7PallasKernel._has_column_major_output.<locals>.<lambda>      Dr-   r  r   r   r   Tc                     g ra   r<   r<   r-   r*   r  r    r  r-   F)
r  r$   r   rj  r  r  r  name_to_bufferr   r   )r%   r  buf_nameout_bufr  
out_strideout_s0out_s1s           r*   r  %PallasKernel._has_column_major_output  s`    ,<bA&Hgg((2GWlLACF~ 48J!S_q%8^^JqM2F^^JqM2F!f&8V_ '  ..Hgg((2Gj.&I&IWlLACF~ 48J!S_q%8^^JqM2F^^JqM2F!f&8V_ / r-   c                   U R                  U5      nU R                  U5      nU(       a  U(       a  U R                  U5      S4$ U(       a  U R                  U5      S4$ U R	                  U5      nUR                  [        5      =(       a    US:g  nU(       d2  US:w  a,  SU;   d&  UR                  S5      R                  5       (       d  SnXE4$ )z@Get the index expression string and whether it needs flattening.TFr~  ::-)	r  r  _handle_mixed_indexingr   r  r  r
   lstripisdigit)r%   r  has_indirecthas_iter_varsr  needs_flattens         r*   _get_index_exprPallasKernel._get_index_expr  s    ..u5007M..u5t;;::e$e++++E2I "IIo6M9;MM !Y%%7	)Y-=-=c-B-J-J-L-L$(M++r-   c                   ^ ^ g)a  
Determine if we should use masked ops for this entire kernel.

Masked ops with pl.ds(block_size) flatten tensors to 1D, which works when:
1. We're on GPU (CUDA backend uses Mosaic which requires power-of-2 sizes)
2. All tensors are already 1D (so flattening doesn't change dimensionality)
3. All tensors have the same size (so broadcasting works correctly)

With per-tensor masks, each tensor gets its own mask based on its size.

This should be called once in codegen_kernel() before generating the kernel body.
F)r   updater$   input_buffersr  r  r   rj  r  r  r  tupler  r  	Exceptionr  allr   r  )r%   all_buffer_namesbuf_infor  bufrO   shape
total_sizer  int_sall_same_sizeis_power_of_2has_non_pow2r  dim
first_sizes   `              @r*    _determine_masked_ops_for_kernel-PallasKernel._determine_masked_ops_for_kernel  s    " r-   c                d    XR                   ;  a  SU 3nX R                   U'   U R                   U   $ )z2Get or create a unique mask variable for a buffer.mask_)rp  )r%   r  mask_vars      r*   _get_or_create_mask PallasKernel._get_or_create_maskL  s9    ,,,xj)H*2h'  **r-   c                J    U R                   c  U R                  5       U l         gg)zIInitialize masked ops strategy on first load/store if not yet determined.N)rn  r  r  s    r*   _ensure_masked_ops_initialized+PallasKernel._ensure_masked_ops_initializedS  s$    &"&"G"G"ID 'r-   c                F     [        U 5      $ ! [        [        4 a     gf = f)z0Convert value to int, returning None on failure.N)r  r  r  )r  s    r*   r  PallasKernel._safe_intX  s'    	s8O:& 		s   
   c                    SnU H;  nX0R                   ;   d  M  U R                  U R                   U   5      nUc    gX$-  nM=     U$ )zBCompute total numel for given prefixes (e.g., pointwise prefixes).r   N)numelsr  )r%   prefixesr3   r(   numels        r*   _compute_prefix_numel"PallasKernel._compute_prefix_numel`  sG    AKKt{{1~6=  r-   c                    SnU R                    H:  nUR                  (       d  M  U R                  UR                  5      nUc    gX-  nM<     U$ )zCompute total reduction numel.r   N)r  r  r  r,  )r%   r3   treer,  s       r*   _compute_reduction_numel%PallasKernel._compute_reduction_numelk  sK    $$D   tzz2= % r-   c                   [         R                  R                  U5      nUR                  5       nSnU H  nU R	                  U5      nXFb  UOU-  nM     / nSn[        USS 5      " 5       n	U	(       a  [        U	SS5      OSn
U
b  [        [        U5      5       H'  nU R	                  X   5      nUR                  U5        M)     [        U5      S:X  a  US   b  US   S:w  a  SnO[[        U5      S:  aL  Sn[        [        U5      S-
  S	S	5       H-  nX{   nUb  X:w  a  SnU R	                  X;   5      nUc  M)  X-  nM/     X#XGU4$ )
zRGet buffer metadata (buf_obj, buf_size, buf_numel, actual_strides, is_contiguous).r   NTr  c                     g ra   r<   r<   r-   r*   r  /PallasKernel._get_buffer_info.<locals>.<lambda>  r  r-   r  r   F)	r   rj  r  r  r  r  ranger  r  )r%   rO  r  r  	buf_numelr  svalactual_stridesis_contiguousr  r  iactual_strideexpected_stridedim_sizes                  r*   _get_buffer_infoPallasKernel._get_buffer_infov  sS   ''$$T*##%	A>>!$D!1q8I 
  ",=?8>WVXt4D
!3x=) $z} =%%m4 *
 8}!!!$0^A5F!5K$)MX""#s8}q0"b9A$2$5M$,0P(-#~~hk:H+'3 : )]JJr-   c                    U R                  U5      n/ nU HT  nX@R                  ;   d  M  U R                  U   nU R                  UR                  5      nUc  MC  UR	                  U5        MV     SnU H  nXx-  nM	     Xr4$ )zNCompute expected output numel and used vars from iteration variables in index.r   )r  r  r  r  r  )	r%   r  r  used_range_lengthsr   r  
length_valoutput_numells	            r*    _compute_output_numel_from_index-PallasKernel._compute_output_numel_from_index  s     ,,U3	C+++--c2!^^ELL9
)&--j9  #AL $ &&r-   c                    [        5       nU H[  n[        R                  " X5      n[        R                  " XT5      nUc  SnU R	                  U5      nUR                  Ub  UOU5        M]     U$ )zD
Extract coefficients of iteration variables from index expression.
r   )r   r   r  r  r  add)r%   r  r  coefficientsr   r  r  coefs           r*   _get_index_coefficients$PallasKernel._get_index_coefficients  sj     $.<C*GGSH(@@OF~>>&)DT%5T6B  r-   c                V   S/n[        U5      S:  aR  Sn/ n[        [        U5      S-
  SS5       H1  nUR                  SU5        U R                  X   5      nUc  M-  Xh-  nM3     U(       a  [	        U5      n	U H
  n
X;  d  M
    g   g[	        S U 5       5      nU H
  n
X;  d  M
    g   g)zB
Check if access pattern requires gather (non-standard striding).
r   r6  r   Tc              3  .   #    U  H  oc  M  Uv   M     g 7fra   r<   r  r  s     r*   r  5PallasKernel._check_gather_pattern.<locals>.<genexpr>  s     *Vn11ns   	F)r  r7  insertr  r   )r%   r  r:  r;  rK  expected_stridesr>  r<  r?  expected_stride_setrL  actual_stride_sets               r*   _check_gather_pattern"PallasKernel._check_gather_pattern  s     3x=1O!3x=1,b"5 ''?;>>(+6'#/O	 6 ",-=">$2 %  !+*Vn*V V$0 % r-   c                t  ^  US:w  d  U(       a  X44$ [         R                  R                  U5      nUc  X44$ T R                  U5      u  pgpn
T R	                  U5      u  pT R                  5       nT R                  X,5      nT R                  XyX5      n[        U 4S jU 5       5      n[        U5      [        U5      :  =(       a8    [        U5      S:  =(       a#    US:  =(       a    [        U5      [        U5      :  n[        R                  R                  R                  nU
(       + =(       a    [        S U	 5       5      n[        S U 5       5      nU=(       a    U(       + =(       a    X:H  nUS:  aC  X:w  d  U(       d  U(       a0  [        U5      S:  a!  U(       d  U(       d  T R!                  U5      S4$ X44$ )z
Check if buffer access needs strided indexing due to size mismatch or gather patterns.

This handles cases like:
- Pooling operations where input/output have different sizes
- im2col-like gather patterns
- Transposed or strided buffer access
r~  c              3  V   >#    U  H  nTR                  U5      S :w  d  M  S v   M      g7fr   N)r  )r  r  r%   s     r*   r  7PallasKernel._needs_strided_indexing.<locals>.<genexpr>  s#      OHqq8IQ8NHs   )	)r   r   c              3  (   #    U  H  oS Lv   M
     g 7fra   r<   rQ  s     r*   r  r\    s      <
#1aTM>   c              3  Z   #    U  H!  n[        U[        [        -  5      (       + v   M#     g 7fra   )r   r  r   )r  r'   s     r*   r  r\    s!     U1Jq#+$> > >s   )+T)r   rj  r  r@  rG  r  rM  rW  sumr  r  	_inductorr   _debug_cpu_to_tpu_pallasr  r  r  )r%   rO  r  r  r	  r  r  r  r8  r:  r;  rE  r  all_iter_varsrK  has_non_unit_stridesbuf_effective_dimsnot_all_vars_usedis_tpuis_known_non_contiguoushas_symbolic_coefskip_for_non_contiguouss   `                     r*   _needs_strided_indexing$PallasKernel._needs_strided_indexing  s     ++gg  &;++ !!$' 	D9m #'"G"G"N++-33EE  $99m 

 ! OH OO	NS// /I"/"Q&/ IX.	 	 ''@@&3"3 #
 <
#1<
 9
  UUU#PF
Py7P 	  1*.?CWI"+%//6<<''r-   c                   U(       d  US:X  a  X44$ [         R                  R                  U5      nUc  X44$ UR                  5       n[	        U5      S:X  a  SU4$ [	        U5      S:  a5  U R                  U5      nU(       d  US4$ SU;   a  U R                  U5      S4$ U R                  (       a  SU;   a  U R                  U5      S4$ X44$ )zP
Adjust index expression based on buffer shape (0-dim scalar, multi-dim, etc.).
r~  r   r   Tr  )r   rj  r  r  r  r  r  rm  )r%   rO  r  r  r	  r  r  r  s           r*   _adjust_index_for_buffer_shape+PallasKernel._adjust_index_for_buffer_shape&  s     I.++''$$T*?++##% x=A-'' x=1 44U;M  $&"33E:D@@ ;;49,//6<<''r-   c                   U(       a  U R                  U5      nSU SU S3$ U(       av  U R                  (       a	  SU SU S3$ UR                  [        R                  5      =(       d    UR                  [        R
                  5      nU(       a  SU S3OUn	U SU	 S	3$ U S
U S	3n
US:X  a#  U R                  X#5      (       a  SU
 S3n
SU l        U
$ )z3
Build the load expression based on indexing mode.
zpltriton.load(z.at[pl.ds(block_size)], mask=r"   .at[])r  z).astype(jnp.int64)z[...].flatten()[][r~  zjnp.transpose(T)r"  rm  r  r  MinMaxr  rt  )r%   r  rO  r  r  r	  
use_maskedr!  
has_minmaxidx	load_exprs              r*   _build_load_exprPallasKernel._build_load_exprL  s     //5H#C5(EhZqQQ{{'uD2>> #YYuyy1IUYYuyy5I
<F)$78I.se155 %q1-I E!d&@&@&M&M,YKq9	+/(r-   c                8  ^ UR                  S5      (       d  U$ [        U4S jU R                  R                   5       5      nU(       aP  [        R
                  R                  U5      mTb.  TR                  5       n[        U5      S:X  a  US   S:X  a  SU S3$ U$ )z
Squeeze (N,1) intermediate buffers when kernel has 1D graph inputs.

This avoids wrong broadcasting: (N,) op (N,1) -> (N,N) instead of (N,)
r  c              3     >#    U  Hh  nUR                  S 5      (       + =(       aF    [        R                  R                  U5      =mSL=(       a    [	        TR                  5       5      S:H  v   Mj     g7f)r  Nr   )r  r   rj  r  r  r  )r  r  r  s     r*   r  BPallasKernel._maybe_squeeze_intermediate_buffer.<locals>.<genexpr>{  sk      
 4 ##E** -GG..x88E-G$$&'1,- 4s   A0A3r   r6  r   zjnp.squeeze(z
, axis=-1))	r  r  r$   r  r   rj  r  r  r  )r%   rO  rz  has_1d_inputr  r  s        @r*   "_maybe_squeeze_intermediate_buffer/PallasKernel._maybe_squeeze_intermediate_bufferq  s     u%%  
 !II33	
 
 gg((.G""++-x=A%(2,!*;))J??r-   c                   [         R                  R                  U5      nUb  [        UR	                  5       5      S:w  a  U$ U R                  UR	                  5       S   5      nUc  U$ UR                  S5      (       a  U$ [         R                  R                  U5      nUb  UR                  (       d  U$ SnU R                  R                   H  n[         R                  R                  U5      n	U	c  M'  [        U	R	                  5       5      S:  d  MF  U	R	                  5        V
s/ s H  oR                  U
5      PM     nn
[        S U 5       5      (       a    OSnM     Ub  [        U5      S::  a  U$ U R                  U5      n[        U5      S:w  a  U$ [        [        U5      5      nXR                  ;  a  U$ U R                  U   nU R                  UR                   5      U:w  a  U$ U R                  R#                  5        VVs/ s H;  u  pU R                  UR                   5      U:X  d  M&  UR$                  (       a  M9  UPM=     nnn[        U5      S:w  a  U$ ['        U5       VV
s/ s H  u  noU:X  d  M  UPM     nnn
[        U5      S:w  a  U$ US   nU[        U5      S-
  :X  a  U$ S/[        U5      -  nSUU'   U SSR)                  [+        [,        U5      5       S	3$ s  sn
f s  snnf s  sn
nf )
zGReshape 1D buffers (e.g., batch norm mean) for higher-dim broadcasting.Nr   r   r  c              3  (   #    U  H  oS Lv   M
     g 7fra   r<   rQ  s     r*   r  :PallasKernel._maybe_broadcast_1d_buffer.<locals>.<genexpr>  s     ;l}lr^  r6  	.reshape(r!   r"   )r   rj  r  r  r  r  r  	get_dtypeis_floating_pointr$   r  r  r  r  r  r  r  r  r  	enumerater  mapr?   )r%   rO  r  rz  r  
buf_lengthr   ref_buf_sizer  	other_bufr  r  used_varr  vematching_varsr<  matching_dimsaxis_posreshape_dimss                        r*   _maybe_broadcast_1d_buffer'PallasKernel._maybe_broadcast_1d_buffer  s    ''$$T*?c'"2"2"45:^^G$4$4$6q$9:
 ??5!!!!$'U%<%< 		//H**84I$Y-?-?-A)BQ)F;D;M;M;OP;Oaq 1;OP;l;;;# 0 3|#4#9 ,,U3	y>QY(000 %%h/>>%,,':5
 --335
5~~ahh':5 >?nn 5 	 

 }" (1'>R'>tq!z/'>R}" #s<(1,,sS..!#XIdiiC0F&G%HJJQ  Q*
 Ss$   &K)	%K.2K.K.-K4=K4c                   US:w  d  U(       a  X#4$ U R                  U5      nU R                  5       nU R                  U5      n[        US5      (       a  UR                  O	[        5       U-  nXv-
  nU(       a  [        U5      S::  a  X#4$ Sn	U R                  R                  5        H  u  pU R                  U5      nU(       d  M  X:w  a  M&  U R                  U5      n[        US5      (       a  UR                  O	[        5       U-  nX:w  d  Xv:X  a  Mm  U R                  XU5      (       a  M  Sn	  O   U	(       a  U R                  U5      S4$ X#4$ )z
Check for im2col-like patterns where store uses block variables but load doesn't.

For cat/expand patterns, both load and store prepared indices share block vars.
For im2col patterns, store compresses to block vars but load doesn't.
r~  r  r   FT)r   r  r  hasattrr  r   r  rr  r  _check_load_is_strided_inputr  )r%   r  r  r	  r  r  store_orig_varsstore_prep_varsnew_varshas_im2col_patternr  
load_indexload_orig_vars	prep_loadload_prep_varss                  r*   _check_im2col_pattern"PallasKernel._check_im2col_pattern  sl    ++..u5'')	2259 ~~66 ''	
 #4 3/14++ #$($9$9$?$?$A H!55jAN! 0 --j9I 9n55 &&\	N /?3U 44n  &*"5 %B8 //?EE''r-   c                `   [         R                  R                  U5      nUc  g[        USS 5      " 5       nUc  g[        USS5      nUc  gUR	                  5       n/ nU H[  n	[
        R                  " X)5      n
[
        R                  " X5      nUc  M4  U R                  U5      nUR                  Ub  UOU5        M]     [        5       n[        U5       HJ  u  pU R                  X~   5      nUb  US:  d  M#  U R                  U5      nUR                  Ub  UOU5        ML     [        U5      U:H  $ )zL
Check if load coefficients match buffer strides (strided input vs im2col).
NFr  c                     g ra   r<   r<   r-   r*   r  ;PallasKernel._check_load_is_strided_input.<locals>.<lambda>      Dr-   r  r   )r   rj  r  r  r  r   r  r  r  r  r   r  rJ  )r%   r  r  r  r  r  buf_strides	buf_sizesload_coeffsr   r  rL  int_coefbuf_stride_setr<  r  r?  r  s                     r*   r  )PallasKernel._check_load_is_strided_input
  s     gg  *;lL9;>fh5LLN	 !C*GG
XH&>>xMD>>$/""x/C8N " $k*DA~~il3H8a<q)""E,=51E	 + +&.88r-   c                ^   U R                   (       a  g[        R                  R                  U5      nUc  g[	        USS 5      " 5       nUc  g[	        USS5      nUc  gUR                  5       n[        U5      S:w  d  [        U5      S:w  a  gU R                  US   5      nU R                  US   5      nU R                  US   5      nU R                  US   5      n	Ub  U	b  X:  a  Ub  Ub  US:  a  US:  d  gU R                  R                   H  n
[        R                  R                  U
5      nUc  M'  [	        USS	 5      " 5       nUc  M?  [	        USS5      nUb  [        U5      S:w  a  M`  U R                  US   5      nU R                  US   5      nUc  M  Uc  M  X:  d  M    g   g
)z
Check if output needs transpose for column-major storage.

Transpose on store is needed when:
- Output has column-major stride (s0 < s1)
- But input(s) have row-major stride
- And we haven't already transposed on load
FNr  c                     g ra   r<   r<   r-   r*   r  ;PallasKernel._check_store_needs_transpose.<locals>.<lambda>B  r  r-   r  r   r   r   c                     g ra   r<   r<   r-   r*   r  r  d  s    r-   T)
rt  r   rj  r  r  r  r  r  r$   r  )r%   rO  r  r  r  r  r  r  r  r  inp_nameinp_buf
inp_layout
inp_strideinp_s0inp_s1s                   r*   _check_store_needs_transpose)PallasKernel._check_store_needs_transpose2  s    ##gg  &;lL9;>VXt4
<<>z?a3x=A#5x{+x{+^^JqM*^^JqM* N!!		 		//Hgg((2G ,EGJ! Xt<J!S_%9^^JqM2F^^JqM2F!f&8V_ 0 r-   c                r    U(       a  U SU SU SU SU S3
$ U SU SU SU SU SU SU S	U S
U SU S3$ )zv
Build store expression for full array assignment.

Handles scalar broadcast, shape matching, and optional transpose.
[...] = (jnp.full(.shape, ) if jnp.asarray(z+).ndim == 0 else jnp.transpose(jnp.asarray(r   z/).ndim == 0 else (jnp.broadcast_to(jnp.asarray(r   z.shape) if jnp.asarray(z
).size != z.size else jnp.asarray(
).reshape(z	.shape)))r<   )r%   outvalueneeds_transposes       r*   _build_full_array_store_expr)PallasKernel._build_full_array_store_exprq  s     % 5/@ H227= % 5/@ H66;WCu E""'
3% 8$$)7*SE	Dr-   c	                   U(       a  U R                  U5      n	SU SU SU	 S3$ US:X  a#  U R                  U5      n
U R                  XU
5      $ U(       a;  US:X  a  SOSnU R                  (       a  SU S	U S
U S3$ U SU SU SU SU SU S3$ U R	                  U5      n[
        R                  R                  U5      nUbG  UR                  5       n[        U5      S:  a(  U R                  U5      (       d  U R                  XS5      $ U(       a]  US:X  a  SOSnSU SU SU SU S3	nUS:X  a4  U R                  R                  U5        U S3nU SU SU SU SU SU S3$ U SU SU 3$ U SU SU 3$ )zi
Build the store expression based on indexing mode.
mode can be None (set) or "atomic_add" (accumulate).
zpltriton.store(z.at[pl.ds(block_size)], z, mask=r"   r~  
atomic_addrJ  setrq  z], jnp.asarray(r   [...] = z[...].flatten().at[(z).flatten()].z(jnp.asarray(z).flatten()).reshape(.shape)r   Fz
(jnp.full(r  r  z).ndim == 0 else _aliasr  z.flatten()).reshape(rt  z] = )r"  r  r  rm  r  r   rj  r  r  r  r  rs  rJ  )r%   r  rO  r  r  r  r	  rw  moder!  r  
scatter_opr  r  r  
value_expralias_params                    r*   _build_store_exprPallasKernel._build_store_expr  s    //5H!#&>ugWXJVWX "??EO44SQQ"&,"6EJ{{(T)OE7RTUU e8C5(<YK}U_T` a##(')>se7L ..u5gg  &?||~H8}q )A)A%)H)H88UKK"&,"6EJYKxw 7""'(9%C  |#&&**3/!$Vne8K=0DYK}]g\hhi!l"6se7D
 a	{$zl;;a	{$ug..r-   c           
     f   UR                  SS5      nU R                  R                  U5        U S3nUS:X  a  SOSnU(       aw  US   n	US   n
US	   n/ n[        [	        U5      5       H,  nX:X  a  UR                  U	5        M  UR                  S
5        M.     SR                  U5      nU SU SU SU SU S3
$ US   n	US   nUS   n[        R                  R                  U5      nUb  [	        UR                  5       5      OSn[	        U5      [	        U5      -   n[	        U R                  5      nUS-
  nUU:H  =(       a    UU:H  nU(       a{  U VVs/ s H  u  nnUPM
     nnn[	        U5      n[	        U5      nUS:  a  US:  a  SU-  nSU-  nU	 SU SU S3nOU	nUR                  U5        UR                  S U 5       5        O;U Vs/ s H  nSPM     nnUR                  U	5        UR                  S U 5       5        SR                  U5      nU SU SU SU SU S3
$ s  snnf s  snf )zBBuild store expression for scatter operations (indirect indexing).is_point_scatterFr  r  rJ  r  indirect_varindirect_dimoutput_shape0r!   r  z	[...].at[z].r  r"   dims_before
dims_afterr   r   None, , Nonert  r~  rs  c              3  *   #    U  H	  u  pUv   M     g 7fra   r<   )r  var_namerO   s      r*   r  9PallasKernel._build_scatter_store_expr.<locals>.<genexpr>  s     IjNHxjs   :c              3  &   #    U  H  nS v   M	     g7f)r  Nr<   )r  r  s     r*   r  r    s     7JqsJs   )getrs  rJ  r7  r  r  r  r   rj  r  r  r  extend)r%   r  r  scatter_inforO  r  r  r  r  r  r  r  index_partsr  index_tupler  r  r  output_ndimnum_iter_vars_in_storetotal_kernel_iter_varsremaining_dimsis_element_wiser  rO   	n_leading
n_trailingleading_onestrailing_nonesindirect_reshapedr  s                                  r*   _build_scatter_store_expr&PallasKernel._build_scatter_store_expr  s    (++,>F 	""3'Vn #l2U
'7L'7L'7L KS./&&&|4&&s+	 0 ))K0KU(;-yR
|STUZT[[\]] $N3"=1!,/
 gg  &-0_c#,,.)!!$[!1C
O!C!$T%:%:!;$q #n4 A&*@@ 	
 :EF+$8+KF K(IZJ1}a')3!)J!6'3nAl^3~FVVW$X!$0!01IjII )4413K4|,7J77ii,e8K=	+bAeWTUV	
+ G  5s   H(H.c                H   U R                   R                  U5      n[        R                  R	                  U5      nX R
                  U'   U R                  5         U R                  U5      u  pVU R                  XXV5      u  pVU R                  XXV5      u  pVUS:H  =(       a    U(       + =(       a    U R                  SL nU R                  X1X%Xg5      nU(       d)  US:X  a#  U R                  X5      nU R                  XU5      nU R                  R                  U R                   UUS9$ )Nr~  Tr   )r$   rN  r   rj  r  rr  r%  r
  rk  rn  rn  r{  r  r  r   r   r   )	r%   rO  r  r  r   r  r	  rw  rz  s	            r*   loadPallasKernel.load  s%   iiood#!!$' ',d#++- $(#7#7#> 	 $(#?#?$
 	
 $(#F#F$
 	 T}#4T9L9LPT9T 	
 ))u
	
 e!3??PI77YOIxx  LL ! 
 	
r-   c                8	  ^^$^% U R                  T5      n[        U5      S:X  a  U R                  T5      $ U4S jn[        X#SS9nU Vs/ s H
  oS" U5      PM     nnU R                  U R	                  T5      5      nU R                  T5      nU V	s/ s H  n	[        U	5      PM     n
n	U Vs0 s H  n[        U5      U" U5      _M     nn[        U5      S:X  a  [        U
5      S:X  a  US   n[        U5      nXPR                  ;   =(       a    U R                  U   R                  nU(       ad  XPR                  ;   aS  U R                  U   nUR                  nU R	                  U5      nSU R                  U5       S3nUR                  UU5      nU$ Sn[        U
5      S:  a!  U R                  5       U-
  n[        U5      S:H  nU(       a  S[        U5      -   nU
 H+  nS	[        U5      -  nU S
U S3nUR                  UU5      nM-     [        U5       H  u  nn[        U5      nXPR                  ;   d  M"  U R                  U   nUR                  nU R	                  U5      nS/U-  nU R                  U5      UUS-   '   SR                  U5      nSU R                  U5       SU S3nUR                  UU5      nM     U$ / nU H  nUR                  U" U5      SU45        M     U H  n	UR                  U" U	5      SU	45        M     UR                  S SS9  [        U5       H  u  nn[        U5      nXPR                  ;   d  M"  U R                  U   nUR                  nU R	                  U5      nU" U5      m%SU R                  U5       S3n[!        U%4S jU 5       5      n[!        U%4S jUR#                  5        5       5      nUU-   nUS:  a  SU-  n U SU  S3nUR                  UU5      nM     U
 H  nUU   m$[!        U$4S jU 5       5      n![!        U$4S jU 5       5      nU!S:  a  US:  a  SU!-  n"SU-  n#U SU" SU# S3nO*U!S:  a  SU!-  n"U SU" S3nOUS:  a  SU-  n#U SU# S3nOUnUR                  UU5      nM     U$ s  snf s  sn	f s  snf )a9  
Handle indexing with both indirect variables and iteration variables.

For example, x[indices, :] generates index = i0 + stride * tmp0
where tmp0 is loaded from indices and i0 is the iteration variable.

We need to convert this to JAX advanced indexing with proper broadcasting.
When there are multiple iteration variables, they need different shapes
to form an outer product (grid) rather than broadcasting together.

Special case: For gather operations where a single iteration variable
and single indirect variable have the same extent, they should be
element-wise aligned, not broadcast into an outer product.

PyTorch advanced indexing semantics: When multiple indirect indices have
the same shape, they are paired element-wise (not outer product), and
the combined result dimension appears at the FRONT of the output.
r   c                   > TR                  U 5      nUS:X  a  [        R                  " TU 5      n [        U5      $ ! [        [
        4 a    [        S5      s $ f = f)z>Extract the coefficient of a variable in the index expression.r   inf)r  r  diffr  r  r  r   )r   r  r  s     r*   r  <PallasKernel._handle_mixed_indexing.<locals>.get_coefficientc  sU    KK$Ez

5#.$5z!z* $U|#$s   
< AATkeyreverser   jnp.arange(r"   Fz, 1z.reshape(-11r!   r  r  indirectc                    U S   $ )Nr   r<   rj   s    r*   r  5PallasKernel._handle_mixed_indexing.<locals>.<lambda>  s    !A$r-   c              3  6   >#    U  H  oT:  d  M
  S v   M     g7fr[  r<   r  r'   	var_coeffs     r*   r  6PallasKernel._handle_mixed_indexing.<locals>.<genexpr>  s     %NAIaa   		c              3  6   >#    U  H  oT:  d  M
  S v   M     g7f)r   Nr<   r  s     r*   r  r    s      *7!y=AA7r  r  z[:rs  c              3  6   >#    U  H  oT:  d  M
  S v   M     g7fr[  r<   r  r'   indirect_coeffs     r*   r  r    s     I{!.6HAA{r  c              3  6   >#    U  H  oT:  d  M
  S v   M     g7fr[  r<   r  s     r*   r  r    s     J1>7IQQr  r  rt  r~  z...]z[...)r  r  r   sortedr   r  r?   r  r  r  replacer  r  r  r  sortr`  values)&r%   r  used_iter_vars_setr  used_iter_varsr   iter_coeffsr  indirect_var_symssymindirect_varsr  indirect_coeffsr  is_reduction_varrange_entry
range_sizerenamed_sizearange_exprpaired_indirectunused_iter_varsn_output_dimsr  trailing_onesreshape_exprr<  shape_parts	shape_strall_componentsn_trailing_itern_trailing_indirectr  trailing_dimsr  leading_nonesr  r	  r  s&    `                                  @@r*   r  #PallasKernel._handle_mixed_indexingI  s    & "55e<!"a'::e$$	$   2QUV7EF~s+~F JJt33E:;	 33E:->?->cS->? @QQ?P!3q6?1#55?PQ ~!#M(:a(? #C3xH,,,X1F1Fs1K1X1X   ///"&"7"7"<K!,!3!3J#'#7#7
#CL$/

<0H/I"KK ) 1 1(K HI    }!#2247II ""23q8O N 33M !. %N(; ;".{=/K%--lLI	 !. $N33s8///"&"7"7"<K!,!3!3J#'#7#7
#CL $'%-"7K)-L)AKA& $		+ 6I%djj&>%?z)TUV   !* 1 1(K HI# 4& 
 !C!!?3#7"EF "$C!!?3#7S"IJ %=  /FAs3xH+++"33C8(//
#33J?+C0	 +DJJ|,D+EQG #&%N%N"N&) *.557* '# -/BB
>$,z$9M%0MM?!"DK%--hD	5 0: *L,\:N I{IIIJJJJ 1}a (9 4!)J!6".qs>BRRSTQ (9 4".qtDa!)J!6".tN3C1E+!)),EI+ *. } G
 @ Rs   	RR,Rc                   Ub  US:w  a  [        SU S35      eU R                  R                  U5      nU R                  R	                  U5        U R                  5         [        R                  R                  U5      nUS L=(       a    [        UR                  5       5      S:H  nU(       a  U SU SU SU SU S	U S
3nOU R                  X!5      n	U	b  U R                  XSXU5      nOcU R                  U5      u  pU R                  X*U5      u  pU
S:H  =(       a    U(       + =(       a    U R                  SL nU R!                  XQX#XX5      nU R"                  R%                  U5        U R&                  R)                  XX45        g )Nr  zpallas store mode 'z' not supportedr   r  r  r  z).ndim == 0 else jnp.asarray(r  .shape))r~  T)rd   r$   outputstore_buffer_namesrJ  r%  r   rj  r  r  r  _detect_scatter_patternr  r
  r  rn  r  stores	writelinerq  r  )r%   rO  r  r  r  r  r  	is_scalar
store_exprr  r  r	  rw  s                r*   storePallasKernel.store	  s   
  4 3D6IJJiit$##D)++- gg  &tO@CLLN(;q(@	 % 5/@ H$$)7*SEC   77DL'!;;D

 ,0+?+?+F(	 ,0+E+Em,(	 & 4))4++t3  "33uYz
 	j)%%s&78r-   c                    UR                  U5      nUS:X  a  [        R                  " X5      n [        U5      $ ! [        [
        4 a     gf = f)z=Get integer coefficient of a variable in an index expression.r   )r  r  r  r  r  r  )r%   r  r   r  s       r*   _get_index_coefficient#PallasKernel._get_index_coefficientI	  sK    C A:JJu*E	u::& 		s   
: AAc                   U R                  U5      n[        U5      S:w  a  gUS   n[        U5      nU R                  X5      nUS:X  a  gU R	                  U5      (       d  U R                  X%U5      $ U R                  XU5      $ )zDDetect scatter operation pattern. Returns scatter info dict or None.r   Nr   )r  r  r?   r4  r  _detect_point_scatter_detect_iter_scatter)r%   r  output_nameindirect_symsindirect_symr  r	  s          r*   r,  $PallasKernel._detect_scatter_patternS	  s     //6}"$Q'<(44UIQ ''..--kXX ((nMMr-   c                   U(       d  g [         R                  R                  U5      nUR                  5        Vs/ s H  n[	        U5      PM     nn[        U5      S:  a  gSn[        U5      S-
  n[        [        U5      S-
  SS5       H  n	X7:X  a  U	n  OXvU	   -  nM     UU/ / SUS.$ s  snf ! [
         a     gf = f)z&Detect single-element scatter pattern.Nr   r   r6  Tr  r  r  r  r  r  )r   rj  r  r  r  r  r  r7  )
r%   r9  r  r	  r  r  r  
cumulativer  r  s
             r*   r7  "PallasKernel._detect_point_scatterh	  s     	''$$[1C,/LLN;NqCFNL; |q  
<(1,\*Q.B7C+"s++J	 8 )( $(
 	
! < 		s"   1B0 B+B0 +B0 0
B=<B=c                  ^ U R                  U5      n/ nU Hv  nU R                  X5      nUS:  d  M  X`R                  ;   d  M-  U R                  U R                  U   R                  5      nUc    gUR                  [        U5      Xx45        Mx     UR                  TUS45        UR                  S SS9  [        U4S j[        U5       5       S5      n	U	c  gSn
[        XYS-   S 5       H  u  pnXz:w  a    gX-  n
M     X::w  a  gTU	USU	  VVVs/ s H	  u  poU4PM     snnnXYS-   S  VVVs/ s H	  u  poU4PM     snnnS	SS
.$ s  snnnf s  snnnf )z0Detect scatter pattern with iteration variables.r   Nr6  c                    U S   $ )Nr   r<   rj   s    r*   r  3PallasKernel._detect_iter_scatter.<locals>.<lambda>	  s    AaDr-   Tr  c              3  D   >#    U  H  u  nu  n  o2T:X  d  M  Uv   M     g 7fra   r<   )r  r<  rO  r  r  s       r*   r  4PallasKernel._detect_iter_scatter.<locals>.<genexpr>	  s$     R&9?1ltQ\=QQQ&9s    	 r   Fr>  )r  r4  r  r  r  r  r?   r  r  r  r  )r%   r  r  r	  r  all_varsr   r  r  indirect_posexpectedr  r  rF  s     `           r*   r8  !PallasKernel._detect_iter_scatter	  sz    11%8 !C//;EqyS$9$99(=(=c(B(I(IJ>S5 9: " 	~r:;.$7 Ri&9R
   (2B2D)E FAf H !G % )(2:=L2IJ2IwqQF2IJ19:J:L1MN1MgaAq61MN % 
 	
 KNs   E	0Ec           	     P  ^  T R                   (       d   eUS:X  a  T R                  X5      $ [        U[        5      (       a  [	        S5      eX#U4nUT R
                  R                  ;   a  T R
                  R                  U   $ SSSSSSS	S
.n[        / SQ5      n[        U 4S jU 5       5      nT R                  U5      n	T R                  5       n
[        S T R                  R                  5        5       5      nUS:X  a(  U(       a  U	(       a  U
(       a  SU SU	 S3nGO<SU S3nGO4US;   Ga  Xc   nU=(       a    U	=(       a    U	S:  =(       a    U
nU(       GaW  US:  GaP  SnT R                  (       Ga0  [        [!        T R                  R#                  5       5      5      nT R                  R                  5        VVs/ s H  u  nnUR$                  (       d  M  UPM     nnnU(       a  US   nUR'                  U5      nUS:w  a  T R)                  U5      OSnUc  SnT R                  R                  5        VVs/ s H  u  nnUR$                  (       a  M  UPM     nnnU(       a>  US   nUR'                  U5      nUS:w  a  T R)                  U5      OSnUc  SnUU:  a  SOSnU SU SU S3nOU SU S3nOX6;   as  U=(       a    U	SL=(       a    U	S:  =(       a    U
nU=(       a    US:  =(       a    U	SL nU(       a  Xc   nSU SU SU	 SU
 S3	nOFU(       a  Xc    SU S3nO4Xc    SU S3nO)[	        SU S[+        UR-                  5       5       S35      eT R
                  R/                  T R0                  UUS 9nUT R
                  R                  U'   U$ s  snnf s  snnf )!a  
Generate code for reduction operations in JAX/Pallas.

Reductions in Pallas work by:
1. Loading the input data into the kernel
2. Applying JAX reduction operations (jnp.sum, jnp.max, etc.)
3. Storing the reduced result

The reduction happens over the loaded block of data.
welford_reducezHTuple reductions (e.g., welford_combine) not supported in Pallas backendzjnp.sumzjnp.prodzjnp.maxzjnp.minzjnp.anyz
jnp.argmaxz
jnp.argmin)r`  prodmaxminr  argmaxargmin)rk   r  zc              3  @   >#    U  H  oTR                   ;   v   M     g 7fra   )r*  )r  r(   r%   s     r*   r  )PallasKernel.reduction.<locals>.<genexpr>	  s     I6H,6Hs   c              3  P   #    U  H  u  pUR                   (       d  M  S v   M     g7fr[  r  )r  r   r  s      r*   r  rS  	  s       
 =*#ASASAA =s   &	&xor_sumzjnp.bitwise_xor.reduce(r  z, -1), axis=-1)r"   )rO  rP  r   r   r6  Nr  z, axis=z_pallas_partial_reduce(r!   z	, axis=0)zReduction type 'z8' not yet supported in Pallas backend. Supported types: z	, xor_sumr  )inside_reductionwelford_reduce_fallbackr   r  rd   r   reduction_cacher   r  r-  r1  r`  r  r  rr  r  r  r  r  r  r  r  r  r   r   )r%   r   r   reduction_typer  	cache_keyreduction_opspointwise_prefixeshas_pointwisepointwise_numelreduction_numeln_reduction_dimsreduction_exprreduction_opis_partial_reductionreduction_axisr  r   r  reduction_varsr_varr_coeffr_stridepw_varspw_varpw_coeff	pw_strideis_symbolic_partialr3   s   `                            r*   	reductionPallasKernel.reduction	  s/   " $$$$ --//==eU##Z 
 6	00088++I66 ""
 (8I6HII *.)C)CDV)W)-)F)F)H  
 $ 5 5 ; ; =
 
 Y&_#:5'?J[[j!k#:5'!C33 )8L  $#$#a'$ $	 ! $(81(< "$(((!%d4+@+@+G+G+I&J!KJ +/*?*?*E*E*G&*GJC -- *G # &
 & .q 1","2"25"9>El4>>'#:PQ#+'(H /3.C.C.I.I.K#.K
U#(#5#5  .K   #
 #%,QZF'1'7'7'?HDLPQMx(@WXI(0,-	 3;Y2FQBN$0>5'@PPQ!R %1>5'!;,  $#4/$#a'$ $	 ! R"2Q"6R?d;R   $  -<#:<.5'QSTcSddfgvfwwx!y$$1$A#B!E7)!T %2$A#B!E7!!L">"2 3$$(););)=$>#?yJ  ""LL # 
 /5  +U&#s   ?NN<N"N"c                    [         R                  R                  U 5      nUR                  5       nUR	                  5       $ ra   )r   rj  r  r  r;  )buffer_namer  r  s      r*   _buffer_is_contiguous"PallasKernel._buffer_is_contiguousW
  s1    gg  -!##%%r-   c                n9  ^!^X^Y [        5       nU R                  R                  5       u  p4  nU Vs/ s H  ofR                  PM     nnU Vs/ s H  oR	                  S5      (       d  M  UPM     n	nU Vs/ s H  oR	                  S5      (       d  M  UPM     n
n[        U R                  R                  R                  5       5      nU Vs/ s H  oU;   d  M
  UPM     nnU
(       d  [        S5      eU R                  R                  R                  5        VVs0 s H  u  p[        U[        5      (       d  M  X_M      nnnU=(       d    Sn[        R                  R                  5       R                   S:H  n["        R$                  R&                  R(                  nU(       aN  ["        R$                  R&                  R*                  (       d  [        S5      e[-        5       (       d  [        S5      eU(       a  SOS	nS
U(       d  SOS-   nUR/                  USS9  0 nU	 H[  nUR1                  U5      nUSL=(       a    U R3                  U5      nUU R4                  ;   nU(       + =(       a    U=(       d    UUU'   M]     U	 Vs/ s H  nUU   (       d  M  U S3PM     nnU Vs/ s H  oR	                  S5      (       d  M  UPM     nnUU-   nUU-   n[        UR                  5        VVs/ s H  u  nnU(       a  M  UPM     snn5      nU(       a  [7        [9        [;        U
5      5      5      n O([=        U
5       V!Vs/ s H  u  n!nUU;   d  M  U!PM     n n!nUU l        [        5       n"U"RA                  5          U RB                  (       GaT  U RD                  (       GaB  0 n#U R                  RF                  R                  5        H!  u  p[        U[        5      (       a  UOUU#U'   M#     U R                  R                  R                  5        H!  u  p[        U[        5      (       a  UOUU#U'   M#     [I        U RD                  R                  5       5       H  u  n$n%U#R1                  U$U$5      n&Sn'U H  nU&U:X  d  U$[        U5      ;   d  M  Un'  O   U'(       d  ME  U"RK                  SU$ 35        U"RK                  U% SU' S35        U"RK                  U% SU% S35        M     U RL                  (       Ga	  U RB                  (       Gd  U RN                  (       Gd  U"RK                  S5        [        U RL                  R                  5        V(s/ s HI  n([        U(RP                  [R        [T        RV                  45      (       d  M4  [S        U(RP                  5      PMK     sn(5      mYUY4S jn)/ n*U
(       a,  UR1                  U
S   5      n$U$(       a  U*RY                  U$5        U*R[                  U R                  RF                  5        Su  n+n,U* H  nU)" U5      n-U-S   (       d  M  U-u  n+n,  O   [7        U RL                  R                  5       5      n./ n/Sn0[=        U.5       HF  u  m!u  n1n2U R]                  U2RP                  5      n3U3b
  U3U,:X  a  T!n0M1  U/RY                  T!U1U2U345        MH     [;        U/5      n4[=        U.5       GH>  u  m!u  n1n2[        U15      n5U2RP                  n6U R_                  U65      n7U Ra                  U75      n8U R]                  U65      n3U3c  U+(       a  U4S:  a  T!U0:w  a  [c        U!4S j[=        U/5       5       S5      n9U9b  [e        S U/ 5       5      n:[e        S U/ 5       5      n;U:=(       a    U;n<U<(       a  U9n=OU4S-
  U9-
  n=S/U4-  n>U8U>U='   S Rg                  U>5      n?S!U8 S"3n@U"RK                  U5 S#U@ S$U? S"35        GM  U"RK                  U5 S%U8 S"35        GM&  U+(       aP  [;        U+5      S:  aA  U3U,:X  a;  S Rg                  S& U+ 5       5      n?S!U8 S"3n@U"RK                  U5 S#U@ S$U? S"35        GM}  U4S:  a  T!U0:w  a  [c        U!4S' j[=        U/5       5       5      n9[e        S( U/ 5       5      n:[e        S) U/ 5       5      n;U:=(       a    U;n<U<(       a  U9n=OU4S-
  U9-
  n=S/U4-  n>U8U>U='   S Rg                  U>5      n?S!U8 S"3n@U"RK                  U5 S#U@ S$U? S"35        GM'  U"RK                  U5 S%U8 S"35        GMA     U Rh                  Rj                   H  nAU"RK                  [        UA5      5        M     SSS5        U R                  R                  5       u  p4  nU Vs/ s H  ofR                  PM     nn[        U R                  R                  R                  5       5      nU Vs/ s H  oU;   d  M
  UPM     nnU Vs/ s H  oR	                  S5      (       d  M  UPM     nnUU-   nUU-   nS*U S+S Rg                  U5       3U RB                  (       a  S,OS-   S--   nBURK                  UB5        URA                  5          U"Rj                   HT  nA[        UA[        5      (       a!  URK                  WARm                  5       5        M9  URj                  RY                  WA5        MV     U Rn                   H  u  nCnDUCU;   d  M  URK                  WD5        M!     SSS5        U S.3nE/ nFS/[;        U5      -   nG[=        U5       H8  u  m!nUU;   d  UR	                  S05      (       d  M$  WFRY                  T!WG-   5        M:     WF(       a  S1S Rg                  S2 WF 5       5      -   S3-   nHOS4nH[7        [9        S/[;        U5      -   5      5      nIS1S Rg                  S5 UI 5       5      -   S3-   nJURK                  S6UJ S7WH S"35        S8S9/U-   U-   nKURK                  S*WE S1S Rg                  UK5       S-35        URA                  5          URK                  S:5        URK                  S;5        URK                  S<5        URK                  S"5        U RB                  (       a  URK                  S=5        URK                  S>5        URK                  S?5        U H  nURK                  S@U SA35        M     URK                  SB5        URK                  SC5        URK                  SD5        URK                  SE5        URK                  SF5        / nL[=        U
5       H  u  nMnUR	                  S5      (       aD  UR1                  USG5      (       a+  U S3nNURq                  UN5      nOWLRY                  UOWM45        M^  M`  URq                  U5      nOWLRY                  UOWM45        M     S Rg                  SH WL 5       5      nP/ nQU H  nRWQRY                  UR SIUR 35        M     U RB                  (       a  WQRY                  SJ5        WQ(       a  SKU SLS Rg                  WQ5       SM3nSOU SN3nSU RN                  (       G	ae  URK                  SO5        URK                  SP5        [=        U5       H  u  nTnURK                  SQU SA35        M     URK                  SB5        URK                  SR5        URK                  SS5        URK                  ST5        URK                  SU5        URK                  SV5        URK                  SW5        URK                  S5        URK                  SX5        URK                  SY5        URK                  SZ5        URK                  S[5        [=        U5       H  u  nTnURK                  S\UT S#U S35        URK                  S]UT S^UT S_35        URK                  S`UT SaUT Sb35        URK                  ScUT S#U Sd35        URK                  SeUT SfUT SgUT ShUT Si3	5        URK                  SjUT S"35        URK                  Sk5        URK                  SlU Sm35        M     URK                  Sn5        URK                  So5        URK                  Sp5        URK                  Sq5        URK                  Sr5        URK                  Ss5        URK                  St5        URK                  Su5        URK                  Sv5        URK                  Sw5        URK                  Sx5        URK                  Sy5        URK                  Sz5        URK                  S{5        URK                  S|5        URK                  S}5        URK                  S~WS-   5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  Sx5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        [=        U5       H  u  nTnURK                  SUT S#U S35        URK                  SUT S^UT S_35        URK                  SUT SaUT Sb35        URK                  SUT S#U Sd35        URK                  SUT SfUT SgUT ShUT Si3	5        URK                  SUT S"35        URK                  Sx5        URK                  SU Sm35        M     URK                  S~5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S~5        URK                  S5        URK                  SWS-   5        URK                  S5        URK                  S5        URK                  S5        URK                  Sk5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S~5        URK                  S5        URK                  S5        [=        U5       H  u  nTnURK                  SUT SU S35        URK                  SUT S35        URK                  SUT Sb35        URK                  SUT SUT SgUT S35        URK                  SUT S"35        URK                  Sx5        URK                  SUT S"35        M     URK                  S~5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  Sy5        URK                  Sz5        URK                  S5        URK                  S~5        URK                  S5        URK                  SWS-   5        URK                  S5        URK                  S5        URK                  S~5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        URK                  S5        OURK                  S5        URK                  SWS-   5        URK                  S5        URK                  SU S35        URK                  S5        URK                  WL(       a  SWP S3OS5        URK                  S5        U(       a$  URK                  SS Rg                  U5       S35        URK                  S"5        SSS5        U S3nUURK                  S*UU S1S Rg                  U5       S35        URA                  5          URK                  S5        URK                  S5        URK                  S5        U(       aQ  URK                  S5        U H:  nNU(       a  URK                  WN SUN S35        M#  URK                  WN SUN S35        M<     URK                  S5        U HR  nVUVR	                  S05      (       d  M  U(       a  URK                  WV SUV S35        M;  URK                  WV SUV S35        MT     URK                  S5        U H|  nVUVR	                  S5      (       d  M  U(       a  URK                  WV SUV S35        M;  U RB                  (       a  URK                  WV SUV S35        Me  URK                  WV SUV S35        M~     URK                  S5        U RB                  (       a<  URK                  SS Rg                  U
 Vs/ s H	  nSU S3PM     sn5      -   S3-   5        O;URK                  SS Rg                  U
 Vs/ s H	  nSU S3PM     sn5      -   S3-   5        URK                  SS Rg                  U
 Vs/ s H	  nSU S3PM     sn5      -   S3-   5        0 mXU H  nNUN S3TXUN'   M     U H  nVUV S3TXUV'   M     S8S9/nWUWR[                  U5        UWR[                  UX4S jU 5       5        URK                  SWE S1S Rg                  UW5       S"35        U (       a  URK                  S5        U  H~  m!U
T!   nU(       a+  URK                  ST! S35        URK                  U S35        M:  U RB                  (       a  URK                  U ST! SU S35        Mg  URK                  U ST! S35        M     SSS5        URs                  5       $ s  snf s  snf s  snf s  snf s  snnf s  snf s  snf s  snnf s  snn!f s  sn(f ! , (       d  f       GN= fs  snf s  snf s  snf ! , (       d  f       GN= f! , (       d  f       GN>= fs  snf s  snf s  snf ! , (       d  f       URs                  5       $ = f)a}  
Generate the complete Pallas kernel code as a Python string.

This includes:
- Import statements for JAX/Pallas
- The kernel function that operates on refs
- The main wrapper function that handles PyTorch<->JAX conversions via DLPack

Args:
    name: Optional kernel name (will use placeholder if not provided)

Returns:
    str: Complete Python source code for the Pallas kernel
out_ptr)ru  
in_out_ptrz2Pallas backend requires at least one output buffer<KERNEL_NAME>cpuzBPallas backend currently only supports using the first JAX device.zPALLAS_TARGET_TPU is set, but no TPU device was found. Please make sure that you have a TPU available and that JAX is configured correctly.r   r  a'  
import functools
import math
import torch
import jax
import jax.numpy as jnp
from jax.experimental import pallas as pl
from torch._inductor.runtime.runtime_utils import torch_dtype_to_jax_runtime
def _pallas_partial_reduce(reduce_fn, v, pw_numel, red_numel):
    # Helper for partial reductions: reorders axes and reduces
    # Returns result with keepdims-style shape for proper in-kernel broadcasting
    shape = tuple(v.shape)
    # Find contiguous axes whose product = red_numel (search from right)
    red_axes = None
    for i in range(len(shape) - 1, -1, -1):
        prod = 1
        for j in range(i, -1, -1):
            prod *= shape[j]
            if prod == red_numel:
                red_axes = list(range(j, i + 1))
                break
        if red_axes is not None:
            break
    if red_axes is None:
        red_axes = [len(shape) - 1]
    # Build output shape with 1s for reduced dimensions (keepdims style)
    out_shape = tuple(1 if i in red_axes else s for i, s in enumerate(shape))
    # Move pointwise axes to front, reduction axes to back
    pw_axes = [i for i in range(len(shape)) if i not in red_axes]
    reordered = jnp.moveaxis(v, pw_axes, list(range(len(pw_axes))))
    result = reduce_fn(reordered.reshape(pw_numel, red_numel), axis=-1)
    return result.reshape(out_shape)
z8
from jax.experimental.pallas import mosaic_gpu as plgpur  TstripNr  )rv  in_ptrz# Mask for z_size = z.sizez = jnp.arange(block_size) < _sizez*# Define iteration variables as JAX arraysc                  > [         R                  R                  U 5      nUb  [        UR	                  5       5      S::  a  g[        S UR	                  5        5       5      n[        R                  " U5      nUT;   a  X#4$ S$ )Nr   NNc              3     #    U  H7  n[        U[        [        R                  45      (       a  [        U5      OUv   M9     g 7fra   )r   r  r  IntegerrQ  s     r*   r  PPallasKernel.codegen_kernel.<locals>._get_nd_shape_if_matches.<locals>.<genexpr>  s5      "!/A #-Qemm0D"E"EA1L!/s   ?A)r   rj  try_get_bufferr  r  r  r   rL  )r  r  r  r,  iter_lengthss       r*   _get_nd_shape_if_matches=PallasKernel.codegen_kernel.<locals>._get_nd_shape_if_matches  sr    ''00:C{c#,,.&9Q&>)! "!$" E !IIe,E-2l-BE>TTr-   r   r~  r   c              3  H   >#    U  H  u  nu  n    nUT:X  d  M  Uv   M     g 7fra   r<   r  r<  vidxr  ry  s       r*   r  .PallasKernel.codegen_kernel.<locals>.<genexpr>R  s.      !">W(:?D!Q'+s{ %&A>W   "	"c              3  ^   #    U  H#  u  p  n[        U5      R                  S 5      v   M%     g7frNr?   r  r  r  r  s      r*   r  r  [  s/      9"6D
a %(F$5$5c$:$:6D   +-c              3  h   #    U  H(  u  p  n[        U5      R                  S 5      (       + v   M*     g7fr  r  r  s      r*   r  r  _  s2      9"6D
a ),A(9(9#(>$>$>6D   02r  r!   r  r"   z = r  z = jnp.arange(c              3  8   #    U  H  n[        U5      v   M     g 7fra   r?   rQ  s     r*   r  r  y  s     -S>Rc!ff>R   c              3  H   >#    U  H  u  nu  n    nUT:X  d  M  Uv   M     g 7fra   r<   r  s       r*   r  r    s.      -6O 2?D!Q#s{ A6Or  c              3  ^   #    U  H#  u  p  n[        U5      R                  S 5      v   M%     g7fr  r  r  s      r*   r  r    s+      1ESzqQCF--c22^r  c              3  h   #    U  H(  u  p  n[        U5      R                  S 5      (       + v   M*     g7fr  r  r  s      r*   r  r    s.      1IW:1AA 1 1# 666r  zdef z_kernel(z, *, block_sizez):_jit_wrapperr   rv  r  c              3  8   #    U  H  n[        U5      v   M     g 7fra   r  r  rk   s     r*   r  r    s     ,L^SVV^r  z,)z()c              3  8   #    U  H  n[        U5      v   M     g 7fra   r  r  s     r*   r  r    s     0PAQr  z+@functools.partial(jax.jit, static_argnums=z, donate_argnums=
out_shapes
out_dtypeszout_specs = tuple(z&    jax.ShapeDtypeStruct(shape, dtype)z3    for shape, dtype in zip(out_shapes, out_dtypes)zE# Calculate block_size aligned to warpgroup size (128) for Mosaic GPUz0# Find maximum flattened size across all tensorszmax_size = 0zmax_size = max(max_size, z.size)zfor shape in out_shapes:zC    tensor_size = shape[0] if len(shape) == 1 else math.prod(shape)z)    max_size = max(max_size, tensor_size)z<# Align to warpgroup size (128) for efficient GPU processingz6block_size = max(128, ((max_size + 127) // 128) * 128)Fc              3  4   #    U  H  u  pU S U 3v   M     g7f)z: Nr<   )r  r<  os      r*   r  r    s     )PK&1QCr!+Ks   =zblock_size=block_sizezfunctools.partial(z	_kernel, z),z_kernel,z7# Check if all tensors have same size (no broadcasting)z_all_sizes = []z_all_sizes.append(z    _numel = 1z    for s in shape:z        _numel *= sz    _all_sizes.append(_numel)z_unique_sizes = set(_all_sizes)zH_can_pad = len(_unique_sizes) == 1 and all(s > 1 for s in _unique_sizes)zif _can_pad:z5    # All tensors same size - safe to flatten and padz!    _orig_out_shapes = out_shapesz    _padded_inputs = []z    _orig_size_z    _aligned_size_z = ((_orig_size_z + 127) // 128) * 128z    if _orig_size_z != _aligned_size_r  z        _flat_z
.flatten()z        _padded_z = jnp.pad(_flat_z, (0, _aligned_size_z - _orig_size_r   z&        _padded_inputs.append(_padded_z	    else:z        _padded_inputs.append(z.flatten())z1    # Align output shapes to warpgroup size (128)z    _aligned_out_specs = []z    _is_scalar_output = []z4    for shape, dtype in zip(out_shapes, out_dtypes):z        _numel = 1z        for s in shape:z            _numel *= sz        if _numel <= 1:zI            _aligned_out_specs.append(jax.ShapeDtypeStruct(shape, dtype))z*            _is_scalar_output.append(True)z        else:z:            _aligned_numel = ((_numel + 127) // 128) * 128zU            _aligned_out_specs.append(jax.ShapeDtypeStruct((_aligned_numel,), dtype))z+            _is_scalar_output.append(False)z2    _aligned_out_specs = tuple(_aligned_out_specs)z    _result = plgpu.kernel(z        z%        out_shape=_aligned_out_specs,z    )(*_padded_inputs)z!    # Remove padding from resultsz&    if not isinstance(_result, tuple):z        _result = (_result,)z    _unpadded_results = []zV    for _res, _shape, _is_scalar in zip(_result, _orig_out_shapes, _is_scalar_output):z        if _is_scalar:z*            _unpadded_results.append(_res)z            _orig_numel = 1z            for _s in _shape:z!                _orig_numel *= _sz:            _unpadded = _res[:_orig_numel].reshape(_shape)z/            _unpadded_results.append(_unpadded)z\    return _unpadded_results[0] if len(_unpadded_results) == 1 else tuple(_unpadded_results)zelse:zA    # Different sizes - check if it's a reduction (scalar output)z    _out_numel = 1z    for s in out_shapes[0]:z        _out_numel *= sz    z    if _out_numel <= 1:zG        # Scalar output (reduction) - pad inputs but keep scalar outputz%        _orig_out_shapes = out_shapesz        _padded_inputs = []z        _orig_size_z        _aligned_size_z        if _orig_size_z            _flat_z            _padded_z*            _padded_inputs.append(_padded_z"            _padded_inputs.append(z#        # Scalar output - don't padz#        _aligned_out_specs = tuple(z.            jax.ShapeDtypeStruct(shape, dtype)z;            for shape, dtype in zip(out_shapes, out_dtypes)z	        )z        _result = plgpu.kernel(z            z)            out_shape=_aligned_out_specs,z        )(*_padded_inputs)z        return _resultzP        # Non-scalar output with broadcasting - broadcast inputs to output shapez%        _target_shape = out_shapes[0]z"        _target_numel = _out_numelz:        # Broadcast and flatten all inputs to target shapez        _broadcasted_z = jnp.broadcast_to(z, _target_shape).flatten()z' = ((_target_numel + 127) // 128) * 128z*        if _target_numel != _aligned_size_z = jnp.pad(_broadcasted_z - _target_numel))z/            _padded_inputs.append(_broadcasted_z5        # Align output shapes to warpgroup size (128)z        _aligned_out_specs = []z8        for shape, dtype in zip(out_shapes, out_dtypes):z            _numel = 1z            for s in shape:z                _numel *= sz6        _aligned_out_specs = tuple(_aligned_out_specs)z%        # Remove padding from resultsz*        if not isinstance(_result, tuple):z             _result = (_result,)z        _unpadded_results = []z;        for _res, _shape in zip(_result, _orig_out_shapes):z`        return _unpadded_results[0] if len(_unpadded_results) == 1 else tuple(_unpadded_results)zreturn pl.pallas_call(z    out_shape=out_specs,z    interpret=,z    grid=(1,),z    input_output_aliases={ z },z    input_output_aliases={},z)(_mainz, stream=None):z/# Enable JAX x64 mode for float64/int64 supportz)jax.config.update('jax_enable_x64', True)zjax.clear_caches()z*# Convert Torch -> JAX for donated outputsz_jax = jax.device_put(z-.cpu().numpy(), device=jax.devices('tpu')[0])z_jax = jax.dlpack.from_dlpack(z
.detach())z+# Convert Torch -> JAX for in-place tensorsz!# Convert Torch -> JAX for inputsr{  z!.detach().contiguous().flatten())z.detach().contiguous())z-# Prepare output metadata from PyTorch tensorzout_shapes = (z(math.prod(z	.shape),)ztuple(r  zout_dtypes = (ztorch_dtype_to_jax_runtime(rR  _jaxc              3  .   >#    U  H
  nTU   v   M     g 7fra   r<   )r  rO  arg_name_maps     r*   r  r  M  s     $XDWD\$%7DWs   zres = z9result_values = res if isinstance(res, tuple) else (res,)z'res_cpu = jax.device_get(result_values[rr  z".copy_(torch.from_dlpack(res_cpu))z'.copy_(torch.from_dlpack(result_values[z]).reshape(r)  z]))):r   r$   python_argdefsrO  r  r   r  r  RuntimeErrorr  r  r   r?   r   rj  rk  rl  r  ra  r   rb  !pallas_take_first_jax_device_onlyr	   splicer  rr  rs  r  r7  r  r  aliasable_out_ptrsindentrn  rp  r  r  r.  r  rm  r  r  r  r  r  r  r  r   r   r  r  r  r   _linesr  rq  r  getvalue)Zr%   rO  codearg_defs	call_argsr  r   kernel_paramsr(   pure_out_paramsoutput_paramssize_var_namessize_var_paramsouterinneroutput_buffer_lookupkernel_nameinterpret_is_cpurg  interpret_literalimportsaliasable_flagsparamrq  r;  
needs_readalias_paramspointer_tailkernel_input_paramsfull_kernel_paramsflagnon_alias_out_setcopy_output_indicesry  kernel_bodybuf_to_paramr  r!  
param_namematching_paramr  r  candidate_buf_namesreshape_target_shapereshape_target_numelr3   r  r  total_var_idxr  r  rD  num_broadcast_dimsr  r  renamed_length
length_strbroadcast_idxhas_reduction_varshas_pointwise_varsis_mixedaxis_idxr   r!  arangelinekernel_signatureru  
store_linejit_wrapper_namedonate_indicesbase_offsetdonate_literalstatic_argnumsstatic_argnums_literalwrapper_paramsalias_pairsout_idx
alias_name	input_idxalias_map_literalpartial_argssv_param
kernel_argr<  	main_nameptrwrapper_call_argsr  r  sZ                                    `                                                      @@r*   codegen_kernelPallasKernel.codegen_kernel]
  s     %)II$<$<$>!Q)12A2&3Om||I7N1mO$
$!5N(OA} 	 
 $DII$6$6$=$=$?@&3KmN7J1mKSTT !%		 8 8 > > @ 
 @%% EL @ 	  
 -o77>>@EEN''@@??))KK"X  "##"k  '7FG D $ HG$
J 	G4(+-$E.2259K't3 8R8R9M $"8"88J%%8=& E" % +:
*9_U=SugV/ 	 
 %
$!5M(NA} 	 
 +\9)M9&$3$9$9$;H$;jdD4T$;H
 "&uS-?'@"A "+=!9#!9IC,, !9   #
 #2
 %&!"""t'8'8'8!$(II$;$;$A$A$CLE3=eS3I3I%uL' %D$(II$<$<$B$B$DLE3=eS3I3I%uL' %E +11B1B1H1H1J*K&Hh!-!1!1(H!EJ%)N*%?h#a&.@-.N!	 + &~#--H:.FG#--'j0@F $--'j(DXJeT! +L0 $$$T-@-@-@%%&RS  * "&!6!6!=!=!?!?A%ahhemm0DE &AHH!? 	U ')# 377a8HIH+228<#**499+B+BC >H:$&:/D5d;FayyEKB,.B	 0 !!6!6!<!<!>?	 "$ $-6y-A)C)'5!%!=J!-*@T2T(+&--sGUJ.OP .B &)%8"-6y-A)C)'5"7|H"\\F%)%9%9&%AN!%N!;J!%!7J ")0 2Q 6 #} 4 -1!">G>W!"
 !%-M  -858 9"6D9" 6" 2 69 9"6D9" 6" 2 ,>+TBT#+/<H/AA/E/UH/2e6H.H8BH 5,0IIk,B	+6zl!)D + 5 5'/jF89YKq$Q!" !)#--
.TU.VW  - 459&*>> %)II-S>R-S$S	#.zl!!<#--'jF89YKqI ,a/C=4H(, -6?6O- ) .1 1ES1 .* .1 1IW1 .* $6#L:L#'4H (:A'='MH'*e.@&@0:H-$(IIk$:	#.zl!!<#--'jF89YKqI $--
.TU.VW} .BB ++%%c$i0 ,y "F %)II$<$<$>!Q)12A2#DII$6$6$=$=$?@&3KmN7J1mK$
$!5M(NA} 	 
 +\9)M9
 ;-x		2D(E'FG$($7$7 RA 	
 	'([[]#**dC((NN4;;=1KK&&t, + (,'='=#00NN:. (>   *],7#o.."#67IC$)F)F%%cK&78 8  499,L^,L#LLtSN!NeAO(<$<=>!$tyy0P0P'P!PSW!W''=&>>Oa!	
 <(?:=PP 	 	./q>1J0K2NO[[]NN/0NNCDNNPQNN3
 ""[ QR~.0ENN%>ugV#LM 1 9:Y JKR WX13K!*=!9??9--&**477(,vV_
$7$=$=j$I	#**Iw+?@ 8
 !4 9 9$ ?I&&	7';< ": !%		)PK)P P L+##xj($<= ,""##$;<1+i		R^H_G``bc
 +}H5
 {{{ M 01 )*= >HAuNN%7wf#EF !?9:/04545>?@A^ r"~.VWBC89 )*= >HAuNN_QCs5'#GHNN,QC/?sBWX NN%7s:LQCq#QRNN^A3c%
#KLNN*1#->qcAUVWUXXfghfiikl NN%KA3a#PQNN;/NN%CE7+#VW !? RS<=;<UV34898989_ KL/P k LMST<=zJ67FG78BCGH=>;<l 78KL/<=>?BCP PQr w'W 34<=89v&89] FG<= )*= >HAuNN%83ugU#KLNN03CA3F[\ NN%;A3>PQRPSST#UVNN%7s#eWJ#OPNN.qc1B1#EYZ[Y\\jkljmmop NN%OPQsRS#TUNN?3NN<UG;O !? z*DEDEOPQ {+z*@A~
:;JK;<78{+f FGCDFGz*P <= )*= >HAuNN/s2FugMgh NN03Z[ NN%OPQsRS#TUNN.qc1I!L`ab`ccuv NN%OPQsRS#TUNN?3NNI!AN !?  z*VW@AN 78<=<=P k WXz*@A~
:;JK;<z*FGKLAB?@Q <=>?BCP PQv 78v
239:0A/B!DE/0" 33D2ETJ7
 t$&NNT$))4G*H)I#KLs#_ b #m5)	9+Qtyy);<=_M	
 [[]NNLMNNFGNN/0KL".J
 )l*@Lyz )l*HT^_ #/ NNHI#>>,//"e#9#>kl "e#A#jQ $ NN>?#>>(++"e#9#>kl ,,"e#A#Fgh "e#A#F]^ $  NNJK""$iiCPQ=4;tfI6=Q 	 $iiM RMD6$w!7M RST
 NN )) %2$1D 6dV7C$1 	 ,.L*
.8\+>Z( +#'*e4LS! $ ".| <$$_5$$$XDW$XXNNV$4#5QtyyAR7S6TTUVW"O /C(-DEcU"M $/Q'RS,,#f$KC5P[\`[aaij #f$KC5PST /w X }}i 3O

 L 
X

 I#hS "!H 3K
 ]P ]V
 R !SG ]X }}sa  ApAp(Ap4ApAp
	ApApAp"7Ap"?Ap(	Ap( Ap-=Ap-'Ap2
9Ap2
:Ap8Ap85DAqAqB.Aq3Ap>
;Ap>
BAqLAqAq 	Aq-Aq9AqAq9A;Aq$8Aq$0qAq6bB6Ard=A%Arf&B"AriAri,ArjArj+Arj?ArkD'Arp>Aqq
Aqq$
Aq3q6
ArrArr
Ar4c                2   [         R                  R                  nU R                  R	                  5       u  pE  nU Vs/ s H  owR
                  PM     nnU V	s/ s H  oR                  S5      (       d  M  U	PM     n
n	[        [        [        U5      5      n[        U S0 5      nU
 V	s/ s H/  n	UR                  U	S5      (       d  M  XR                  U	5         PM1     nn	U SSR                  X-   5       S3nUR                  U5        gs  snf s  sn	f s  sn	f )z7Generate the Python code that calls this Pallas kernel.ru  r  Fz.run(r!   r"   N)r   rj  wrapper_coder$   r  rO  r  r  r  r?   r  r  r  r  r.  )r%   rO  nodewrapperr  r  r  r   kernel_param_namesr(   r  call_arg_strs	aliasablealias_call_argskernel_calls                  r*   call_kernelPallasKernel.call_kernelf  s    ''&&$(II$<$<$>!Q.67hffh7&8T&8LL<S1&8TSi01D"6;	 %
$}}Q& 7M22156$ 	 
 eDIIo.M$N#OqQ+& 8T
s   D
D5D'DD)	r  rt  rm  rr  rs  rq  rp  rn  ro  )
r&   r=   rO   r=   ry  r  rz  r  r>   None)r  r=   r>   r?   )r>   r   )r  r=   r>   r   )r  r=   r>   r  )r  r=   r>   zlist[sympy.Symbol])r>   r  )rO  r?   r  r=   r>   r  )r>   r  )r  r=   r>   tuple[str, bool])r  r?   r>   r?   )r>   r  )r  r   r>   Optional[int])r+  r   r>   r  )r>   r  )rO  r?   r>   z tuple[Any, Any, Any, list, bool])r  r=   r>   ztuple[int, OrderedSet])r  r=   r  r   r>   r   )
r  r  r:  r  r;  r  rK  r   r>   r  )
rO  r?   r  r=   r  r?   r	  r  r>   r  )r  r?   rO  r?   r  r=   r  r?   r	  r  rw  r  r>   r?   )rO  r?   rz  r?   r>   r?   )rO  r?   r  r=   rz  r?   r>   r?   )r  r=   r  r?   r	  r  r>   r  )r  r?   r  r=   r  r   r>   r  )rO  r?   r>   r  )r  r?   r  r   r  r  r>   r?   ra   )r  r?   rO  r?   r  r=   r  r   r  r?   r	  r  rw  r  r  r   r>   r?   )r  r?   r  r   r  zdict[str, Any]rO  r?   r  r   r>   r?   )rO  r?   r  r=   r>   r   )
rO  r?   r  r=   r  r   r  r   r>   r  )r  r=   r   zsympy.Symbolr>   r  )r  )r  r=   r9  r?   r>   Optional[dict[str, Any]])r9  r?   r  r?   r	  r  r>   r  )r  r=   r  r?   r	  r  r>   r  )
r   rc  r   rc  rY  rI   r  +Union[CSEVariable, tuple[CSEVariable, ...]]r>   r  )rq  r?   r>   r  )rO  rb   r>   r?   )rO  r?   r  zOptional[IRNode]r>   r  )>r@   rA   rB   rC   rD   rg   	overridespallas_pexprr   __annotations__rZ   r{  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r"  r%  rd  r  r-  r1  r@  rG  rM  rW  rk  rn  r{  r  r  r  r  r  r  r  r  typing_extensionsoverrider  r  r1  r4  r,  r7  r8  rn  rr  r  r  rE   __classcell__)rv  s   @r*   rf  rf  e  s    &I)5E&5)(88&089=8FJ8	8!>F]~638:5K7.0\|"H,,Pd+J
  		#KJ''	'(,6	 ## # 	#
 !# 
#J@(@( @( 	@(
 @( 
@(D$($( $( 	$(
 $( 
$(L## # 	#
 # # # 
#J4@K@K *@K7:@K	@KD;(;(,/;(@D;(	;(z&9&9)3&9EO&9	&9P=~*=A	@ B/B/ B/ 	B/
 B/ B/ B/ B/ B/ 
B/HN
N
 N
 %	N

 N
 N
 
N
` *
  *
XFP LP5959 *593>59FI59	59  59n 57NN.1N	!N*

.1
CF
	!
B+
+
/2+
DG+
	!+
Z__ _ &	_
 ;_ 
5_B & &
GR' 'r-   rf  c                  F    \ rS rSr\r\SS j5       r        SS jrSr	g)PallasSchedulingi{  c                6    [        [        R                  /5      $ ra   )r   r   REDUCE_TO_SINGLE_ELEMENT)clsru  s     r*   get_backend_features%PallasScheduling.get_backend_features~  s     >BBCDDr-   c                   [         R                  R                  nXR                  ;   a  UR                  U   $ [        R
                  R                  (       a$  [        U[        R
                  R                  5      OSn[        R                  " UR                  S5      5      R                  5       S S nUS:X  a  SU 3nOSU SU 3nXtR                  U'   UR                  SU5      n[        5       nUR                  SU< S	35        UR                  US
S9  UR                  S5        [!        X$5      u  pU	 SU
 3nUR#                  XxR%                  5       U5        U$ )Nr  zutf-8   fusedpallas_r  rw  zasync_compile.pallas(z, r'''Try  z''')
)r   rj  r  src_to_kernelr   tritondescriptive_namesr   hashlibsha256encode	hexdigestr  r   r.  r  r   define_kernelr  )r%   src_codenode_scheduler   r  
fused_namekernel_hashr  compile_wrapperoriginsdetailed_originsmetadata_comments               r*   r  PallasScheduling.define_kernel  sO    ''&&,,,((22 }}.. "-1P1PQ 	
 nnX__W%=>HHJ2AN #K=1K#J<q>K*5h' ##O[A(*!!$9+"OPxt4!!&)$7$O!%Yb)9(:;k+C+C+EGWXr-   r<   N)ru  ztorch.devicer>   zOrderedSet[BackendFeature])r  r?   r  zSequence[BaseSchedulerNode]r   rf  r>   r?   )
r@   rA   rB   rC   rf  kernel_typeclassmethodr  r  rE   r<   r-   r*   r  r  {  sF    KE E
"" 3" 	"
 
"r-   r  )rO   r  r>   r  )@
__future__r   r  r   r  typingr   r   r   r   r  r  torch.utils._ordered_setr   torch.utils._pallasr	   torch.utils._sympy.functionsr
   r  r   irr   runtime.runtime_utilsr   r   r   r   virtualizedr   block_analysisr   commonr   r   r   r   r   simdr   r   r   r#   r  collections.abcrF   rG   rH   ops_handlerrI   	schedulerrJ   MAIN_SUFFIXrN   rP   _logginggetArtifactLoggerr@   rX   rS   r  rd   rg   rf  r  r<   r-   r*   <module>r6     s    "    6 6   / . 8   6 >  /  -M < && 2+- 
 L ..228]K> >2U, Uk
K k
\S(': S('lP+~ +r-   