
    bi|_                     N   S SK JrJr  S SKJr  S SKJrJrJr  S SK	J
r
  S SKr " S S5      r\" SS	9 " S
 S\5      5       r\" SS	9 " S S\5      5       r\" SS	9 " S S\5      5       r\" SS	9 " S S\5      5       r\" SS	9 " S S\5      5       r\" SS	9 " S S\5      5       r\" SSS9 " S S\5      5       r " S S5      r\
S 5       r\" SS	9 " S S\5      5       r\" SSS9 " S S\5      5       r\" SSS9 " S  S!\5      5       r\" SS	9 " S" S#\5      5       rS&S$ jrS% rg)'    )	dataclassfield)List)_unwrap_if_constexpr_unwrap_shapeconstexpr_type)constexpr_functionNc                   8    \ rS rSrSr\S 5       r\S 5       rSrg)DistributedLayout	   z8
Base class for distributed memory layouts in Gluon IR.
c                     [        U 5      $ Nr   selfs    e/home/james-whalen/.local/lib/python3.13/site-packages/triton/experimental/gluon/language/_layouts.pytypeDistributedLayout.type       d##    c                     [        S5      e)Nz-DistributedLayout subclasses must define rank)NotImplementedErrorr   s    r   rankDistributedLayout.rank   s    !"QRRr    N)	__name__
__module____qualname____firstlineno____doc__propertyr   r   __static_attributes__r   r   r   r   r   	   s1     $ $ S Sr   r   T)frozenc                   0    \ rS rSrS rS r\S 5       rSrg)
AutoLayout   c                 "    UR                  5       $ r   )get_auto_layoutr   builders     r   _to_irAutoLayout._to_ir   s    &&((r   c                     g)NALr   r   s    r   mangleAutoLayout.mangle       r   c                     [        S5      e)NzAutoLayout has no rank
ValueErrorr   s    r   r   AutoLayout.rank    s    122r   r   N	r   r   r   r   r+   r/   r!   r   r"   r   r   r   r%   r%      s     ) 3 3r   r%   c                   0    \ rS rSrS rS r\S 5       rSrg)CoalescedLayout%   c                 "    UR                  5       $ r   )get_coalesced_layoutr)   s     r   r+   CoalescedLayout._to_ir(   s    ++--r   c                     g)NCLr   r   s    r   r/   CoalescedLayout.mangle+   r1   r   c                     [        S5      e)NzCoalescedLayout has no rankr3   r   s    r   r   CoalescedLayout.rank.   s    677r   r   Nr6   r   r   r   r8   r8   %   s     . 8 8r   r8   c                      ^  \ rS rSr% Sr\\   \S'   \\   \S'   \\   \S'   \\   \S'   \" \	S9r
\\\      \S'   U 4S	 jrS
 rS\4S jrS r\S 5       rSrU =r$ )BlockedLayout3   a  
Represents a blocked layout, partitioning a tensor across threads, warps, and CTAs.

Args:
    size_per_thread (List[int]): Number of elements per thread per dimension.
    threads_per_warp (List[int]): Number of threads per warp per dimension.
    warps_per_cta (List[int]): Number of warps per CTA per dimension.
    order (List[int]): The ordering of dimensions for partitioning.
    cga_layout (Optional[List[List[int]]]): Bases describing how CTAs tile each dimension.
size_per_threadthreads_per_warpwarps_per_ctaorderdefault_factory
cga_layoutc                 ,  > [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R
                  5      5        [         TU ]  S[        U R                  5      5        [        U R                  5      n[        R                  U SU R                  5        [        U R                  5      U:X  d   e[        U R
                  5      U:X  d   e[        U R                  5      U:X  d   eg )NrE   rF   rG   rH   rK   )
super__setattr__r   rE   rF   rG   rH   lenobjectrK   )r   r   	__class__s     r   __post_init__BlockedLayout.__post_init__E   s    -/CDDXDX/YZ.0DTEZEZ0[\O-A$BTBT-UVG%9$**%EF4''(4t?4(()T1114%%&$...4::$&&&r   c                     UR                  U R                  U R                  U R                  U R                  U R
                  5      $ r   )get_blocked_layoutrE   rF   rG   rH   rK   r)   s     r   r+   BlockedLayout._to_irQ   s>    ))  !!JJOO
 	
r   returnc                 &   S nU" U R                   5      nU" U R                  5      nU" U R                  5      nU" U R                  5      nU R                  (       a"  SR                  S U R                   5       5      OSnSU SU SU SU SU S3$ )Nc                 H    U c  gSR                  [        [        U 5      5      $ N _joinmapstrxs    r   	stringify'BlockedLayout.mangle.<locals>.stringify\       y88CQK((r   r\   c              3   `   #    U  H$  nS R                  [        [        U5      5      v   M&     g7f~Nr]   .0vecs     r   	<genexpr>'BlockedLayout.mangle.<locals>.<genexpr>e   "     Q#chhs3}55   ,.r[   B)rE   rF   rG   rH   rK   r^   )r   rc   rE   rF   rG   rH   rK   s          r   r/   BlockedLayout.mangleZ   s    	)
 $D$8$89$T%:%:;!$"4"45$**%UYUdUdSXXQQQjl
?#1%5$6aawaPZ|[\]]r   c           
          [        [        U R                  5      [        U R                  5      [        U R                  5      [        U R
                  5      [        S U R                   5       5      45      $ )Nc              3   8   #    U  H  n[        U5      v   M     g 7fr   tupleri   s     r   rl   )BlockedLayout.__hash__.<locals>.<genexpr>j   s     -TOSeCjjO   )hashru   rE   rF   rG   rH   rK   r   s    r   __hash__BlockedLayout.__hash__h   s]    U4//0%8M8M2NPUVZVhVhPi4::&-TDOO-T(TV W 	Wr   c                 ,    [        U R                  5      $ r   )rO   rH   r   s    r   r   BlockedLayout.rankl       4::r   r   r   r   r   r   r    r   int__annotations__r   listrK   rR   r+   r`   r/   ry   r!   r   r"   __classcell__rQ   s   @r   rC   rC   3   s{    	 #Y3i99"'"=JT#Y=
'
^ ^W  r   rC   c                   |   ^  \ rS rSr% Sr\\S'   \\S'   U 4S jrS r	S\
4S jrS	 r\S
 5       r\S 5       rSrU =r$ )SliceLayoutq   z
Represents a layout corresponding to slicing a distributed tensor along one dimension.

Args:
    dim (int): The dimension index to slice.
    parent (DistributedLayout): The parent layout before slicing.
dimparentc                    > [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R                  5      5        g )Nr   r   )rM   rN   r   r   r   r   rQ   s    r   rR   SliceLayout.__post_init__}   s5    E#7#ABH&:4;;&GHr   c                 l    UR                  U R                  U R                  R                  U5      5      $ r   )get_slice_layoutr   r   r+   r)   s     r   r+   SliceLayout._to_ir   s.    ''HHKKw'
 	
r   rW   c                 X    SU R                    SU R                  R                  5        S3$ )NSLr\   )r   r   r/   r   s    r   r/   SliceLayout.mangle   s)    DHH:Qt{{1134B77r   c                 D    [        U R                  U R                  45      $ r   )rx   r   r   r   s    r   ry   SliceLayout.__hash__   s    TXXt{{+,,r   c                 4    U R                   R                  S-
  $ )N   r   r   r   s    r   r   SliceLayout.rank   s    {{!##r   c                    U R                   R                  nU(       d  / $ U R                   R                  nSU R                  s=::  a  U:  d   e   eU Vs/ s H#  o3S U R                   X0R                  S-   S  -   PM%     sn$ s  snf Nr   r   )r   rK   r   r   )r   parent_cga_layoutr   basiss       r   rK   SliceLayout.cga_layout   s{     KK22 I{{DHH#t#####EVWEVEitxx 5A#77EVWWWs   *Br   r   r   r   r   r    r   r   r   rR   r+   r`   r/   ry   r!   r   rK   r"   r   r   s   @r   r   r   q   s[     
HI
8 8- $ $ X Xr   r   c                      ^  \ rS rSr% Sr\\\      \S'   \\\      \S'   \\\      \S'   \\\      \S'   \\   \S'   U 4S jrS	 r	S
 r
S r\S 5       rSrU =r$ )DistributedLinearLayout   a  
Represents a linear distributed layout with explicit bases at register, lane, warp, and block levels.
See: https://arxiv.org/abs/2505.23819 for reference.

Args:
    reg_bases (List[List[int]]): Bases for register-level distribution.
    lane_bases (List[List[int]]): Bases for lane-level distribution.
    warp_bases (List[List[int]]): Bases for warp-level distribution.
    block_bases (List[List[int]]): Bases for block-level distribution.
    shape (List[int]): The tensor global shape.
	reg_bases
lane_bases
warp_basesblock_basesshapec                   > [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R
                  5      5        [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R                  5      5        [        U R                  5      nU R                   H  n[        U5      U:X  a  M   e   U R                   H  n[        U5      U:X  a  M   e   U R
                   H  n[        U5      U:X  a  M   e   U R                   H  n[        U5      U:X  a  M   e   g )Nr   r   r   r   r   )	rM   rN   r   r   r   r   r   r   rO   r   r   r   rQ   s      r   rR   %DistributedLinearLayout.__post_init__   s   Kt~~)FGL-*HIL-*HIM=9I9I+JKG]4::%>?4::^^Eu:%%% $__Eu:%%% %__Eu:%%% %%%Eu:%%% &r   c                     UR                  U R                  U R                  U R                  U R                  U R
                  5      $ r   )get_distributed_linear_layoutr   r   r   r   r   r)   s     r   r+   DistributedLinearLayout._to_ir   s<    44T^^T__VZVeVegkgwgw59ZZA 	Ar   c                     SU R                    SU R                   SU R                   SU R                   SU R                   S3$ )NDLLr\   )r   r   r   r   r   r   s    r   r/   DistributedLinearLayout.mangle   sI    T^^$Adoo%6a7H$JZJZI[[\]a]g]g\hhkllr   c                 P   [        [        [        [        U R                  5      5      [        [        [        U R                  5      5      [        [        [        U R
                  5      5      [        [        [        U R                  5      5      [        U R                  5      45      $ r   )rx   ru   r_   r   r   r   r   r   r   s    r   ry    DistributedLinearLayout.__hash__   sn    #eT^^,-#eT__-.#eT__-.#eT--./$**
  	r   c                 ,    [        U R                  5      $ r   )rO   r   r   s    r   r   DistributedLinearLayout.rank   r}   r   r   )r   r   r   r   r    r   r   r   rR   r+   r/   ry   r!   r   r"   r   r   s   @r   r   r      sv    
 DIT#YT#Yd3i 9&$Am  r   r   c                      ^  \ rS rSr% Sr\\S'   \\S'   \\S'   U 4S jrS r	S\
4S	 jrS
 r\S 5       r\S 5       rSrU =r$ )DotOperandLayout   z
Represents a layout for a dot operand.

Args:
    operand_index (int): 0 for LHS and 1 for RHS of the dot operation.
    parent (DistributedLayout): The parent layout, representing the MMA.
    k_width (int): Number of elements per 32-bits.
operand_indexr   k_widthc                    > [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R
                  5      5        g )Nr   r   r   )rM   rN   r   r   r   r   r   s    r   rR   DotOperandLayout.__post_init__   sP    O-A$BTBT-UVH&:4;;&GHI';DLL'IJr   c                     UR                  U R                  U R                  R                  U5      U R                  5      $ r   )get_dot_operand_layoutr   r   r+   r   r)   s     r   r+   DotOperandLayout._to_ir   s4    --d.@.@$++BTBTU\B]_c_k_kllr   rW   c                 r    SU R                    SU R                  R                  5        SU R                   S3$ )NDOr\   )r   r   r/   r   r   s    r   r/   DotOperandLayout.mangle   s6    D&&'q););)=(>a~RPPr   c                 Z    [        U R                  U R                  U R                  45      $ r   )rx   r   r   r   r   s    r   ry   DotOperandLayout.__hash__   s"    T''dllCDDr   c                 .    U R                   R                  $ r   r   r   s    r   r   DotOperandLayout.rank   s    {{r   c                 ~  ^ [        [        U R                  S/ 5      5      =(       d    / nU(       d  / $ U R                  R                  m[	        U4S jU 5       5      (       d   eU R
                  S:X  a  TS-
  OTS-
  nSUs=::  a  T:  d   e   e/ nU H#  n[        U5      nSXR'   UR                  U5        M%     U$ )NrK   c              3   @   >#    U  H  n[        U5      T:H  v   M     g 7fr   rO   )rj   r   r   s     r   rl   .DotOperandLayout.cga_layout.<locals>.<genexpr>   s     E3D%3u:%3Ds   r   r      )r   getattrr   r   allr   r   append)r   r   k_dimderivedr   	new_basisr   s         @r   rK   DotOperandLayout.cga_layout   s    0lTV1WX^\^ I{{E3DEEEEE ..!3qE D     &EUI INN9% ' r   r   r   r   s   @r   r   r      sc     LK
mQ QE      r   r   )r#   eqc                      ^  \ rS rSr% Sr\\   \S'   \\   \S'   \\   \S'   \" \	S9r
\\\      \S'   U 4S jrS	 rS
\4S jrS r\S 5       rSrU =r$ )NVMMADistributedLayouti
  aC  
Represents a layout for NVIDIA MMA (tensor core) operations.

Args:
    version (List[int]): Version identifier for the MMA instruction.
    warps_per_cta (List[int]): Number of warps per CTA.
    instr_shape (List[int]): Instruction shape for MMA.
    cga_layout (Optional[List[List[int]]]): Bases describing CTA tiling.
versionrG   instr_shaperI   rK   c                   > [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R
                  5      5        [        R                  U SU R                  5        g )Nr   rG   r   rK   )rM   rN   r   r   rG   r   rP   rK   r   s    r   rR   $NVMMADistributedLayout.__post_init__  sh    I';DLL'IJO-A$BTBT-UVM+?@P@P+QR4t?r   c                 z    UR                  U R                  U R                  U R                  U R                  5      $ r   )get_mma_layoutr   rG   rK   r   r)   s     r   r+   NVMMADistributedLayout._to_ir!  s5    %%LLOO	
 	
r   rW   c           	          U R                   (       a"  SR                  S U R                    5       5      OSnSU R                   SU R                   SU R                   SU S3	$ )Nr\   c              3   `   #    U  H$  nS R                  [        [        U5      5      v   M&     g7frg   r]   ri   s     r   rl   0NVMMADistributedLayout.mangle.<locals>.<genexpr>*  rn   ro   r[   MMA__MMA)rK   r^   r   rG   r   r   rK   s     r   r/   NVMMADistributedLayout.mangle)  sY    UYUdUdSXXQQQjl
dll^1T%7%7$8$:J:J9K1ZLX\]]r   c           	          [        [        U R                  5      [        U R                  5      [        U R                  5      [        S U R
                   5       5      45      $ )Nc              3   8   #    U  H  n[        U5      v   M     g 7fr   rt   ri   s     r   rl   2NVMMADistributedLayout.__hash__.<locals>.<genexpr>/       A#5::rw   )rx   ru   r   rG   r   rK   r   s    r   ry   NVMMADistributedLayout.__hash__-  sN    U4<<(%0B0B*CU4K[K[E\AAAC D 	Dr   c                 ,    [        U R                  5      $ r   )rO   rG   r   s    r   r   NVMMADistributedLayout.rank1  s    4%%&&r   r   r~   r   s   @r   r   r   
  sr     #Y9c"'"=JT#Y=@
^ ^D ' 'r   r   c                   (    \ rS rSrSr\S 5       rSrg)SharedLayouti6  z3
Base class for shared memory layouts in Gluon IR.
c                     [        U 5      $ r   r   r   s    r   r   SharedLayout.type;  r   r   r   N)r   r   r   r   r    r!   r   r"   r   r   r   r   r   6  s     $ $r   r   c                    U(       d  U $ [        U 5      n[        US   5      nS/U-  nU H8  n[        U5      U:X  d   e[        U5       H  n[        XF   XV   5      XF'   M     M:     [        U5       H  nXF==   S-  ss'   M     [        U5       H+  nX'   XG   -  S:X  d   SU  SU 35       eX'==   XG   -  ss'   M-     U$ )Nr   r   r   zShape z  is not divisible by CGA layout )r   rO   rangemax)r   rK   shape_per_ctar   	cga_shaper   ir   s           r   _get_shape_per_ctar   @  s    KMz!}Dd
I5zT!!!tAy|UX6IL  
 4[ T{!IN2a7u6%Hhisht9uu7y~-  r   c                      ^  \ rS rSr% Sr\\S'   \\S'   Sr\\S'   Sr\	\S'   Sr
\	\S	'   \" \S
9r\\\      \S'   U 4S jrS r\\SS j5       5       rS\4S jrS rSrU =r$ )NVMMASharedLayoutiT  a  
Represents a layout for shared memory suitable for NVIDIA MMA operations.

Args:
    swizzle_byte_width (int): Width in bytes for swizzling.
    element_bitwidth (int): Bitwidth of element type.
    rank (int): Rank of the tensor.
    transposed (bool): Whether the layout is transposed.
    fp4_padded (bool): Whether FP4 padding is used.
    cga_layout (Optional[List[List[int]]]): Bases describing CTA tiling.
swizzle_byte_widthelement_bitwidthr   r   F
transposed
fp4_paddedrI   rK   c                 R  > [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R
                  5      5        [         TU ]  S[        U R                  5      5        U R                  =(       d    / nU(       a  [        US   5      U R                  :X  d   e[         TU ]  S[        U R                  5      5        [         TU ]  S[        U5      5        U R                  S;   d   eU R                  S	;   d   eg )
Nr   r   r   r   r   r   rK   )          @   )r   r   r      )
rM   rN   r   r   r   r   r   rK   rO   r   )r   rK   rQ   s     r   rR   NVMMASharedLayout.__post_init__h  s    02FtG^G^2_`.0DTEZEZ0[\L*>t*OPL*>t*OP __*
z!}%222F$8$CDL*>z*JK$$777&&*::::r   c                     UR                  U R                  U R                  U R                  U R                  U R
                  U R                  5      $ r   )get_nvmma_shared_layoutr   r   r   r   rK   r   r)   s     r   r+   NVMMASharedLayout._to_iry  sC    ..##!!OOOOOOII
 	
r   c           	         U(       a  SOSnUc  U O
[        X5      n[        U 5      nU(       a  USS USS -   nUS   U-  nXR                  -  S-  n	U	S:  a  U	S-  S:X  a  Sn
O&U	S:  a  U	S-  S:X  a  Sn
OU	S	:  a  U	S	-  S:X  a  S	n
OSn
SnUSS  H  nX-  nM	     [        U 5      S:  d  US:  a  Sn
[        U
UR                  UUUUS
9$ )zReturns an NVMMASharedLayout with default swizzling for a given shape.

This picks the largest swizzle pattern compatible with the shape, which
allows emitting the fewest TMA or MMA messages.
r   r   Nr   r   r   r   r   )r   r   r   r   r   rK   )r   rO   primitive_bitwidthr   )block_shapedtyper   r   rK   packing_factorr   r   contig_dim_sizecontig_dim_bytesr   flatten_outer_dimsizes                r   get_default_for!NVMMASharedLayout.get_default_for  s    )a'1'9?QR]?j;)!"-bq0AAM'+n<*-E-EEJs"'7#'='B!$#(82(=(B!##(82(=(B!#!"!#2&D% '{a#4q#8!" 1"55!!!
 	
r   rW   c                     U R                   (       a"  SR                  S U R                    5       5      OSnSU R                   SU R                   SU R                   SU R
                   SU S3$ )Nr\   c              3   `   #    U  H$  nS R                  [        [        U5      5      v   M&     g7frg   r]   ri   s     r   rl   +NVMMASharedLayout.mangle.<locals>.<genexpr>  rn   ro   r[   NVMMA__NVMMA)rK   r^   r   r   r   r   r   s     r   r/   NVMMASharedLayout.mangle  sr    UYUdUdSXXQQQjl
//0$2G2G1H$//IZZ[\`\k\k[llmnxmyy  A  	Ar   c                     [        U R                  U R                  U R                  U R                  U R
                  U R                  (       a!  [        S U R                   5       5      45      $ S 45      $ )Nc              3   8   #    U  H  n[        U5      v   M     g 7fr   rt   ri   s     r   rl   -NVMMASharedLayout.__hash__.<locals>.<genexpr>  r   rw   )rx   r   r   r   r   r   rK   ru   r   s    r   ry   NVMMASharedLayout.__hash__  sl    T,,d.C.CTYYPTP_P_aeapapEI__UAAA` a 	aZ^` a 	ar   r   )FFN)r   r   r   r   r    r   r   r   r   boolr   r   r   rK   r   rR   r+   staticmethodr	   r  r`   r/   ry   r"   r   r   s   @r   r   r   T  s    
 D#MJJ"'"=JT#Y=;"
 #
  #
JA Aa ar   r   c                      ^  \ rS rSr% Sr\\S'   \\S'   \\S'   \\   \S'   \" \	S9r
\\\      \S'   U 4S	 jrS
 rS\4S jrS rSrU =r$ )SwizzledSharedLayouti  aQ  
Represents a generic swizzled shared memory layout.

Args:
    vec (int): Vector width for swizzling.
    per_phase (int): Elements per swizzle phase.
    max_phase (int): Maximum number of swizzle phases.
    order (List[int]): Dimension ordering for swizzling.
    cga_layout (Optional[List[List[int]]]): Bases describing CTA tiling.
rk   	per_phase	max_phaserH   rI   rK   c                 `  > [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R
                  5      5        [         TU ]  S[        U R                  5      5        [        R                  U SU R                  5        g )Nrk   r  r  rH   rK   )	rM   rN   r   rk   r  r  rH   rP   rK   r   s    r   rR   "SwizzledSharedLayout.__post_init__  s}    E#7#ABK)=dnn)MNK)=dnn)MNG%9$**%EF4t?r   c                     UR                  U R                  U R                  U R                  U R                  U R
                  5      $ r   )get_swizzled_shared_layoutrk   r  r  rH   rK   r)   s     r   r+   SwizzledSharedLayout._to_ir  s8    11HHNNNNJJOO
 	
r   rW   c                     S nU R                   (       a"  SR                  S U R                    5       5      OSnSU R                   SU R                   SU R                   SU" U R
                  5       SU S3$ )Nc                 H    U c  gSR                  [        [        U 5      5      $ rZ   r]   ra   s    r   rc   .SwizzledSharedLayout.mangle.<locals>.stringify  re   r   r\   c              3   `   #    U  H$  nS R                  [        [        U5      5      v   M&     g7frg   r]   ri   s     r   rl   .SwizzledSharedLayout.mangle.<locals>.<genexpr>  rn   ro   r[   SSS__SSS)rK   r^   rk   r  r  rH   )r   rc   rK   s      r   r/   SwizzledSharedLayout.mangle  sq    	)
 VZUdUdSXXQQQjl
dhhZq 0$..1A9TZZCXBYYZ[eZffjkkr   c           
          [        U R                  U R                  U R                  [	        U R
                  5      [	        S U R                   5       5      45      $ )Nc              3   8   #    U  H  n[        U5      v   M     g 7fr   rt   ri   s     r   rl   0SwizzledSharedLayout.__hash__.<locals>.<genexpr>  s     Ovfu_bPUVYPZPZfurw   )rx   rk   r  r  ru   rH   rK   r   s    r   ry   SwizzledSharedLayout.__hash__  sF    XXt~~t~~uTZZ7H%OvfjfufuOvJvwy 	yr   r   )r   r   r   r   r    r   r   r   r   r   rK   rR   r+   r`   r/   ry   r"   r   r   s   @r   r  r    s_    	 
HNN9"'"=JT#Y=@
l ly yr   r  c                      ^  \ rS rSr% Sr\\\      \S'   \\\      \S'   \\\      \S'   \\   \S'   U 4S jrS r	S	\
4S
 jrS r\\S 5       5       rS rSrU =r$ )PaddedSharedLayouti  a6  
Represents a layout for the access to shared memory. Compared to SwizzledSharedLayout,
it combined padding and element reordering via linear transformation (e.g. row permutation)
to avoid shared memory bank conflicts. After every interval tensor elements, the
corresponding number of padding elements are inserted. If a position corresponds to
multiple intervals, the padding amounts are summed.

In the following example of a tensor,
`eM` represents original elements in the and `pN` represents padded element.

Before padding, the shared memory looks like:
[e0, e1,
 e2, e3,
 e4, e5,
 e6, e7,
 ...]

After padding with interval-padding list [[2, 1], [4, 2]] with an identity remapping,
the shared memory will be
[e0, e1, p0,
 e2, e3, p1, p2, p3,
 e4, e5, p4,
 e6, e7, p5, p6, p7,
 ...]

Furthermore this encoding allows for a linear remapping from the 1-D shared
memory offset to logical n-D tensor elements. The remapping is given in the form
of linear bases mapping from offset to [dim0, dim1...dimN-1].
See LinearLayout.h for more details how linear layouts are applied to remap
elements.
Some concrete examples using `xN` and `yN` to mean the logical n-D tensor elements
and `pN` to mean padding:

After padding for shape = [8] with interval-padding list [[2, 2]], offset_bases = [[2], [1]] and block_bases = []:
[x0, x2, p0 p1, x1, x3]

After padding for shape = [8, 4] with interval_padding_pairs = [[8, 1]], offset_bases = [[0, 1], [0, 2], /*gap, stride by 2 rows*/[2, 0], [4, 0], [1, 0]]] and block_bases = []:
[
    x0y0, x0y1, x0y2, x0y3,
    x2y0, x2y1, x2y2, x2y3,
    p0,
    x4y0, x4y1, x4y2, x4y3,
    x6y0, x6y1, x6y2, x6y3,
    p1,
    x1y0, x1y1, x1y2, x1y3,
    x3y0, x3y1, x3y2, x3y3,
    p2,
    x5y0, x5y1, x5y2, x5y3,
    x7y0, x7y1, x7y2, x7y3,
]

Args:
    interval_padding_pairs (List[int]): List of [interval, padding] pair and both interval and padding must be powers of 2.
    offset_bases (List[int]): Bases for shared memory offsets
    block_bases (List[List[int]]): Bases for block-level shared memory offsets.
    shape (List[int]): n-D logical shared memory shape
interval_padding_pairsoffset_basesr   r   c                   > [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R
                  5      5        [         TU ]  S[        U R                  5      5        [        U R                  5      nU R                   H  n[        U5      U:X  a  M   e   U R
                   H  n[        U5      U:X  a  M   e   U R                  5         g )Nr2  r3  r   r   )	rM   rN   r   r2  r3  r   r   rO   verifyr   s      r   rR    PaddedSharedLayout.__post_init__%  s    4mDD_D_6`aNM$:K:K,LMM=9I9I+JKG]4::%>?4::&&Eu:%%% '%%Eu:%%% & 	r   c                     [        U R                  6 u  p#UR                  X#U R                  U R                  U R
                  5      $ r   )zipr2  get_padded_shared_layoutr3  r   r   )r   r*   	intervalspaddingss       r   r+   PaddedSharedLayout._to_ir4  s@    !4#>#>?	//	TEVEVX\XhXhjnjtjtuur   rW   c           	      p    SU R                    SU R                   SU R                   SU R                   S3	$ )NPaddedShared_r\   _PaddedShared)r2  r3  r   r   r   s    r   r/   PaddedSharedLayout.mangle8  sC    t::;1T=N=N<OqQUQaQaPbbcdhdndncoo|}}r   c                   ^ U R                   n[        U5      S:  d   S5       e[        S U 5       5      (       d   e[        U6 u  p#[	        [        U5      5      n[        U5      [        U5      :X  d   eS m[        U4S jU 5       5      (       d   S5       e[        U4S jU 5       5      (       d   S5       e[        U R                  5      nUS:  d   S	5       eg )
Nr   zVPaddedSharedLayout interval_padding_pairs must have at least one interval-padding pairc              3   >   #    U  H  n[        U5      S :H  v   M     g7f)r   Nr   )rj   pairs     r   rl   ,PaddedSharedLayout.verify.<locals>.<genexpr>>  s     4ed3t9>es   c                 .    U S:  =(       a
    X S-
  -  S:H  $ r   r   ns    r   <lambda>+PaddedSharedLayout.verify.<locals>.<lambda>D      !a%"<AQK1,<"<r   c              3   4   >#    U  H  nT" U5      v   M     g 7fr   r   rj   rG  is_power_of_2s     r   rl   rD  E  s     7Y=##Y   z;PaddedSharedLayout interval values must all be power of twoc              3   4   >#    U  H  nT" U5      v   M     g 7fr   r   rL  s     r   rl   rD  F  s     6X=##XrN  z:PaddedSharedLayout padding values must all be power of twoz*PaddedSharedLayout order must not be empty)r2  rO   r   r8  r   setr   )r   pairsr:  r;  unique_intervalsr   rM  s         @r   r5  PaddedSharedLayout.verify;  s    ++5zA~www~4e44444!5k	I/#$I666<7Y777v9vv76X666t8tt64::axEEExr   c           	        ^ [        U5      [        U5      :X  d   eS m[        U4S jU 5       5      (       d   e[        U5      n/ nU Hi  n[        [        [        R
                  " X   5      5      5       H8  nUR                  [        U5       Vs/ s H  owU:X  a  SU-  OSPM     sn5        M:     Mk     [        X/ U5      $ s  snf )zReturns a PaddedSharedLayout with the given interval and padding pairs and an identity mapping as the linear component for the given shape and order.
        c                 .    U S:  =(       a
    X S-
  -  S:H  $ r   r   rF  s    r   rH  6PaddedSharedLayout.with_identity_for.<locals>.<lambda>Q  rJ  r   c              3   4   >#    U  H  nT" U5      v   M     g 7fr   r   rL  s     r   rl   7PaddedSharedLayout.with_identity_for.<locals>.<genexpr>R  s     3U=##UrN  r   r   )rO   r   r   r   mathlog2r   r1  )	r2  r   rH   r   r3  r   r   r   rM  s	           @r   with_identity_for$PaddedSharedLayout.with_identity_forK  s    
 5zSZ'''<3U333335zCs499UZ#89:##ERVK$XKq8Q%Z%BK$XY ;  ""8ERR %Ys   Cc           
         [        [        [        [        U R                  5      5      [        [        [        U R                  5      5      [        [        [        U R
                  5      5      [        U R                  5      45      $ r   )rx   ru   r_   r2  r3  r   r   r   s    r   ry   PaddedSharedLayout.__hash__]  s`    U3ud&A&ABCU3uVZVgVgKhEi3ud&6&678%

:KM N 	Nr   r   )r   r   r   r   r    r   r   r   rR   r+   r`   r/   r5  r  r	   r[  ry   r"   r   r   s   @r   r1  r1    s    8r !cO+tCy/!d3i 9v~ ~F  S  S N Nr   r1  c                      ^  \ rS rSr% Sr\\\      \S'   \" \	S9r
\\\      \S'   Sr\\S'   U 4S jrS	 rS
\4S jrS rSrU =r$ )SharedLinearLayoutib  zGRepresents a shared memory layout defined via an explicit LinearLayout.r3  rI   r   r   	alignmentc                 `  > [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R                  5      5        [         TU ]  S[        U R                  5      5        [        U R                  5      S:w  d   S5       e[        U R                  S   5      nUS:  d   S5       eU R                   H  n[        U5      U:X  a  M   e   U R                   H  n[        U5      U:X  a  M   e   U R                  S:  a   U R                  U R                  S-
  -  S:X  d   S5       eg )Nr3  r   ra  r   z1SharedLinearLayout offset_bases must not be emptyr   z<SharedLinearLayout alignment must be a positive power of two)rM   rN   r   r3  r   r   ra  rO   r   s      r   rR    SharedLinearLayout.__post_init__j  s   NM$:K:K,LMM=9I9I+JKK)=dnn)MN4$$%*_,__*4$$Q'(axLLLx&&Eu:%%% '%%Eu:%%% &~~!t~~!9K'LQR&R 	KJ	KR&Rr   c                 d    UR                  U R                  U R                  U R                  5      $ r   )get_shared_linear_layoutr3  r   ra  r)   s     r   r+   SharedLinearLayout._to_iry  s*    //0A0A4CSCSUYUcUcddr   rW   c                 V    SU R                    SU R                   SU R                   S3$ )NSharedLinear_r\   _SharedLinear)r3  r   ra  r   s    r   r/   SharedLinearLayout.mangle|  s1    t00143C3C2DAdnnEUUbccr   c           	          [        [        [        [        U R                  5      5      [        [        [        U R                  5      5      U R
                  45      $ r   )rx   ru   r_   r3  r   ra  r   s    r   ry   SharedLinearLayout.__hash__  sE    #eT../0#eT--./NN
  	r   r   )r   r   r   r   r    r   r   r   r   r   r   ra  rR   r+   r`   r/   ry   r"   r   r   s   @r   r`  r`  b  sY    QtCy/!#(#>Kd3i>IsKed d r   r`  c                     S/U-  nU (       d  U$ S nU  HL  n[        S [        U5       5       S 5      nUb  UnX6==   S-  ss'   M2  U(       a  M;  Uc   eX4==   S-  ss'   MN     U$ )Nr   c              3   :   #    U  H  u  pUS :w  d  M  Uv   M     g7f)r   Nr   )rj   r   vs      r   rl    bases_per_dim.<locals>.<genexpr>  s     ="2$!a1fAA"2s   	r   )next	enumerate)basesr   skip_broadcastresultnon_zero_idxr   idxs          r   bases_per_dimrx    sw    S4ZFL=)E"2=tD?LK1K+++ A%   Mr   c                     [        U [        5      (       a  [        U R                  [	        U5      5      $ [        U [
        [        45      (       a  [        U R                  U5      $ U R                  $ r   )	
isinstancer   rx  r   rO   r   r   rG   r   )layoutr   s     r   rG   rG     sW    &122V..E
;;	F[*:;	<	<V]]E22###r   )T)dataclassesr   r   typingr   triton.language.corer   r   r   triton.runtime.jitr	   rY  r   r%   r8   rC   r   r   r   r   r   r   r   r  r1  r`  rx  rG   r   r   r   <module>r     s   (  T T 1 S S $
3" 
3 
3 $
8' 
8 
8 $:% : :z $'X# 'X 'XT $6/ 6 6r $0( 0 0f $4 ('. (' !('V$ $  & $[a [a [a| $4 .y< .y !.yb $4 yN yN !yNx $! ! !J,$r   