
    ȅi%                     L   S SK r S SKJrJr  S SKrS SKJr  S SKJr  S SK	J
r
  SSKJrJrJr  SSKJr  SS	KJrJrJrJr  SS
KJrJrJrJr  SSKJrJrJrJ r J!r!J"r"  SSK#J$r$J%r%  SSK&J'r'J(r(J)r)J*r*  \(       a  SSKJ+r+  SSKJ,r,  \ RZ                  " \.5      r/\RH                  R`                  r0\S 5       r1\" S\1\
" S5      SS9r2\" \Rf                  S\0Rf                  Rh                  S9r5\" \Rf                  \Rl                  Ro                  5       (       a  SOSS\0Rf                  Rp                  S9r9\" \Rt                  S\0Rt                  Rh                  S9r;\Rx                  " \0Rf                  5      S!SS.S jj5       r=\Rx                  " \0Rt                  5      SSSS.S  j5       r>g)"    N)TYPE_CHECKINGUnion)counters)CKGemmTemplate)load_kernel_template   )configirlowering)MMKernelInputs)	loweringsmake_pointwisemake_reductiontransform_args)autotune_select_algorithmExternKernelChoiceSymbolicGridFnTritonTemplate)_use_cutlass_for_opuse_aten_gemm_kernelsuse_ck_gemm_templateuse_cpp_bmm_templateuse_cutlass_templateuse_triton_template)opsV   )_is_static_problemis_batch_stride_largest_or_zeromm_argsuse_native_matmul)ChoiceCaller)KernelTemplatec                6    U" XS   5      U" X#S   5      -  U S4$ )NBLOCK_MBLOCK_Nr    )bmnmetacdivs        T/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/kernel/bmm.pybmm_gridr.   ,   s&    O$tAI'??AFF    bmm
triton_bmmT)namegridsource"cache_codegen_enabled_for_templatezat::bmm_out)op_overloadzat::_bmm_out_dtype_xpuzat::_bmm_out_dtype_cuda	bmm_dtype)r2   r6   zat::baddbmm_outlayoutc          
      (	  ^ ^ [        S T U4 5       5      (       Ga  T R                  5       S   S:X  d  UR                  5       S   S:X  aX  [        R                  " T S5      m [        R                  " US5      n[        R                  " [        R
                  " T U5      SS9$ S nS mU4S jnU" T 5      (       a0  [        R                  R                  R                  S	   nU" T U5      m U" U5      (       a/  [        R                  R                  R                  S   nU" X5      n[        T U5      (       a  [        [        R                     " T S5      m [        [        R                     " US5      n[        T U/0 S
SSS9u  p[        R                  R                   (       aU  T R"                  [$        R&                  [$        R(                  4;   a'  U 4S jn
U Vs/ s H  n[+        U
5      " U5      PM     nn[+        [,        R.                  5      " U6 n[1        S5      " US5      nU$ [3        T XUS9u  pnnm nSn[5        T U/US9nT R                  5       S	   n[6        S   SU SU SU SU 3==   S-  ss'   [8        R;                  SUUUUT R=                  5       UR=                  5       U5        [>        n0 nU(       a/  T RA                  5       RB                  S;   d   S5       e[D        nSU0n/ n/ n0 n[G        5       (       a   URI                  U5        UUURJ                  '   [M        USS9(       a,  Ub  UT R=                  5       :X  a  URI                  [N        5        URQ                  [        RR                  RU                  UUUUS95        [W        U5      u  nn[Y        T X5      nU(       aP  U(       aI  [[        X>UU5      (       a7  []        U5      (       a'  SSK/J0n  URc                  UUURe                  5       5        [g        UT U5      (       a'  SSK4J5n  URm                  UUURe                  5       5        [o        X>UU5      (       a&  [p        Rr                  " UUURe                  5       5        [u        UUURe                  5       U5      $ s  snf )zX
Lowering for autotuning aten.bmm with different backends (Aten, Triton, CUTLASS, etc.)
c              3   Z   #    U  H!  oR                  5       R                  S :H  v   M#     g7f)cpuN)
get_devicetype).0xs     r-   	<genexpr>tuned_bmm.<locals>.<genexpr>L   s     
>A<<>%'s   )+r   r   )axisc                     [         R                  " U 5      (       d  g[         R                  " U SS9u  p[        U[         R                  5      $ )NTF)freeze)r
   is_storage_and_layoutas_storage_and_layout
isinstanceFlexibleLayout)t_r9   s      r-   is_valid_to_require_contiguous1tuned_bmm.<locals>.is_valid_to_require_contiguousS   s=    ++A..005AIAfb&7&788r/   c                     US   S:H  =(       a    U S   S:H  =(       d    US   U S   :  =(       d)    US   S:H  =(       a    U S   S:H  =(       d    US   U S   :  $ )NrC   r   r'   )sizesstridess     r-    is_preferred_layout_as_bmm_input3tuned_bmm.<locals>.is_preferred_layout_as_bmm_inputY   sf     q QeBi1n&PuRy8PU"+"Sb	Q(R'"+r:RUr/   c                    > UR                   S   R                  5       nUR                   S   R                  5       nT" X#5      (       d  [        R                  R                  U 5      n U $ )Nval)r+   sizestrider
   ExternKernelrequire_contiguous)rK   meta_trQ   rR   rS   s       r-   may_require_contiguous)tuned_bmm.<locals>.may_require_contiguousc   sU    KK&++-Ekk%(//1G3ECCOO66q9Hr/   r   TNF)argskwargs	broadcasttype_promotion_kindconvert_input_to_boolc                 D   > [         R                  " U TR                  SS9$ )NF)use_compute_types)r   to_dtypedtype)r@   mat1s    r-   	_to_dtypetuned_bmm.<locals>._to_dtype   s    ||AtzzUKKr/   dot)r9   	out_dtyper0   )rk   aten_mm_infoz	aten.bmm_rL   zZTuned aten.bmm: batch=%s, m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s)cudaxpuz+out_dtype is only supported for CUDA or XPUrk   check_max_autotune)kwarg_overrides)CUTLASS3xGemmTemplate)CppBmmTemplate);allget_sizeL	unsqueezesum_mulr   graphcurrent_noder^   r!   r   atenr   inductor_configtritoncodegen_upcast_to_fp32rf   torchfloat16bfloat16r   r   rj   r   r    r   r   loginfo	get_dtypeaten_bmmr=   r>   aten_bmm_dtyper   appenduidr   bmm_templateextendchoicesget_template_configsr   r   r   r   codegen.cuda.gemm_templaterr   add_cutlass_gemm_choicesnodesr   codegen.cpp_bmm_templaters   add_choicesr   r   add_ck_gemm_choicesr   )rg   mat2rk   r9   rM   r\   	meta_mat1	meta_mat2r^   r_   rh   r@   mul_pointwisedot_reductionr)   r*   kr2   kernel_inputs
batch_sizeaten_handleraten_extra_kwargsr   templates_to_userq   rL   
is_nonzerobatch_stride_largest_or_zerorr   rs   rS   s   `                             @r-   	tuned_bmmr   G   s,   
 
>$
>>>==?1"dmmoa&8A&=;;tR(D;;tQ'D66!%%d+!44	9	U	 *$//,,11!4I)$	:D)$//,,11!4I)$:Dt$$(r2(q1% $"'
 !!88TZZMMNNL
 >

L ;??$QN9-a0$D?&sww/6&u-mQ? #*dY#A!VT4 D #D$<9EM #J^yAaS!AaSABaGBHHd				 (0L %%8 	
9	
8 &()4"$G IKO-,=(()6e<Y$..*:: 	- NN			&&+	 	' 	
 'v.MAz#B4#V $ Aq11%%F66V]002	
 FD$//=""!	
 Fq!,,**7FM<O<O<QR$T7M4G4G4I6RRu @s   R)alphabetar9   c                   [        X5      (       a  US:X  a  SnO[        [        R                     " X@5      nUS:X  a  SnO9[        [        R                     " U[        [        R                     " X5      5      n[        [        R
                     " Xg5      $ [        XXS9u  pppn [        XU/[        X4S9S9nUR                  5       S   n[        S   SU SU SU	 SU
 3==   S-  ss'   [        R                  S	UUU	U
UR                  5       UR                  5       U R                  5       U5	        S
n/ n/ n[        5       (       a  UR                  [         5        [#        USS9(       a  UR                  [$        5        UR'                  [(        R*                  R-                  XU5      5        [/        XUR1                  5       U5      $ )zW
Lowering for autotuning aten.mm with different backends (Aten, Triton, CUTLASS, etc.)
r   r8   )r   r   )scalarsrl   zaten.baddbmm_rL   r   zkTuned aten.baddbmm: batch_size=%s, m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, inp=%s, output_layout=%sbaddbmmFro   )r!   r   r|   ry   r0   addr    r   dictru   r   r   r   r   r   r   aten_baddbmmr   r   r   r   r   r   r   r   )inprg   r   r   r   r9   arg1arg2r)   r*   r   r   r   r2   r   r   s                   r-   tuned_baddbmmr      s   
 $$19DTXX&t1DA:DTXX&ui.A$.MND"4.. (/t3'N$A!T #	D4e#?M
 #J^}ZL!AaS!EF!KFHHu			
 D"$G IK-6e<- NN			&&}M %TM4G4G4I6RRr/   )N)?loggingtypingr   r   r   torch._dynamo.utilsr   7torch._inductor.codegen.rocm.ck_universal_gemm_templater    torch._inductor.kernel.mm_commonr    r	   r}   r
   r   rv   r   r   r   r   r   r   select_algorithmr   r   r   r   utilsr   r   r   r   r   r   virtualizedr   r   	mm_commonr   r   r    r!   r"   r#   	getLogger__name__r   r|   r.   r   r0   outr   rn   is_available	dtype_outr   r   r   register_loweringr   r   r'   r/   r-   <module>r      sy    '  ( R A ; ; * P P   !  !1!yy~~ G G 		-'+	 eiiDHHLLQ#	II %		 6 6 8 8>W	""	 "	MM$$,,2B2B
 TXXVSD VS VSr T\\",-Ad 8S #8Sr/   