
    ȅi                     0
   S SK r S SKrS SKJrJrJr  S SKrS SKJr  S SK	J
r
  S SKJrJrJrJr  S SKJr  S SKJr  S SKJrJr  S S	KJr  S S
KJr  S SKJr  SSKJrJ r   SSK!J"r"J#r#  SSK$J%r%  SSK&J'r'  SSK(J)r)J*r*  SSK+J,r,J-r-J.r.J/r/  SSK0J1r1  SSK2J3r3J4r4J5r5J6r6J7r7  SSK8J9r9J:r:J;r;J<r<J=r=  SSK>J?r?J@r@JArAJBrBJCrCJDrDJErEJFrFJGrGJHrHJIrIJJrJJKrK  SSKLJMrMJNrNJOrOJPrPJQrQJRrR   S SKSrS\" \SR                  5      rUSrV\R                  " \Y5      rZ\R*                  R                  r[\R*                  R                  r\\=" S\P\R                  R                  b  \US:  a  \N" S5      O\N" S5      SSS 9r_\=" S!\Q\N" S"5      S#9r`\=" S$\Q\N" S%5      S#9ra\=" S&\Q\N" S'5      S#9rb\=" S(\Q\N" S)5      S#9rc\ R                  S* 5       re\:" \R                  S+\[R                  R                  S,9rh\:" \R                  \R                  R                  5       (       a  S-OS.S/\[R                  R                  S09rl\:" \R                  S1\[R                  R                  S,9rn\:" \R                  S2\[R                  R                  S,9rp\:" \R                  S3S\[R                  R                  S49rs\:" \R                  S5\[R                  R                  S,9ruS6 rvSSSS7.S8 jrwShS: jrx\:" \wS5      ryS; rz " S< S=\*5      r{\{" 5       r| " S> S?\*5      r}S@ r~SA r\}" SBSC\~5      r\}" SDSE\5      r\6" \[R                  SSF9SiSSG.SH jj5       r\6" \[R                  SSF9SSG.SI j5       r\6" \[R                  SSF9SSSSJ.SK j5       r\6" \[R                  SSF9SSSL.SM j5       r\GR                  \GR                  4\GR                  \GR                  4\GR                  \GR                  4\GR                  \GR                  4\GR                  \GR                  4/r\GR                  \GR                  /r\GR                  \GR                  /rSN\S9\4SO jrSN\SP\S9\4SQ jrSN\SR\SS\SP\S9\4
ST jrSN\SR\S9\4SU jr SjSV\SW\GR&                  SX\SP\S9\4
SY jjrS9\4SZ jrS[\S\\S]\GR&                  S^\GR&                  S9\\\4   4
S_ jr\6" \[R                  R                  SSF9     SkS` j5       r\ R                  Sa\\   S9\4Sb j5       rSc r  SlSd\\   4Se jjrSf rSg rg! \W a    \" S5      rUSrV GN&f = f)m    N)AnyOptionalUnion)counters)AutoHeuristicSelectAlgorithm)	AHContextcontext_add_stridescontext_add_using_tf32mm_operations)CppGemmTemplate)gen_best_config)opsV)make_fx)ScalingType)TorchVersion   )configdistributed_autotune)CUTLASS2xGemmTemplateCUTLASS3xGemmTemplate)CKTileGemmTemplate)CKGemmTemplate)SubgraphChoiceCallerSubgraphTemplate)BufferChoiceCaller	is_tritonLayout)MMKernelInputs)	loweringsmake_pointwisemake_reductionregister_loweringtransform_args)autotune_select_algorithmExternKernelChoiceKernelTemplaterealize_inputsTritonTemplate)_use_cutlass_for_opceildivuse_aten_gemm_kernelsuse_ck_gemm_templateuse_ck_tile_gemm_templateuse_cpp_gemm_templateuse_cutlass_templateuse_decompose_k_choiceuse_nv_universal_gemm_template!use_triton_blackwell_tma_templateuse_triton_scaling_templateuse_triton_templateuse_triton_tma_template   )_is_static_problemload_kernel_templatemm_argsmm_gridpersistent_mm_griduse_native_matmulTz0.0.0Fmmz3.3.0	triton_mmtriton_mm_rocm)namegridsource"cache_codegen_enabled_for_templateprologue_loads_all_inputsmm_persistent_tmatriton_persistent_tma_mm)rB   rC   rD   %scaled_mm_device_tma_epilogue_scalingtriton_epilogue_scaled_mm&scaled_mm_device_tma_main_loop_scalingtriton_main_loop_scaled_mm"blackwell_ws_persistent_device_tma,triton_blackwell_ws_persistent_device_tma_mmc                     [        U 5      $ N)r'   )fns    S/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/kernel/mm.pylazy_register_extern_choicerS   }   s    b!!    z
at::mm_out)op_overloadzat::_mm_dtype_out_xpuzat::_mm_dtype_out_cudamm_dtype)rB   rU   zat::addmm_outzat::_int_mm_outzat::_sparse_semi_structured_mm)has_out_variantrU   zat::_scaled_mm_outc                 d    U R                  5       [        R                  [        R                  4;   $ rP   )	get_dtypetorchint8uint8)mats    rR   _is_int8_matr^      s     ==?uzz5;;777rT   outalphabetac          	          U R                  S5      S:X  a  U R                  S5      S:w  d  U R                  S5      S:X  a  [        R                  " U S   XX4US9$ [        R                  " XX#XES9$ )z
Giving torch.addmm a 1D tensor calls a different (faster) cublasLt
kernel under the hood.  There are a few shapes where this is slower,
but they are rare.
r   r8   r_   )stridesizerZ   addmm)inpmat1mat2r`   ra   rb   s         rR   
bias_addmmrj      s^     	

1sxx{a/CHHQK14D{{3q643$OO;;s$uHHrT   returnc                 z  ^ ^ S[         4S jnS[         4S jnS[         4S jn[        R                  " U" T R                  5       5      =(       d    U" T R	                  5       5      U 4S j5        [        R                  " U" TR                  5       5      =(       d    U" TR	                  5       5      U4S j5        g )Nrk   c                 \    [         R                  R                  R                  U S   S5      $ )Nr8   r   graphsizevarsstatically_known_equalsrd   s    rR   is_row_major.check_supported_striding.<locals>.is_row_major   #    ww77q	1EErT   c                 \    [         R                  R                  R                  U S   S5      $ Nr   r8   rn   rr   s    rR   is_col_major.check_supported_striding.<locals>.is_col_major   ru   rT   c                     [        [        R                  R                  R	                  U S   S5      =(       d-    [        R                  R                  R	                  U S   S5      5      $ rw   )boolr   ro   rp   rq   )re   s    rR   has_zero_dim.check_supported_striding.<locals>.has_zero_dim   sQ    GG44T!Wa@ Dww77QC
 	
rT   c                  *   > ST R                  5        3$ )Nz$mat_a must be row_major, got stride 
get_stride)mat_as   rR   <lambda>*check_supported_striding.<locals>.<lambda>       6u7G7G7I6JKrT   c                  *   > ST R                  5        3$ )Nz$mat_b must be col_major, got stride r   )mat_bs   rR   r   r      r   rT   )r{   rZ   _checkr   get_size)r   r   rs   rx   r|   s   ``   rR   check_supported_stridingr      s    F FF F
d 
 
LLU%%'(JL9I,JK 
LLU%%'(JL9I,JKrT   c                    U R                   S   nUR                   S   nU R                   S   nXR-  nUn[        R                  " U R                  X7U5      S5      nUR                  XvU5      n	[        R                  " X[        R
                  S9n
[        R                  " U
S5      nUR                  U R                  5      $ )Nr   r8   )r8   r   r   	out_dtype)	shaperZ   permutereshapebmmfloat32sumtodtype)abk_splitsmnkk_partsB
a_reshaped
b_reshapedresultreduced_bufs               rR   
decomposeKr      s    	
A	
A	
AmGAqyyw7CJ1q)JYYzGF))FA&K>>!''""rT   c                   N   ^  \ rS rSrU 4S jrS\\   S\S\S\	4U 4S jjr
SrU =r$ )	DecomposeKSugraphTemplate   c                     > [         TU ]  SS9  g )Ndecompose_krB   )super__init__)self	__class__s    rR   r   "DecomposeKSugraphTemplate.__init__   s     	 	
rT   input_nodeslayoutk_splitrk   c           	         > SSK Jn  SSKJn  SU S3nSU< 3nU" 5          U" 5       n[	        [
        R                  " [        US9U5      n	[        T
U ]%  UUUU	US	9sS S S 5        $ ! , (       d  f       g = f)
Nr   enable_python_dispatcherr   select_decomp_tabledecompose_k_mm__splitzk_split=)r   rB   r   r   make_fx_graphdescription)
torch._dispatch.pythonr   decompositionr   r   	functoolspartialr   r   generate)r   r   r   r   r   r   rB   r   decompositionsrQ   r   s             rR   r   "DecomposeKSugraphTemplate.generate   s     	D7 	0!
m%'02N!!*w?B
 7#' ' $  (''s   ;A&&
A4 )__name__
__module____qualname____firstlineno__r   listr   r   intr   r   __static_attributes____classcell__r   s   @rR   r   r      s<    

&\  	
 
 rT   r   c                   Z   ^  \ rS rSrS\S\S\4U 4S jjrS\\   S\	S\
4U 4S	 jjrS
rU =r$ )ContiguousTemplate   rB   r   rQ   c                 D   > Xl         X l        X0l        [        TU ]  US9  g )Nr   )rB   r   rQ   r   r   )r   rB   r   rQ   r   s       rR   r   ContiguousTemplate.__init__   s(    	& 	 	
rT   r   r   rk   c           	         > SSK Jn  SSKJn  U" 5          U" 5       n[	        U R
                  U5      n[        TU ]  U R                  UUUU R                  S9sS S S 5        $ ! , (       d  f       g = f)Nr   r   r   r   r   )
r   r   r   r   r   rQ   r   r   rB   r   )r   r   r   r   r   r   rQ   r   s          rR   r   ContiguousTemplate.generate  sg    
 	D7%'02NB
 7#YY'  ,, $  (''s   AA!!
A/)r   rQ   rB   )r   r   r   r   strr   r   r   r   r   r   r   r   r   r   s   @rR   r   r      sG    
S 
s 
 
&\  
	 rT   r   c                 J    [         R                  " XR                  5       5      $ rP   )rZ   r?   
contiguous)r   r   s     rR   contiguous_mmr      s    88A||~&&rT   c                 L    [         R                  " XUR                  5       5      $ rP   )rZ   rf   r   )rg   r   r   s      rR   contiguous_addmmr   $  s    ;;sq||~..rT   r   zcontiguous mmr   zcontiguous addmm)type_promotion_kindr   c                  ^  Ub  T R                  5       n[        R                  " UR                  5       U:H  S 5        [        R                  " T R                  5       R                  S;   S 5        [        R                  " X$:H  =(       d=    U[        R
                  :H  =(       a#    U[        R                  [        R                  4;   S 5        [        T U5      (       a  [        [        R                     " T S5      m [        [        R                     " US5      n[        T U/0 SSS	S
9u  pV[        R                  R                  (       aU  T R                   [        R                  [        R                  4;   a'  U 4S jnU Vs/ s H  n[#        U5      " U5      PM     nn[#        [$        R&                  5      " U6 n	[)        S5      " U	S5      n
U
$ [+        T XUS9u  ppm n[-        U5      u  pSn[/        T U/US9n[0        S   SU SU SU 3==   S-  ss'   [2        R5                  SUUUT R                  5       UR                  5       U5        / n[-        U5      u  p[6        n0 nUb
  [8        nSU0n/ n0 n[;        5       (       a'  UR=                  U5        U(       a  UUUR>                  '   UGc!  U(       Ga  [A        USS9(       Ga	  [C        XU5      (       a  UR=                  [D        5        [        RF                  S:H  nU(       d  [C        XUSS9(       d  UR=                  [H        5        [K        T XS9(       a  UR=                  [L        5        [O        T XS9(       a  UR=                  [P        5        [        RR                  " 5       (       a-  [        R                  RT                  (       a  SSK+J,n  U" U5      nUR=                  [Z        5        UR]                  [^        R`                  Rc                  UUSUS95        UcN  U(       aG  [e        X;X5      (       a6  [g        S5      (       a&  [h        Rj                  " UUURm                  5       5        Uc>  U(       a7  [o        X;X5      (       a&  [p        Rr                  " UUURm                  5       5        Uc>  U(       a7  [u        X;X5      (       a&  [v        Rx                  " UUURm                  5       5        Uc*  U(       a#  [{        X;XT U5      (       a  SSK>J?n  U" UUU5        Uc8  [        UT U5      (       a&  [        Rx                  " UUURm                  5       5        T U/nUGc5  U(       Ga-  [A        U5      (       Ga  [        R                  R                  R                  U5      (       a  [        T 5      (       a  / n[;        5       (       a  UR=                  S5        [        U5      nUR]                  [^        R`                  Rc                  U[H        /S5      5        [        T UUUUUUU[        5       SS US!9n[        R                  R                  R                  U5      (       d2  Ub*  [        U5      S:  a  U Vs/ s H  nUU;   d  M  UPM     nnOUSU nUcO  [        R                   H;  nUR=                  [        U5      R                  URm                  5       U5      5        M=     SnUc5  [        R                  R                  R                  (       a  [        T U5      n[        R                  " UUURm                  5       U5      =n (       a  U $ [        UUURm                  5       UUS"9$ s  snf s  snf )#zW
Lowering for autotuning aten.mm with different backends (Aten, Triton, CUTLASS, etc.)
Nc                      g)Nzinput dtypes must be the samer   r   rT   rR   r   tuned_mm.<locals>.<lambda>9  s    3rT   )cudaxpuc                      g)Nz+out_dtype is only supported for CUDA or XPUr   r   rT   rR   r   r   =  s    ArT   c                      g)NzFout_dtype must be the same as input dtype or fp32 for fp16/bf16 inputsr   r   rT   rR   r   r   E  s    \rT   r   TF)argskwargs	broadcastr   convert_input_to_boolc                 D   > [         R                  " U TR                  SS9$ )NF)use_compute_types)r   to_dtyper   )xrh   s    rR   	_to_dtypetuned_mm.<locals>._to_dtypef  s    ||AtzzUKKrT   dotr8   r   r   r?   r   aten_mm_infozaten.mm__zOTuned aten.mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr   check_max_autotune
exhaustiver   )threshold_multipleoutput_layout)
append_tlxkwarg_overrides)add_nv_universal_gemm_choices	extern_mmzmm-ah
   )top_kalways_included)best_config_future)RrY   rZ   r   
get_devicetyper   float16bfloat16r>   r!   aten	unsqueezer%   inductor_configtritoncodegen_upcast_to_fp32r   r"   r   r   r#   r;   r9   r    r   loginfoaten_mmaten_mm_dtyper-   appenduidr6   r2   decompose_k_subgraph_templatemax_autotune_gemm_search_spacemm_templater7   persistent_tma_mm_templater4   .blackwell_ws_persistent_device_tma_mm_template	is_fbcodeenable_tlx_templates-torch._inductor.fb.tlx_templates.mm_templatesr   mm_contiguous_subgraph_templateextendr   choicesget_template_configsr1   r+   r   add_cutlass_gemm_choicesnodesr.   r   add_ck_gemm_choicesr/   r   add_choicesr3   codegen.nv_universal_gemmr   r0   r   	_inductorr   run_autoheuristicr   lenmm_autoheuristicr   collect_autoheuristicexternal_matmulrS   bindremote_gemm_autotune_cacher   r   maybe_autotune_remoter&   )!rh   ri   r   r   input_dtyper   r   r   r   mul_pointwisedot_reductionr   r   r   static_shape
is_nonzerorB   kernel_inputsr  aten_handleraten_extra_kwargstemplates_to_user   is_exhaustiver   r   r   r    num_choices_before_extra_configs
ah_choiceschoicer   boxs!   `                                rR   tuned_mmr6  0  sx   
 nn&NN+3	
 	OO""o5A	
 	$ U]]* CEMM5>>#BB\	
. t$$(r2(q1% $"'
 !!88TZZMMNNL
 >

L ;??$QN9-a0$D?&sww/6&u-mQ? #*dY#A!T4  2&9LD #D$<9EM ^xs!A3as3494HHY			 #%G1&9L'.L(*$()4HJ13O-0AOL,,- 	4@@!!**##$AB (FF,V 6qQST U##K0&tTH ''(BC0tR ''(VW  ))++#**??T#-.>#?  ?@NN			&&+	 	' 	
 	 A11%%66V]002	
 Z,@A,Q,Q**7FM<O<O<QRZ,EfQR,V,V&&w8K8K8MN 	*6aD$GGM%gv}E264FF##!	
 ,K''OO""44T::dOO """";/+.w<(II** 		
 &O+

 %%;;DAA%#j/A*=
 18Pf6Z;O6P!"C#CD 00ANN+A.33M4G4G4I6R 1
 U__33NN -T48"88g}**,f s  
$- o @D Qs   ]
],]c          	      $   [        XU[        R                  S9u  p4pRpSn[        S   SU SU SU 3==   S-  ss'   [        R                  SUUUU R                  5       UR                  5       U5        [        U5      u  pxU=(       a    U=(       a    [        X#XE5      n	/ n
[        X/[        R                  S9n/ n[        5       (       a  UR                  [        5        U(       a%  [        US	S
S9(       a  UR                  [        5        U
R                  [         R"                  R%                  XU5      5        U	(       a5  ['        U5      (       a%  [(        R*                  " XUR-                  5       S	S	S9  [/        XjUR-                  5       U5      $ )Nr   int_mmr   zaten._int_mm_r   r8   zTTuned aten._int_mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr   TF)enable_int32r   fuseablenon_fuseable)r;   rZ   int32r   r  r	  rY   r9   r1   r    r-   r  aten__int_mmr6   r  r  r   r  r  r+   r   r  r  r&   )rh   ri   r   r   r   r   rB   r+  r,  use_cutlassr  r-  r0  s                rR   tuned_int_mmr@  )  sn    #*6U[[#A!T D^}QCq1QC89Q>9HH^			  2&9LW:W2FvRS2WK"$G #D<5;;GM IK-)Te 	, NN			&&}M *40066]002TPT	
 %TM4G4G4I6RRrT   )ra   rb   r   c          	         [        X5      (       a  US:X  a  SnO[        [        R                     " X@5      nUS:X  a  SnO9[        [        R                     " U[        [        R                     " X5      5      n[        [        R
                     " Xg5      $ [        XXS9u  pppn[        U5      u  pSn[        XU/[        X4S9S9n/ n[        S   SU SU	 SU
 3==   S	-  ss'   [        R                  S
UU	U
UR                  5       UR                  5       U5        U(       a*  [        R                  (       dy  [        R                   (       dd  [        XU/[        X4S9S9nUR#                  [$        R&                  R)                  U[*        /U5      5        [-        UUUR/                  5       U5      $ / n[1        5       (       a  UR#                  [2        [*        /5        U(       a  [5        USS9(       ar  UR7                  [8        5        [;        XUS9(       a  UR7                  [<        5        [?        XUS9(       a  UR7                  [@        5        UR7                  [B        5        UR#                  [$        R&                  R)                  UUU5      5        U(       aH  [E        XXX5      (       a7  [G        U5      (       a'  [H        RJ                  " UUUR/                  / SQS9UUS9  U(       a;  [M        XXX5      (       a*  [N        RP                  " UUUR/                  / SQS9UU/ SQS9  [S        XQU5      (       a'  [T        RV                  " UUUR/                  5       UUSS9  [-        UUUR/                  5       U5      $ )zZ
Lowering for autotuning aten.addmm with different backends (Aten, Triton, CUTLASS, etc.)
r   r   rf   )ra   rb   )scalarsr   zaten.addmm_r   r8   zRTuned aten.addmm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sFr   r   )r8   r   r   )reorder)r   r   r8   )ra   rb   input_reorderT)ra   rb   has_bias),r>   r!   r  mulr?   addr;   r9   r    dictr   r  r	  rY   r  max_autotunemax_autotune_gemmr  r   r  r  
aten_addmmr&   r  r-   aten_bias_addmmr6   r  r  r7   r  r4   r  "addmm_contiguous_subgraph_templater1   r+   r   r  r.   r   r  r0   r   r  )rg   rh   ri   ra   rb   r   arg1arg2r   r   r   inp_expandedr+  r,  rB   r-  r  r0  s                     rR   tuned_addmmrQ  Z  s   
 $$19DTXX&t1DA:DTXX&ui.@.LMD"4.. 18C0W-A!T1&9LD #	T"Du,HM #%G ^{1#Qqc1#671<7HH\			 ))_-N-N
 't%'C
 	II**	
 )w8K8K8MvVV IK* =>)&UK,"4VD##$>?,TvN##$RS BC NN			&&}6FM
 	 A11%%66 	2	
 *6a;;** 	2#		
 V400##!	
 %T7M4G4G4I6RRrT   )r   r   c                   SSK Jn  U" XU5      u  pnU R                  5       u  pgUR                  5       u  pUR                  5       u  p[        R                  R
                  R                  Xh5      n[        R                  R
                  R                  SU-  U
5      nUc:  SSKJn  U" UR                  5       U(       a  UOUR                  5       X/US/5      nO
Ub   S5       e[        5       (       a  [        R                  XU4XCS9/O/ nX-  S:w  a:  [        XLX5      (       a)  [        S5      (       a  [         R"                  " XXU/S	S	S
9  [%        SXX4U5      $ )Nr   )r)   r   )FixedLayoutr8   z,out_dtype is ignored if layout is specified.r   sparse_semi_structured_mmTr:  ) torch._inductor.select_algorithmr)   r   r   ro   rp   check_equals_and_simplifytorch._inductor.irrS  r   rY   r-   aten__sparse_semi_structured_mmr%  r1   r+   r   r  r&   )rh   	mat1_metari   r   r   r)   m1k1m2r   k2r   r   r   rS  r  s                   rR   tuned_sparse_semi_structured_mmr^    s^    @ +4DADT]]_FB EBMMOEB	222:A	221r62>A~2OO"I(8FF	
  P"PP  !""	 ,00$' 1 	
   	

 A11 ;<<66d)4tRV	
 %#WY.Ev rT   szc                 P    [        U 5      S:H  =(       d    [        S U  5       5      $ )Nr   c              3   v   #    U  H/  n[         R                  R                  R                  US 5      v   M1     g7f)r8   Nrn   ).0ds     rR   	<genexpr>)_is_tensorwise_scaling.<locals>.<genexpr>  s-      !@B100A66s   79)r!  all)r_  s    rR   _is_tensorwise_scalingrg    s+    GqL S !@B!  rT   	transposec                 p    U(       a  SOSn[         R                  R                  R                  X   S5      $ )Nr   r   r8   rn   )r_  rh  idxs      rR   _is_rowwise_scalingrk    s*    !bC7733BGQ??rT   	tensor_sz	tile_sizec                    U(       a  SOSnU(       a  SOSn[         R                  R                  R                  X   X   5      =(       a8    [         R                  R                  R                  X   [	        X   U5      5      $ )Nr8   r   r   ro   rp   rq   r,   )r_  rl  rm  rh  lhsrhss         rR   _is_blockwise1xTILESIZE_scalingrr    si     !aC!aC7733
 
''


2
2
3rT   c                     [         R                  R                  R                  U S   [	        US   S5      5      =(       a:    [         R                  R                  R                  U S   [	        US   S5      5      $ )Nr      r8   ro  )r_  rl  s     rR   _is_blockwise128x128_scalingru  &  sd    7733
1wy|S) V
''


2
22a5')A,PS:T
UVrT   t
scale_sizescaling_typec                 Z   U=[         R                  :X  a    [        U5      $ =[         R                  :X  a    [	        X5      $ =[         R
                  :X  a    [        XR                  5       SU5      $ [         R                  :X  a  [        XR                  5       5      $  [        SU 35      e)Nrt  Unsupported scaling type )r   
TensorWiserg  RowWiserk  BlockWise1x128rr  r   BlockWise128x128ru  AssertionError)rv  rw  rx  rh  s       rR   is_desired_scalingr  ,  s     #[##)*55 [  &z=='[''2JJL#y  ))/
JJLII #<\N!KLLrT   c                 x    U =[         R                  :X  a    g[         R                  :X  a  g [        SU  S35      e)Nrt  rz  z in get_tile_size)r   r~  r}  r  )scale_options    rR   get_tile_sizer  A  s<    
)[))'' +L>9JK rT   r   r   scale_a_sizescale_b_sizec                     [          H-  u  pE[        XU5      (       d  M  [        XUSS9(       d  M*  XE4s  $    [        SU SU 35      e)NT)rh  z1Inductor Triton does not support scale_a.shape = z, scale_b.shape = )scaling_pairsr  r  )r   r   r  r  scale_option_ascale_option_bs         rR   get_scaling_optionsr  M  s^     +8&
 
 nPTUU!11	 +8 
;L>I[\h[ij rT   c	           	      l   [        XXS9u  ppp[        S   SU	 SU
 SU 3==   S-  ss'   [        R                  SU	U
UU R	                  5       UR	                  5       U5        Sn[        X5        [        X#5      u  pU(       d  XX/nO[        U5      nXXU/n[        USSUS	9n/ n/ n0 n[        5       (       a/  UR                  [        5        [        XgS
9U[        R                  '   [        U5      u  nnUR                  [        R                   :X  Ga  U(       Ga  [#        USSS9(       Ga  [        US9nUR$                  UR$                  nn['        XUU5      u  nn[)        XUS9(       a  U(       d  UR*                  US'   UR*                  US'   [-        UU[.        5      (       a)  UR                  [0        5        UU[0        R                  '   Of[-        UU[2        5      (       aE  [5        U5      US'   [5        U5      US'   UR                  [6        5        UU[6        R                  '   O[9        S5      e[;        XUS9(       a/  U(       d(  UR                  [<        5        UU[<        R                  '   [-        UU[.        5      (       a(  UR                  [>        5        UU[>        R                  '   URA                  [B        RD                  RG                  UUUUS95        UR                  [        R                   :w  a  [I        UUX5      $ U(       aF  [K        XX5      (       a5  [M        U5      (       a%  [N        RP                  " UUURS                  5       US9  U(       a7  [U        XX5      (       a&  [V        RX                  " UUURS                  5       5        [I        UUURS                  5       U5      $ )a	  
Performs an optimized matrix multiplication where scaling factors are applied
to the inputs and/or output.

Args:
    mat1 (Tensor): First input matrix
    mat2 (Tensor): Second input matrix
    scale1 (Tensor): Scale factor applied to mat1 (supports broadcasting)
    scale2 (Tensor): Scale factor applied to mat2 (supports broadcasting)
    bias (Tensor, optional): Optional bias tensor to add to the result
    layout: Layout hint for optimization

Returns:
    Tensor: The result of the scaled matrix multiplication
r   r   zaten._scaled_mm.default_r   r8   z_Tuned aten._scaled_mm.default: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s	scaled_mmr   )mat1_idxmat2_idxr   )r   use_fast_accumTF)enable_float8r   )USE_FAST_ACCUMr   SCALE_RECIPE_ASCALE_RECIPE_BTILE_SIZE_ATILE_SIZE_BzpInductor Triton does not support scaling options that are present in both epilogue scaling and main loop scalingr   )r  )-r;   r   r  r	  rY   r   r)   r    r-   r  aten__fp8_mmrH  r  r9   r   rZ   r   r6   r   r  r7   valuer5   epilogue_scaling_types.scaled_mm_device_tma_epilogue_scaling_templatemain_loop_scaling_typesr  /scaled_mm_device_tma_main_loop_scaling_templater  r4   r  r  r  r   r  r  r&   r1   r+   r   r  r  r.   r   r  )r   r   scale_ascale_bbiasscale_resultr   r  r   r   r   r   rB   scale_a_realscale_b_realr   	bias_realr-  r  r0  r   r   r,  
overridersr  r  r  r  s                               rR   tuned_scaled_mmr  ^  s   8 %,V%!A!U ^7s!A3asCDIDHHi			 DU*!/!AL \@"4(	\K #a!yM #%G IKO-,0-
(() 'v.MAz 	&duUU8
%1%7%79K9Kl)<,*
& #5vFt+9+?+?J'(+9+?+?J'(*0F  !''(VW   N R RS -0G  -:.,I
=),9.,I
=) ''(WX   O S ST %G  .e&Q##$RS JNNO 'N,B
 
 ##K0/9OKOO, NN			&&+	 	' 	
 }}%(wLL 	 A11%%66!)		
 *6a;;**7FM<O<O<QR$T7M4G4G4I6RRrT   indexc                 p    [         R                  R                  U =(       d    S5      nUR                  S:*  $ )Nr      )rZ   r   get_device_propertiesmajor)r  propss     rR   _is_sm7x_or_older_gpur    s)    JJ,,UZa8E;;!rT   c                 &    [        S U  5       5      $ )Nc              3   B   #    U  H  n[        U[        5      v   M     g 7frP   )
isinstancer   )rb  dims     rR   rd  dims_are_int.<locals>.<genexpr>	  s     4tz#s##ts   )rf  )dimss    rR   dims_are_intr    s    4t444rT   r   c           
         ^ [        XX#U5      u  p#n[        X#U/5      (       d  g [        X5      u  pU4S jnS nU" X$X0XU5      n[        UUUUTUU	S9nU
b  UR	                  XS9$ UR                  5       $ )Nc                 >  > [        5       nUR                  SU 5        UR                  SU5        UR                  SU5        UR                  SUR                  R                  SS9  UR                  SUR                  R                  SS9  [	        USU5        [	        US	U5        UR                  S
UR                  R                  5       SS9  UR                  SUR                  R                  5       SS9  TS:X  a  [        XsR                  R                  5        U$ )Nr   r   r   
mat1_dtypeT)is_categorical
mat2_dtyperh   ri   mat1_iscontigmat2_iscontigr?   )r   add_featurer   r   r	   is_contiguousr
   )	r   r   r   rh   ri   mat1_stridemat2_stridecontextrB   s	           rR   get_context%mm_autoheuristic.<locals>.get_context  s    +C#C#C#L$++*;*;DQL$++*;*;DQGV[9GV[9T[[668 	 	
 	T[[668 	 	
 4<"7KK,=,=>rT   c                      g rP   r   r   rT   rR   fallback"mm_autoheuristic.<locals>.fallback2  s    rT   )r  r  r   r  rB   augment_contextprecondition)r   )get_size_hintsr  get_size_hints_stridesr   get_top_k_choices_callerget_choice_caller)rh   ri   r   r   r   r  rB   r   r   r  r   r   r  r  r  r  r  autoheuristics         `           rR   r"  r"    s     Tq1GA!q	""5dAK& !KHG0!M 55 6 
 	
 **,,rT   c                    [        U[        5      (       a  [        U[        5      (       dZ  [        R                  R                  R                  U R                  5       [        R                  R                  R                  S9u  p$[        U[        5      (       a  [        U[        5      (       dZ  [        R                  R                  R                  UR                  5       [        R                  R                  R                  S9u  pCX#U4$ )Nr  )r  r   r   ro   rp   
size_hintsr   rZ   r  r   unbacked_symint_fallback)rh   ri   r   r   r   s        rR   r  r  I  s    aZ3%7%7!!,,MMO__++DD - 

 aZ3%7%7!!,,MMO__++DD - 
 7NrT   c                 j   U R                   R                  nUR                   R                  nX#/n/ nU Hs  n[        U[        5      (       dJ  [        R
                  R                  R                  U[        R                  R                  R                  S9nUR                  U5        Mu     US   US   4$ )Nr  r   r8   )r   rd   r  r   r   ro   rp   r  rZ   r  r   r  r  )rh   ri   r  r  stridesstrides_hintsrd   s          rR   r  r  X  s    ++$$K++$$K(GM&#&&WW%%00//HH 1 F 	V$  ]1---rT   )rk   NrP   )F)NNNFN)NN)r   loggingtypingr   r   r   rZ   torch._dynamo.utilsr   +torch._inductor.autoheuristic.autoheuristicr   1torch._inductor.autoheuristic.autoheuristic_utilsr   r	   r
   r   )torch._inductor.codegen.cpp_gemm_templater   *torch._inductor.remote_gemm_autotune_cacher   torch._inductor.virtualizedr   r   "torch.fx.experimental.proxy_tensorr   torch.nn.functionalr   torch.torch_versionr    r   r  r   codegen.cuda.gemm_templater   r   ,codegen.rocm.ck_tile_universal_gemm_templater   'codegen.rocm.ck_universal_gemm_templater   codegen.subgraphr   r   irr   r   r   r   r-  r    loweringr!   r"   r#   r$   r%   select_algorithmr&   r'   r(   r)   r*   utilsr+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   	mm_commonr9   r:   r;   r<   r=   r>   r  __version__triton_version
has_tritonImportError	getLoggerr   r  r  primsversionhipr  r  r  r  r  cacherS   r?   r`   r
  r   is_available	dtype_outr  rf   rK  _int_mmr>  _sparse_semi_structured_mmdefaultrX  
_scaled_mmr  r^   rj   r   rL  r   r   r  r   r   r   r  rM  r6  r@  rQ  r^  r{  r|  r}  r~  r  r  r  r{   rg  rk  r   rr  ru  Tensorr  r  tupler  r  r  r  r"  r  r  r   rT   rR   <module>r     s     ' '  ( T  F F . 6 + , > U M D E 8 8 *      !&"4"45NJ
 !yy~~		
 		!n&?  ,
 
.	/'+" ,		 :;  2@	0	 ;<2 . 3A	1	 <=3 / 2@	-	 NO2 . " " UXX|
M"	HH$yy5577=U	!!	  	KKdjjnn
 "	MM$$,,2B2B #5	$$$//77	#  "	*8K8K
8 (,11 I4 %Z6#  0  F !: ; ) D'/ #5_m#  &8*,<& "
 4775u4 u 6up 4<<T:'+ -S ;-S` 4::48*+!D sS 9sSl 422M(,T- N-b [334+--.!=!=>!;!;<!!;#=#=> &00+2E2EF &55{7S7ST s t @C @D @T @
			(+	8<			VS VS VT V 	M
MM M 	M
 
M*	3 	 ,, ,,	
 ;#$" 4??**E 
`S F`SF # 4  
5  :- C=:-z._   !'*NJs   T TT