
    ȅiG                     f   S SK r S SKrS SKJr  S SKJr  S SKrS SKrS SKJ	s  J
r
  S SKJr  S SKJr  S SKJrJrJrJrJrJrJr  S SKJr  S SKJr  S S	KJr  S S
KJr  \R>                  " \ 5      r!S\RD                  RF                  S\$\   S\%S\4S jr& " S S\RN                  5      r( " S S\5      r)g)    N)Callable)Any)ir)KernelTemplate)BufferFixedLayoutget_free_symbolsget_symbolic_inputsgm_original_output_stridesir_node_to_tensorLayout)benchmarker)do_bench_using_profilingV)
OrderedSetgminputsnamereturnc                     SSK Jn  [        R                  R                  n U [        R                  l        U" X5      U[        R                  l        $ ! U[        R                  l        f = f)a  Inline a subgraph by converting its FX operations to individual IR nodes.

This converts a subgraph to multiple ComputedBuffer nodes (fusable),
enabling epilogue fusion with subsequent operations.

Returns:
    TensorBox containing the final operation result as individual IR nodes
r   )process_subgraph_nodes)torch._inductor.loweringr   r   graphmodule)r   r   r   r   original_modules        Z/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/codegen/subgraph.pyinline_subgraph_to_ir_nodesr      sF     @ ggnnO)%b1(s   A A+c                     ^  \ rS rSrSr SS\S\\   S\S\S\	S	\
4   S
\\\	\
/\R                  4   4   S-  SS4U 4S jjjrS\\   4S jrS\	S	\
4   S\\\
4   SS4S jrS\4S jrS\
4S jrS\\
   S\R                  S\4S jrS\\
   S\R                  SS4S jrS\4S jrS\R2                  4S jrS\\\
4   4S jrS\4S jrSrU =r$ )SubgraphChoiceCaller4   z
Represents a Subgraph Autotuning choice, and the subgraph can be any arbitrary
GraphModule. Compiles the Subgraph down to a module for benchmarking.
Nr   input_nodeslayoutdescriptionmake_fx_graph.input_gen_fnsr   c           	        > [         T	U ]  XX45        / U l        [        R                     [        U R                  5       H  u  px[        [        UR                  5       SS95      S:X  d   e[        [        UR                  5       SS95      S:X  d   eUR                  R                  5         Ub*  Xv;   a%  U R                  R                  Xg   " U5      5        M  U R                  R                  [        U5      5        M     S S S 5        U" U R                  6 U l        [!        U R                  5        [#        U R                  5      U l        U R'                  5       U l        S U l        0 U l        S U l        g ! , (       d  f       N|= f)NT)unbacked_onlyr   )super__init__example_inputsr   	fake_mode	enumerater"   lenr	   get_size
get_stridedatafreeze_layoutappendr   r   r   r
   
sym_inputs_compute_sym_input_valuessym_input_valuesdecompositiondecomposition_kwargs_compiled_module)
selfr   r"   r#   r$   r%   r&   iinp	__class__s
            r   r*   SubgraphChoiceCaller.__init__:   s<    	F@ [[#D$4$45+CLLN$OPTUUUU+CNN,<DQRVWWWW&&( !,1C''..}/?/DE''../@/EF 6   !4!45"477+-d.>.>? $ > > @ 9=46!%)3 [s   CE77
Fc           	         [        U R                   Vs/ s H"  n[        US5      (       d  M  UR                  PM$     sn5      n0 n[	        U R
                  U R                  5       H  u  pE[        U[        R                  5      (       d  M&  [	        UR                  5       UR                  5       He  u  pg[        U[        R                  5      (       a  [        U5      X6R                  '   M=  [        U5      U;   d  MN  [        U5      U[        U5      '   Mg     M     / nU R                   H  n	[        U	[        R                  5      (       a/  U	R                  U;   a  UR!                  X9R                     5        MQ  ["        R$                  R&                  R(                  R+                  U	5      n
UR!                  U
b  [        U
5      OS5        M     U$ s  snf )a  Extract concrete dimension values for sym_inputs from example_inputs.

The compiled module expects symbolic dimension values as runtime arguments.
This maps each symbolic variable to its concrete value from the example tensors.
Used for range based autotuning.
r      )r   r4   hasattrr   zipr"   r+   
isinstancetorchTensorr/   shapesympySymbolintstrr3   r   r   sizevars	shape_env	size_hint)r:   ssym_input_namessym_name_to_valueinp_nodeexample_inpsym_dim
actual_dimresultsym_varhints              r   r5   .SubgraphChoiceCaller._compute_sym_input_valuesa   s\    %!__C_60BVQVV_C

 -/%()9)94;N;N%O!H+u||44+.x/@/@/BKDUDU+V'G!'5<<88:=j/),,7W8:=j/)#g,7	 ,W &P G'5<<00W\\EV5V/=>ww''11;;GD4+;c$iC ' ) Ds
   GGr7   kwargsc                     Xl         X l        g)zHCache decomposition function and kwargs for range-based dispatch lookup.N)r7   r8   )r:   r7   rY   s      r   cache_decomposition(SubgraphChoiceCaller.cache_decomposition   s     +$*!    c                 "    SU R                    S3$ )NzSubgraphCaller()r   r:   s    r   __str__SubgraphChoiceCaller.__str__   s     1--r]   c                    SSK Jn  U R                  R                  SS5      R                  SS5      nU" U R                  U R
                  [        R                  R                  [        R                  R                  [        R                  R                  [        R                  R                  [        R                  R                  [        R                  R                  SU 3S9	nU R                   H@  nXCR                  UR                  '   UR                   R#                  UR                  5        MB     [        R$                  " U5         [&        R(                  " SSS	S
9   UR*                  " U R
                  6   UR-                  5       sSSS5        sSSS5        $ ! , (       d  f       O= f SSS5        g! , (       d  f       g= f)zCCompile the subgraph for benchmarking, returns the compiled module.r   )GraphLoweringz::_.
benchmark_)	r   r+   rL   cpp_wrapperaot_modeextern_node_serializeris_inferenceis_backwardr   FATEN)max_autotunemax_autotune_gemmmax_autotune_gemm_backendsN)torch._inductor.graphre   r   replacer   r+   r   r   
_shape_envri   rj   rk   rl   rm   r4   graph_inputsgraph_input_namesr3   set_graph_handlerconfigpatchruncompile_to_module)r:   re   	safe_namebm_graph_loweringsym_inps        r   _compile_for_benchmarking.SubgraphChoiceCaller._compile_for_benchmarking   sK   7II%%dC088cB	)ww..gg((++WW%%#$77#A#A--++i[)

 G;B**7<<8//66w||D '   !23""'+1
 "%%t':':;(::<  43   433s$   F<%)F!	F<!
F/	+F<<
G
argsoutc                >  ^^^ U R                   c  U R                  5       U l         U R                   R                  mU R                  m[        R
                  (       a  [        UUU4S j5      $ [        R                  " UUU4S j[        R                  " / TQTQ76 S9$ )zBRegular benchmarking: compile and use benchmarker with warmup/rep.c                     > T" / TQT Q5      $ N r   bm_funcr4   s   r   <lambda>0SubgraphChoiceCaller.benchmark.<locals>.<lambda>   s    G<Pj<P4<P4Qr]   c                     > T" / TQT Q5      $ r   r   r   s   r   r   r      s    G0j0401r]   )device)
r9   r   callr6   rx   /profile_bandwidth_with_do_bench_using_profilingr   r   	benchmarkinfer_device)r:   r   r   r   r4   s     `@@r   r   SubgraphChoiceCaller.benchmark   s}      ($($B$B$DD!'',,**
AA+,QRR$$1++?Z?$?
 	
r]   c                    U R                   c  U R                  5       U l         U R                   R                  / U R                  QUQ5        g)zFRun once for collective benchmarking (barrier sync handled by caller).N)r9   r   r   r6   )r:   r   r   s      r   benchmark_collective)SubgraphChoiceCaller.benchmark_collective   sD      ($($B$B$DD!""#BT%:%:#BT#BCr]   c           
         SR                  U R                  R                  SS5      S   /U R                   Vs/ s H  n[	        UR                  5       5      PM     snQU R                   Vs/ s H  n[	        UR                  5       5      PM     snQ[	        U R                  R                  5      P5      $ s  snf s  snf )N-rf   r@   r   )	joinr   rsplitr"   rJ   r/   r0   r   r   )r:   r<   s     r   hash_keySubgraphChoiceCaller.hash_key   s    xx		  a(+151A1AB1A##clln%1AB 483C3CD3CC#cnn&'3CD DGGMM"	
 	
 CDs   #B7
-#B<
c           
          [         R                  R                  [         R                  " U R                  U R
                  U R                  U R                  U R                  S95      $ )N)r#   r"   r   r+   subgraph_name)	r   	TensorBoxcreateSubgraphBufferr#   r"   r   r+   r   ra   s    r   output_node SubgraphChoiceCaller.output_node   sN    ||""{{ ,,77#22"ii
 	
r]   c                      SU R                   S.$ )zRInformation returned here is logged to the autotune log file when that is enabled.subgraph)backendkernel_namer`   ra   s    r   	info_dictSubgraphChoiceCaller.info_dict   s     "99
 	
r]   c                      SU R                    3$ )N	subgraph_r`   ra   s    r   autoheuristic_id%SubgraphChoiceCaller.autoheuristic_id   s    499+&&r]   )r9   r7   r8   r+   r   r6   r4   r   )__name__
__module____qualname____firstlineno____doc__rJ   listr   r   r   r   dictrI   rD   rE   r*   r5   r[   rb   r   floatr   r   r   r   r   r   r   r   __static_attributes____classcell__r=   s   @r   r    r    4   sj    JN%*%* &\%* 	%*
 %*  S)%* C3%*=!>>?$F%* 
%* %*N49 <+%c3h/+9=c3h+	+. .=3 =@
tCy 
u|| 
 
D$s) D%,, D4 D
# 
	
R\\ 	

4S> 
'# ' 'r]   r    c                   \  ^  \ rS rSrSr\R                  " 5       rS\4U 4S jjr	  SS\S\
\   S\S\S	\4   S
\S\\\\/\R$                  4   4   S-  S\S\4S jjr  SS\S\
\S	\4      S\
\   S\
\\\4      S\S	\4   S-  S\\\\/\R$                  4   4   S-  S\
\   4S jjrS\S	\4   S\\\4   S\4S jrS\\\4   SS4S jrS\S\
\S	\4      S\
\   SS4S jr  SS\
\   S\S	\4   S\\\4   S\S	\4   S-  S\\\\/\R$                  4   4   S-  S\4S jjrSrU =r$ )SubgraphTemplate   z
A template for subgraph evaluation to be used in autotuning.

This class allows creating customized subgraphs that can be appended
as choices during the autotuning process, enabling the selection of
optimal implementations for complex operations.
r   c                     > [         TU ]  US9  g)zd
Initialize a subgraph template.

Args:
    name: The name of this template
    graph: The FX graph
r`   N)r)   r*   )r:   r   r=   s     r   r*   SubgraphTemplate.__init__   s     	d#r]   Nr"   r#   r%   .r$   r&   rY   r   c           	      V    [        U S[        [        R                  5       3UUUUUS9$ )a!  
Generate a SubgraphChoiceCaller instance for autotuning.

Args:
    name: The name for this subgraph choice
    input_nodes: List of input nodes to the subgraph
    layout: Memory layout information for the output
    make_fx_graph: Callable that creates the FX graph for this subgraph
    description: Optional description of this choice
    input_gen_fns: Optional dict mapping input indices to tensor generators
    **kwargs: Additional keyword arguments

Returns:
    SubgraphChoiceCaller: A callable object that can be used for autotuning
rf   )r   r"   r#   r$   r%   r&   )r    nextr   index_counter)r:   r   r"   r#   r%   r$   r&   rY   s           r   generateSubgraphTemplate.generate   s;    4 $64 0 > >?@A##''
 	
r]   decompositionsnon_tensor_argsdefault_implc                   ^ U(       d  / $ [        U5      [        U5      :X  d    S[        U5       S[        U5       S35       e[        X$5       VVs/ s H  u  pxU R                  X7XU5      PM     n	nnU R                  XU	5        U	S   n
/ n[        X$5       H  u  p|SSKmUUS.S[
        S[        S	[
        4   S
[        [        [
        4   S[
        4U4S jjjnU R                  X|5      nU R                  U SU 3UU
USUR                   3US9nUR                  X|5        UR                  U5        M     U$ s  snnf )a  
Generate multiple SubgraphChoiceCaller instances for custom op autotuning.

This method extends SubgraphTemplate to support custom op decompositions,
allowing multiple implementations to compete in autotuning.

Args:
    name: Base name for the choices
    decompositions: List of decomposition functions to compete in autotuning
    input_nodes: List of tensor inputs. All tensor arguments must be passed here.
    non_tensor_args: List of non-tensor kwargs only, one dict per corresponding decomposition.
    default_impl: Default implementation for layout inference
    input_gen_fns: Optional dict mapping input indices to tensor generators

Returns:
    List of SubgraphChoiceCaller instances for autotuning
z>decompositions and non_tensor_args must have same length, got z decompositions and z kwargsr   N)decompdecomp_kwargsr   r   .r   r   c                 `   > SSK Jn  SSKJn  U" 5       nU" TR                  " U 40 UD6US9" U6 $ )Nr   )make_fx   )select_decomp_table)decomposition_table)"torch.fx.experimental.proxy_tensorr   r7   r   partial)r   r   r   r   r   r   	functoolss         r   r%   BSubgraphTemplate.generate_custom_op_choices.<locals>.make_fx_graphM  sB     G?&9&;#%%f>>(;  r]   rf   z	CustomOp )r   r"   r#   r%   r$   r&   )r.   rB   _infer_custom_op_layout_validate_layout_equivalencer   r   r   r   rJ   _generate_variant_namer   r   r[   r3   )r:   r   r   r"   r   r   r&   r   rY   layoutsr#   choicesr   r%   variant_namechoicer   s                   @r   generate_custom_op_choices+SubgraphTemplate.generate_custom_op_choices  s   4 I>"c/&:: 	
~&'';C<P;QQXZ	
: #&n"F	
 #G ((V= #G	 	 
 	))$H.0%(%I!F .40= c*  $CH~ 	 $  66vML]]vQ|n-'+''89+ # F &&v=NN6"E &JH a
s   D:r   c                     UR                   nU(       d  U$ SR                  S [        UR                  5       5       5       5      nU SU 3$ )zLGenerate a descriptive name for a decomposition variant with its parameters.rf   c              3   4   #    U  H  u  pU S U 3v   M     g7f)rf   Nr   ).0kvs      r   	<genexpr>:SubgraphTemplate._generate_variant_name.<locals>.<genexpr>v  s     N7Mtq1#Qqc
7Ms   )r   r   sorteditems)r:   r   rY   	base_nameparam_suffixs        r   r   'SubgraphTemplate._generate_variant_nameo  sF     OO	xxNvflln7MNNAl^,,r]   c                     UR                  5        HA  u  p#[        U[        R                  [        45      (       d  M,   SU S[        U5       S35       e   g)z8Validate that kwargs contains only non-tensor arguments.zkwargs['z'] contains tensor zo. Tensor arguments should be in input_nodes, not kwargs. Only scalar/non-tensor parameters should be in kwargs.N)r   rC   rD   rE   r   type)r:   rY   keyvalues       r   _validate_non_tensor_kwargs,SubgraphTemplate._validate_non_tensor_kwargsy  sV     ,,.JC!%%,,)?@@ 3%24;- @I J@ )r]   op_namer   c                 2   U(       d  gUS   n[        USS SS9 H  u  pVUR                  UR                  UR                  UR                  4UR                  UR                  UR                  UR                  4:w  d  Me  [        SU SX%   R                   SUR                   SUR                   SUR                   SUR                   S	US   R                   SUR                   SUR                   SUR                   SUR                   S
35      e   g)zXEnsure all layouts have consistent stride, device, dtype, and sizes for fair autotuning.Nr   r@   )startzLayout mismatch in custom op 'z': decomposition 'z' produces (z, z) but 'r_   )r-   r   dtypesizestrideAssertionErrorr   )r:   r   r   r   	referencer;   r#   s          r   r   -SubgraphTemplate._validate_layout_equivalence  s    AJ	"712;a8IAv||V[[&--H    	M  %4WI >&&4&7&@&@%A BbbR W*1-667 8!(()IOO+<By~~>NbQZQaQaPbbc	e  9r]   function_decompositionc           	      v   SSK nSSKJn  U R                  U5        UR                     / n[        U5       H  u  pU(       a  X;   a  XY   " U
5      nOU
R                  5       nUR                  R                  R                  U[        R                  S9nU
R                  5       nUR                  R                  R                  U[        R                  S9n[        R                  " UUU
R                  5       U
R!                  5       S9nUR#                  U5        M     UR$                  " U40 UD6nU" U6 n['        U[        R(                  5      (       d   S[+        U5       S35       e[-        UR.                  UR0                  UR2                  UR5                  5       S9sSSS5        $ ! , (       d  f       g= f)	zInfer output layout for custom ops using the default implementation when available.
Note that the Subgraph assumes custom ops return exactly one tensor output.
TODO: Add support for multiple output custom ops.
r   Nr   )fallback)r   r   z#Expected single tensor output, got z:. Multi-output custom ops not yet supported in autotuning.)r   r   r   r   )r   torch._inductor.virtualizedr   r   r,   r-   r/   r   rK   
size_hintsrx   unbacked_symint_fallbackr0   rD   empty_strided	get_dtype
get_devicer3   r   rC   rE   r   r   r   r   rF   r   )r:   r"   r   rY   r   r&   r   r   r+   r;   r<   fake_tensor	raw_shapeconcrete_shape
raw_strideconcrete_stridefnoutputs                     r   r   (SubgraphTemplate._infer_custom_op_layout  s    	1 	((0[[N#K0 Q%7"/"23"7K #I%&WW%5%5%@%@!F,K,K &A &N "%!1J&'gg&6&6&A&A"V-L-L 'B 'O #("5"5&'!mmo"~~/	#K %%k2% 1( ""#9DVDB(F fell33 5d6l^ DK L3
 }}ll\\}}	? [[s   E8F**
F8r   ) N)NN)r   r   r   r   r   	itertoolscountr   rJ   r*   r   r   r   r   r   r   rI   rD   rE   r    r   r   r   r   r   r   r   r   r   s   @r   r   r      sk    OO%M$$& IM!
!
 &\!
 	!

  S)!
 !
 C3%*=!>>?$F!
 !
 
!
R 37IMSS Xc3h/0S &\	S
 d38n-S sCx(4/S C3%*=!>>?$FS 
"	#Sj-sCx(-26sCx.-	-$sCx. T  Xc3h/0 f	
 
> 37IM7&\7 !)c 27 S#X	7
 sCx(4/7 C3%*=!>>?$F7 
7 7r]   r   )*r  loggingcollections.abcr   typingr   rG   rD   torch._inductor.config	_inductorrx   torch._inductorr   torch._inductor.codegen.commonr   torch._inductor.irr   r   r	   r
   r   r   r   $torch._inductor.runtime.benchmarkingr   torch._inductor.utilsr   r   r   torch.utils._ordered_setr   	getLoggerr   logfxGraphModuler   rJ   r   ChoiceCallerr    r   r   r]   r   <module>r     s      $    ' '  9   = : ) / !))&*3i)7:)),h'2?? h'Vt~ tr]   