
    ȅi	                       % S r SSKJr  SSKrSSKrSSKrSSKJr  SSKJ	r	J
r
JrJrJr  SSKrSSKJrJr  SSKJr  SSKJrJrJrJrJrJr  SS	KJrJr  SS
KJrJ r J!r!J"r"J#r#J$r$J%r%J&r&  SSK'J(r(  SSK)J*r*  SSK+J,r,  SSK-J.r.  \(       a2  SSK/J0r0  SSK1J2r2J3r3  SSK4J5r5  SSK6J7r7  SSK8J9r9  SSK:J;r;  SSK<J=r=  SSK>J?r?  \R                  " \A5      rB\R                   " S S5      5       rD\ErFS\GS'   S0S jrHS1S jrIS2S jrJ      S3S  jrK        S4S! jrL            S5S" jrM            S5S# jrN          S6S$ jrO " S% S&5      rP " S' S(\P5      rQ\R                   " S) S\D5      5       rR\R                   " S* S+\D5      5       rS\R                   " S, S-\D5      5       rT\R                   " S. S/\D5      5       rUg)7a  
This provides an abstract class which parametrizes over an "output code" concept
for Inductor.  Intuitively, this represents the compiled callable which Inductor
produces which you can call to get optimized code.  However, this callable
has some other capabilities:

- It is serializable, so you can save/load this product from disk without
  having to do compilation again.

- (When using remote cache) it is addressable, so you can save just a key
  which you can use to load this product from remote cache later.

This class is abstract because we have several different implementations of
serialized format:

- Python wrapper (the default)

- AOTInductor (this produces ABI stable binaries which work across PyTorch
  versions)

    )annotationsN)partial)AnyOptionalTYPE_CHECKING	TypeAliasUnion)countersget_runtime_metrics_context)inductor_compiled_code)BoxedDeviceIndexCudagraphCachedInfoCudagraphMetadata get_partition_cudagraph_metadataget_placeholder_info#log_cudagraph_skip_and_bump_counter)has_frozen_paramsis_frozen_param)&_unstable_customized_partition_wrapperalign_inputs_from_check_idxs	BoxedBoolCUDAGraphWrapperMetadataGraphPartitionMap	InputTypeoutput_node"set_tracing_context_output_strides)
OrderedSet)is_in_torch_dispatch_mode   )config)AutotuneCacheBundler)Counter)CallableSequence)metrics)GraphLowering)FakeScriptObject)Weights)_CompileFxKwargs)TritonBundlec                      \ rS rSr% \R
                  " SSS9rS\S'   \R
                  " SSS9rS\S'   \R
                  " SSS9r	S	\S
'   SS jr
SS jr        SS jrSS jrSrg)
OutputCodeK   NFdefaultinitOptional[str]_fx_graph_cache_keyzOptional[list[str]]_fx_graph_cache_debug_lineszOptional[int]_time_taken_nsc                *    [        [        U 5      5      eNNotImplementedErrortypeselfinputss     U/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/output_code.py__call__OutputCode.__call__X       !$t*--    c                *    [        [        U 5      5      er6   r7   r;   s    r=   prepare_for_serialization$OutputCode.prepare_for_serialization[   r@   rA   c                *    [        [        U 5      5      er6   r7   r;   example_inputs	constantsgraph_kwargss       r=   post_compileOutputCode.post_compile^   s     "$t*--rA   c                *    [        [        U 5      5      er6   r7   r;   triton_bundles     r=   set_triton_bundleOutputCode.set_triton_bundleg   r@   rA    r<   zSequence[Any]returnr   rT   NonerH   Sequence[InputType]rI   CompiledFxGraphConstantsrJ   r)   rT   rV   rO   r   rT   rV   )__name__
__module____qualname____firstlineno__dataclassesfieldr2   __annotations__r3   r4   r>   rD   rK   rP   __static_attributes__rR   rA   r=   r,   r,   K   s    
 *5):):4e)TT7B7H7H58!4 
 %0$5$5d$ONMO...+. ,. '	.
 
..rA   r,   r   _StrideExprStrc                    [        U [        R                  5      (       d  g [        U R                  5       Vs/ s H2  oR                  U5      S:X  d  M  U R                  U5      S:w  d  M0  UPM4     sn$ s  snf Nr   r   )
isinstancetorchTensorrangendimstridesize)tis     r=   get_expanded_dimsro   r   sS    a&&QVV}L}!q(8AQVVAY!^A}LLLs   A4A4+A4c                n    U H.  n[         R                  R                  R                  XSS5      n M0     U $ re   )rg   opsatenslice)rm   expanded_dimsexpanded_dims      r=   index_expanded_dimsrv   y   s-    %IINN  !Q7 &HrA   c                   [         R                  (       a  g[        U [        U 5      5      R	                  5       n [
        R                  " U 5      S:w  a  U R                  5       nU R                  n[        [        [        U5      5      5      n[        [        X5      5       VVs/ s H  u  pEUPM	     nnn[        [        U5      5       H6  nUS:X  a  SO	XUS-
        nUS:X  a  SO	X#US-
        nXU      Xx-  :  d  M6    g   gs  snnf )NTr   r   F)r    *always_complex_memory_overlap_TESTING_ONLYrv   ro   squeezerg   _debug_has_internal_overlaprk   shapelistri   lensortedzip)	rm   stridessizesindices_xrn   prev_stride	prev_sizes	            r=   complex_memory_overlapr      s    88 	A034<<>A((+q0((*uS\*+!'G(=!>?!>1!>?s7|$A Av!71q5>+BK!Vq1u~)>Iqz"[%<<	 %
  @s   $DCompiledFxGraphc                `  ^^ U R                   c   eU R                  S   nU(       a  [        R                  R                  (       ag  Uc   eUR
                  c   eU R                   m[        R                  R                  R                  UR
                  SS9mTc   eSUU4S jjnX0l         g g g )Nis_backwardF)create_if_none_existsc                4   > TR                  5         T" U 5      $ r6   )set_to_running_backward)
new_inputscompiled_graph_callablemanagers    r=   compiled_artifact;maybe_handle_backward_generation.<locals>.compiled_artifact   s    ++-*:66rA   )r   z	list[Any]rT   zCallable[..., Any])	current_callable	fx_kwargsr    tritoncudagraph_treesvaluerg   	_inductorget_manager)compiled_graphboxed_forward_device_indexr   r   r   r   s       @@r=    maybe_handle_backward_generationr      s     **666 **=9K
 v}}44)555)//;;;"0"A"A//11==&,,E > 
 """	7 	7 +<' 5{rA   c                j   [         R                  R                  (       d5  U H/  n[        U[        R
                  5      (       d  M$  [        U5        M1     U R                  S   nU R                  S   nUb>  U(       d6  U(       d.  UR                  [        [        U R                  5      5      5        g g g g )Nis_inferencer   )r    r   r   rf   rg   SymIntintr   setnextiterdevice_idxs)r   rH   r   rm   r   r   s         r=   prepare_cudagraph_post_compiler      s    
 ==((A!U\\**A   "++N;L **=9K!-l;"&&tD1K1K,L'MN LWl-rA   c                   UR                   c   eUR                  c   eUR                  nUR                  nUR                  S   nUR                  S   nU(       d  UR                  n	U	S   n
UR                  nUR
                  n[        XU5        SSKJn  UR                   nUc   eUR                  5        VVs0 s H*  u  nn[        U[        R                  5      (       d  M'  UU_M,     nnnU" UU
=(       d    S[        [        UR                  5      5      UUU[!        UR#                  5       5      U[!        UR$                  5      S9	Ul         g[&        R(                  " U5        [+        X5        S	UR,                  ;   a6  UR.                  (       a  [1        UR.                  5        g[1        S
U 35        ggs  snnf )z
Checks for any reasons not to run cudagraphs and then
runs it on compiled_graph.
Mutates the `compiled_graph.current_callable` and `cudagraphs`
Nr   r   static_input_idxsr   cudagraphifyrR   r   device_indexstack_tracesr   r   rI   placeholdersmutated_input_idxscudaskipping cudagraphs due to )r   cudagraph_infocudagraph_fail_reasonsr   r   r   r   
compile_fxr   itemsrf   rg   rh   r   r   r   tuplevaluesr   r   disabler   device_typesdisabled_cudagraphs_reasonr   )rH   r   
cudagraphsrI   r   cached_infor   r   r   r   r   r   r   r   r   kvtensor_constantss                     r=   cudagraph_post_compiler      s    **666((444 //K(??!++N;L **=9K!",,	%&9:"//"//&,F	
 	-)::+++ '__.
.TQ*Q2MDAqD. 	 
 +7/52d>#=#=>?%#%,3356%$^%F%FG
+
' 	*%(T^000 883"== 412H1IJ 1'
s   ;&F:%F:c                   UR                   c   eUR                   R                  nU(       d&  UR                  b  [        UR                  5      S:X  a"  [        R
                  " U5        [        X5        gSSKJn  UR                  c   eUR                  c   eUR                  S   nUR                  S   n[        UR                  S   =(       d    S5      n	UR                  n
[        [        UR                   5      5      nUR#                  5        VVs0 s H(  u  p[%        U[&        R(                  5      (       d  M&  X_M*     nnn[+        UR                   R,                  U	U
UR                   R.                  U5      n[1        XU5        / nUR                   H  n[3        UU5      n[5        U[7        UR8                  5      UUR.                  UU[7        UR:                  R=                  5       5      UR,                  [7        UR                  5      S	9	nUR?                  U5        M     UR                  U5        gs  snnf )
a3  
Cudagraphify each partition functions, which first prepares the necessary
metadata and then applies the cudagraphify function to each partition.

Assuming all partition functions are cudagraphified and share the same order
as `compiled_graph.partition_maps`. See [Note: Graph Partition Map for CUDAGraph].
Nr   r   r   r   r   r   rR   r   ) r   r   partition_mapsr}   r   r   r   r   r   r   recursively_apply_fnsr   r   r   r   r   r   r   rf   rg   rh   r   r   r   r   r   r   r   r   rI   r   append)rH   r   r   rI   r   r   r   r   r   r   r   r   r   r   r   graph_metadatacudagraphify_fnspartition_mappartition_metadatacudagraphify_fns                       r=    cudagraph_partition_post_compiler     s    ((444+::QQ 	((0~,,-2 	*%(T(**666//;;;!++N;L **=9K">#;#;<O#P#VTVW'::^7789L #**jELL.I*   '%%22%%22N #(B '66=

 "#$6$H$HI%+88#%.88??AB+88$%7%J%JK

 	0# 7& (()9:Os   %H:>H:c                    U (       d;  UR                   c   e[        UR                   X#5      nXAR                   La  XAl         ggg)z
Realigns input strides from inputs_to_check if
we didn't end up running cudagraphs. Mutates
`compiled_graph.current_callable` if cudagraphs
was run. Otherwise, does nothing.
N)r   r   )ran_cudagraphsr   inputs_to_checkmutated_inputs_idxsnew_callables        r=   maybe_realign_inputsr   U  sL     ..:::3++_
 >>>.:+ ? rA   c                  "    \ rS rSrSrSS jrSrg)rY   ij  a,  Wrapper class that unwraps constants from a compiled fx graph. This
version of the class only supports directly grabbing the saved constants off of
a CompiledFxGraph.

With freezing, FxGraphCache doesn't store the constants of the input
GraphModule it gets from AOTAutograd. Instead, it saves just the **names**
of those constants, and grabs the constant values directly from the graph module
passed in at runtime.

Thing is, we don't always *have* the graph module available at runtime, hence
the existence of this class and its CompiledFxGraphConstantsWithGm counterpart.

To support freezing, FXGraphCache gets passed a CompiledFxGraphConstantsWithGm during
post compile. Otherwise, CompiledFxGraphConstants supports the basic case of loading
the value of constants directly off of the original saved object.
c                T    UR                   c   e0 UR                   EUR                  E$ r6   )rI   opaque_value_type_classes)r;   gs     r=   unwrapCompiledFxGraphConstants.unwrap|  s,    {{&&&=!++=!<!<==rA   rR   Nr   r   rT   $dict[str, Union[torch.Tensor, type]])r[   r\   r]   r^   __doc__r   rb   rR   rA   r=   rY   rY   j  s    ">rA   rY   c                  ,    \ rS rSrSrSS jrSS jrSrg)	CompiledFxGraphConstantsWithGmi  a  
This version of CompiledFxGraphConstants, instead of grabbing constants
directly saved on CompiledFxGraphs, will just grab their names. Then, it takes
a second GraphModule to grab the corresponding constant values out of.

This is necessary for supporting freezing in FxGraphCache.
c                    Xl         g r6   gm)r;   r   s     r=   __init__'CompiledFxGraphConstantsWithGm.__init__  s    rA   c           	         UR                   R                  5        VVs0 s H  u  p#U[        U R                  U5      _M     nnnUR                  =(       d    0 n0 UEUEUR
                  E$ s  snnf r6   )frozen_param_namesr   getattrr   rI   r   )r;   r   name	orig_namefrozen_paramsrI   s         r=   r   %CompiledFxGraphConstantsWithGm.unwrap  su     $%#7#7#=#=#?
#? '$''9--#? 	 
 KK%2	L)L}L0K0KLL
s   #A+r   N)r   torch.fx.GraphModulerT   rV   r   )r[   r\   r]   r^   r   r   r   rb   rR   rA   r=   r   r     s    MrA   r   c                  b   \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   \R                  " S
S9rS\S'   \R                  " S
S9r	S\S'   \R                  " S
S9r
S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   S\S'   S\S '   S!\S"'   S#\S$'   S%\S&'   S!\S''   S!\S('   S!\S)'   S*\S+'   S,\S-'   S.\S/'   S0\S1'   S2rS3\S4'   S2rS5\S6'   S
rS7\S8'      SB                                   SCS9 jjrSDS: jrSES; jr        SFS< jrSGS= jrSDS> jrSHS? jrSIS@ jrSArg2)Jr   i  zf
Class holding a compiled FX graph. This is the object serialized on disk
to support FxGraph caching.
Optional[Callable[..., Any]]r   r   Optional[Any]compiled_fn_runnerstr	cache_keyF)reprsource_coderunnable_graph_strinductor_post_grad_graph_strzOptional[list[tuple[int, str]]]cache_linemapzOrderedSet[str]r   OrderedSet[int]r   mutated_inputsr   z!Optional[dict[str, torch.Tensor]]rI   zdict[str, str]r   z3dict[str, torch._C.ScriptObject | FakeScriptObject]torchbind_constantszdict[str, type]r   z4Optional[list[Optional[tuple[_StrideExprStr, ...]]]]output_stridesr1   r   metrics.CachedMetricsDeltasmetrics_deltasCounter[str]counter_deltasguards_exprinductor_provenance_mapping_str$inductor_provenance_stack_traces_strzOptional[CudagraphCachedInfo]r   z!Optional[list[GraphPartitionMap]]r   r)   r   Sequence[int]r   NzOptional[bool]_boxed_callzOptional[TritonBundle]_triton_bundlebool_wrap_compiled_regionsc                L   Xl         Xl        Ub  UR                  OS U l        UR                  U l        UR                  (       a3  [        UR                  5       nUR                  5       U l        S S S 5        Xl        Xl	        UU l
        UU l        UR                  U l        [        UR                  5      U l        [        UR                  5      U l        [        UR                   5      U l        [        UR"                  5      U l        [%        U5      (       d  UR&                  U l        0 U l        Oo0 U l        0 U l        UR&                  R+                  5        HC  u  nn[-        U5      (       a  UR.                  U   U R(                  U'   M4  UU R&                  U'   ME     UR0                  U l        UR2                  U l        X@l        XPl        X`l        Xpl        S U l        S U l        UR@                  U l         0 U l!        SU l"        S nU(       Ga  U R6                  (       aU  SU R                  ;   a  [G        SU R6                   35        O[H        S   S==   S-  ss'   [J        RL                  " U5        GO_[O        S U	 5       5      n[P        RR                  RT                  (       d8  SS	K+J,n  U" UU R                   U R"                  U
5      nUS LnU(       a  UU l        OS nU(       + S
4U(       + S4[[        S U	 5       5      S4/n[]        U5      n[_        UR`                  5      S:X  d   eUR`                  S    Vs/ s HD  n[c        U[d        Rf                  Rh                  Rj                  5      (       a  URl                  OS PMF     nnU VVs/ s H  u  nnU(       a  M  UPM     n nn[o        [q        URr                  5      5      n![u        U!UU 5      nUU l        Xl"        Xl!        SU l;        [P        Rx                  U l=        g ! , (       d  f       GN= fs  snf s  snnf )NrR   r   r   inductorcudagraph_skipsr   c              3  z   #    U  H1  n[        U[        R                  5      (       d  M$  [        U5      v   M3     g 7fr6   )rf   rg   rh   r   .0rm   s     r=   	<genexpr>+CompiledFxGraph.__init__.<locals>.<genexpr>  s0      4+!!U\\2 .*1--+s   #;;r   )3check_for_mutation_ignore_cuda_graph_managed_tensorzmutated inputszcomplex memory overlapc              3     #    U  H>  n[        U[        R                  [        R                  [        R                  45      v   M@     g 7fr6   )rf   rg   rh   r   	Generatorr  s     r=   r	  r
  :  s4      %3 'q5<<u*WXX%3s   AAznon-Tensor inputsT)>r   r   r   r   
cache_pathopenreadr   r   r   r   r   r   r   r   r   r   r   r   rI   r   r   r   allocated_constant_namer   r   r   r   r   r   r   r   r   r   r   r   r
   r   r   anyr    r    cudagraph_support_input_mutationtorch._inductor.cudagraph_utilsr  allr   r}   argsrf   rg   fxnodeNodestack_tracer   r   graphr   r   wrap_inductor_compiled_regionsr  )"r;   r   r  r   r   r   r   r   r   rH   r   r   r   r   r   r   r   r   fr   r   r   complex_memory_overlap_inputsr  has_mutation_strhas_mutationcudagraph_testsoutputargr   bsr   r   s"                                     r=   r   CompiledFxGraph.__init__  s   ( !1"4 "- 44 	"
 e&&'1#$668  ("4,H)/N,4X1"00&u'9'9:%e&7&78()=)=>",U-E-E"F !$$"__DN&(D#DN&(D#--/1"1%%161N1Nq1QD++A.()DNN1%	 0 $)#<#< ).)H)H&,*D',,"#22!..T...75d6U6U5VW Z():;q@;!!*-03 4+4 1- }}EE
 L // 33-	 % $44#?L#:J7 $(L &%'78668PQ %3  ,
# %R6;;'1,,,  &{{1~ - )338J8J(K(KS__QUU-    9H)Q1q!&)Q$%9"((%CD!4 ,0F" -."   '-&K&K#k ('D  *Rs   P	APP %P 	
Pc                8    U R                   b  U R                   ?g g r6   )r   
partitionsrC   s    r=   __del__CompiledFxGraph.__del__Y  s!    "". ''2 /rA   c                   U R                   c   e[        R                  R                  R                  (       a  [        R                  R                  R
                  b  U R                  R                  S5      nUbX  [        R                  R                  R                  b3  [        R                  R                  R                  R                  U5      OS n[        R                  R                  R
                  R                  SU05         [        R                  R                  R                  (       a  [        R                  R                  R                  SU R                    S3SS0S9   U R                  U5      sS S S 5        [#        5       R%                  5         [&        R(                  " 5         $ U R                  U5      [#        5       R%                  5         [&        R(                  " 5         $ ! , (       d  f       O= f [#        5       R%                  5         [&        R(                  " 5         g ! [#        5       R%                  5         [&        R(                  " 5         f = f)Ngraph_id
compile_idz## Call CompiledFxGraph z ##scope
user_scope)keyword_values)r   rg   r   debugRECORD_GRAPH_EXECUTIONGRAPH_EXECUTION_ORDERr   getGRAPH_COMPILE_IDSr   autogradprofiler_is_profiler_enabled_C	_profiler_RecordFunctionFastr2   r   finishr!   end_compile)r;   r<   r,  r-  s       r=   r>   CompiledFxGraph.__call__b  s   $$000 OO!!88%%;;G~~))*5H 'OO));;G %%77;;HE 	  OO!!77>> *
	/~~&&;;XX'';;.t/G/G.HL$+\#: <   008	  ()002 ,,. ,,V4')002 ,,.   ()002 ,,. ()002 ,,.s+   A!H3 -G3>	H3 5H3 3
H=H3 3/I"c                t  ^^ [         R                  (       a  [        R                  (       a  U R                  c   eU R
                  c   e[        U R
                  R                  5      n[        U5       Vs/ s H  n[        XE5      PM     nn[        R                  mU Vs/ s H  nU4U4S jjPM     nnU R	                  U5        g[        X5        US   c   eUS   c   eUS   n	US   n
U
(       a  U R                  (       aT  SU R                  ;   a  [        SU R                   35        O[        S   S==   S	-  ss'   [        R                   " U
5        OyU	(       a  S
U;   d   eUS
   nOUR#                  S
S5      n[         R                  (       a  [%        UU U
UR'                  U 5      U5        O[)        UU U
UR'                  U 5      U5        U R*                  n[-        U
U UU R.                  5        U R0                  (       a'  U R2                  b  U R2                  mU4S jnXl        gggs  snf s  snf )a_  
Run a set of post processing steps after loading from the cache. These involve:
 - Setting the tracing context output strides
 - Running cudagraphs if enabled
 - Realigning inputs

This runs whether or not we have a cache hit, and always runs directly after we get a CompiledFxGraph.
The results of this function are *not* saved in the cache itself.
Nc                   > T" X5      $ r6   rR   )r  mcustomized_wrappers     r=   <lambda>.CompiledFxGraph.post_compile.<locals>.<lambda>  s    &8&>rA   r   r   r   r   r  r  r   r   c                J   > [        5       (       a  [        TU 5      $ T" U 5      $ r6   )r   r   )r<   original_callables    r=   wrapped_callable6CompiledFxGraph.post_compile.<locals>.wrapped_callable  s%    ,..12CVLL,V44rA   )r    graph_partitionr   wrapperr   r   r}   r(  ri   r   r   r   r   r   r
   r   r   r4  r   r   r   r   r   r   r  r   )r;   rH   rI   rJ   num_partitionsrn   wrapper_metadatasmetadata!customized_wrappers_with_metadatar   r   r   r   rG  rB  rF  s                 @@r=   rK   CompiledFxGraph.post_compile  sW    !!&L&T&T--999**666 !8!8!C!CDN ~.!.A );.  ! "H!O!O !21 1H %>> 1 . 1 &&'HI*>@L)555M*666"=1 ,\ :
 ..T...75d6U6U5VW Z():;q@;!!*-7<GGG1=42. 2>1A1A4d2. )) 5&"!((.2 +&"!((.2 .. 	##		
 &&4+@+@+L $ 5 55 %5! ,M&Q!
1s   7H0"H5c                    Xl         g r6   )r   rN   s     r=   rP   !CompiledFxGraph.set_triton_bundle  s    +rA   c                .    S U l         S U l        S U l        g r6   )r   r   r   rC   s    r=   rD   )CompiledFxGraph.prepare_for_serialization  s    
 !%%)""&rA   c                    SSK Jn  SSKJnJn  U" U R
                  S5      S   nU R                  n[        R                  R                  U5      (       d  US   S==   S-  ss'   U" XES	S
9  U$ )Nr   )r
   )get_pathwrite_atomicpy   r  fxgraph_lookup_write_filer   T)	make_dirs)
torch._dynamo.utilsr
   torch._inductor.codecacherU  rV  r   r   ospathexists)r;   r
   rU  rV  artifact_pathcodes         r=   write_to_diskCompiledFxGraph.write_to_disk  sc    0D !6q9ww~~m,,Z !<=B==rA   c           	        SSK Jn  SSKJn  U R	                  5       n U" SSS9   UR                  U R                  UU R                  UR                  U 5      5      nUR                  U l
        [        USS 5      U l        [        USS 5      U l        S S S 5        U$ ! , (       d  f       U$ = f! [         a    [        R!                  S	U5        e f = f)
Nr   )dynamo_timed)PyCodeCachezPyCodeCache.load_by_key_pathT)log_pt2_compile_eventr   runnerzFailed to load artifact: %s)r[  re  r\  rf  rb  load_by_key_pathr   r   r   callr   r   r   r   OSErrorlogerror)r;   rI   re  rf  r`  
code_caches         r=   after_deserialization%CompiledFxGraph.after_deserialization  s    49**,	.&* )99NN!&&$$T*	
 )3%-4 7.* +2*h*M'& ' & 	  	II3]C	s)   B. A-BB. 
B+&B. +B. ."C)r   r   r  r   r   r   rI   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )NNN)$r   r   r  r&   r   r   r   z*list[Optional[tuple[_StrideExprStr, ...]]]r   r1   r   r   r   r   r   r   rH   rX   r   r   r   r)   r   r   r   r   r   r   r   r   r   r1   r   r1   rT   rV   rU   rS   rW   rZ   )rT   r   )rI   rY   rT   r   )r[   r\   r]   r^   r   ra   r_   r`   r   r   r   r   r   r  r   r)  r>   rK   rP   rD   rb  ro  rb   rR   rA   r=   r   r     s<   
 3277%%N"((e4K4)//U;;(3(9(9) #  32!!  ##''00&&LL..HH --//   %22*771155"""&K&-1N*1#(D($ -19=>B%RL6RL RL !	RL
 CRL %2RL 4RL %RL RL ,RL )RL $RL 'RL  RL '*RL  *!RL" *7#RL$ /<%RL& 
'RLh3/Be5+e5 ,e5 '	e5
 
e5N,'rA   c                      \ rS rSr% SrS\S'   S\S'   SrS\S	'   \R                  " \	S
9r
S\S'   S rSS jrSS jrS r        SS jrSS jrSrg)CompiledAOTIi   z+
Class holding an AOTInductor compiled so.
z;Union[str, list[Union[str, Weights]], torch.fx.GraphModule]filenamer   device_typeNr   r   )default_factoryzdict[str, bytes]_cached_filesc                   [         R                  R                  (       d  g [        R                  R
                  R                  (       d)  [        R                  R
                  R                  (       a  g [         R                  R                  S:X  a  g [         R                  R                  (       a  g [         R                  (       d  g [        U R                  [        5      (       a  [        S U R                   5       5      nOU R                  n[        U[        R                  R                   5      (       a  Xl        g U R$                  R'                  S5      (       aB  [        R(                  R*                  R-                  USU R$                  SS5      R.                  nO]U R$                  S:X  a5  [        R(                  R*                  R1                  US5      R.                  nO[3        SU R$                   35      eXl        SU l        U R6                   H\  n[8        R:                  R=                  U5      (       a  M)  [?        US	5       nURA                  U R6                  U   5        S S S 5        M^     g ! , (       d  f       Mp  = f)
Nwindowsc              3     #    U  H6  n[        U[        5      (       d  M  UR                  S 5      (       d  M2  Uv   M8     g7f)z.soN)rf   r   endswith)r  fns     r=   r	  -CompiledAOTI.__post_init__.<locals>.<genexpr>?  s,      $*rjS.AbkkRWFX]s   A A 	A r   r    Tcpuzunsupported device type wb)!r    aot_inductorlink_libtorchrg   r   cpp_builder	_IS_MACOS_IS_WINDOWScross_target_platformpackage_cpp_onlyenable_autograd_for_aotrf   rs  r|   r   r  GraphModuler   rt  
startswithr9  _aotiAOTIModelContainerRunnerCudarunAOTIModelContainerRunnerCpuRuntimeErrorr   rv  r]  r^  r_  r  write)r;   r   filer  s       r=   __post_init__CompiledAOTI.__post_init__+  s   ""00 OO''11**6644	A//--dmmT**# $!]]$    $}}&(<(<==$4!&&v..;;$$$ #  &::$a#  !9$:J:J9KLMM 0&&D77>>$''$%GGD..t45 &% '%%s   I::
J		c                T    U R                   c  [        S5      eU R                  U5      $ )Nz%AOTInductor compiled so is not loaded)r   r  r:   s     r=   r>   CompiledAOTI.__call__c  s+      (FGG$$V,,rA   c                n   S U l         0 U l        / n[        U R                  [        5      (       a  U R                  nO,[        U R                  [
        5      (       a  U R                  /nU H5  n[        US5       nUR                  5       U R                  U'   S S S 5        M7     g ! , (       d  f       MI  = f)Nrb)r   rv  rf   rs  r|   r   r  r  )r;   	filenamesr   r  s       r=   rD   &CompiledAOTI.prepare_for_serializationh  s     $!	dmmT**Is++IDdD!Q+,668""4( "! !!s   :B%%
B4	c                D    U R                   R                  5       nS US'   U$ )Nr   )__dict__copy)r;   states     r=   __getstate__CompiledAOTI.__getstate__t  s$    ""$$( !rA   c                @    U R                   c  U R                  5         g g r6   )r   r  rG   s       r=   rK   CompiledAOTI.post_compiley  s!       (  )rA   c                    g r6   rR   rN   s     r=   rP   CompiledAOTI.set_triton_bundle      rA   )r   rv  r   rS   rU   rW   rZ   )r[   r\   r]   r^   r   ra   r   r_   r`   dictrv  r  r>   rD   r  rK   rP   rb   rR   rA   r=   rr  rr     s}     JI5929&1&7&7&MM#M66p-

4
!+! ,! '	!
 
!rA   rr  c                  \    \ rS rSr% SrS\S'   S
S jr        SS jrSS jrSS jr	S	r
g)MockFXGraphCacheOutputi  Nr   r   c                    SU l         g )NTr   rC   s    r=   r  $MockFXGraphCacheOutput.__post_init__  s
    rA   c                    g r6   rR   rG   s       r=   rK   #MockFXGraphCacheOutput.post_compile  s     	rA   c                $    U R                  U5      $ r6   r   r:   s     r=   r>   MockFXGraphCacheOutput.__call__  s    wwvrA   c                    g r6   rR   rN   s     r=   rP   (MockFXGraphCacheOutput.set_triton_bundle  r  rA   r  rU   rW   rS   rZ   )r[   r\   r]   r^   r   ra   r  rK   r>   rP   rb   rR   rA   r=   r  r    sE    BN + , '	
 
rA   r  c                     ^  \ rS rSr% Sr\R                  " SSS9rS\S'   \R                  " SSS9r	S\S	'   SU 4S
 jjr
SS jr\S 5       r  SS jr        SS jrSS jrSS jrSrU =r$ )RegionalOutputCodei  a  
OutputCode for regional inductor compilation results.

Regional inductor returns a torch.fx.GraphModule that contains both
compiled regions (via standalone_compile) and eager regions. This needs
special serialization using GraphPickler instead of standard pickle.

The serialization strategy stores the GraphModule as bytes using
GraphPickler.dumps(), which handles FakeTensors, AOTCompiledArtifacts,
and other special objects that standard pickle cannot handle.
NFr.   zOptional[bytes]_serialized_graph_modulezOptional[torch.nn.Module]_graph_modulec                
  > [         TU ]  5         Xl        SU l        / U l        SU l        U R                  5       u  p#[        UR                  R                  [        R                  R                  R                  5      U l        g)zP
Args:
    graph_module: The torch.fx.GraphModule returned by regional_inductor
NT)superr   r  r  _serialized_wrappersr   _unwrap_graph_modulerf   r  _codegenrg   r  _BoxedCodeGen_inner_boxed_call)r;   graph_moduler   module	__class__s       r=   r   RegionalOutputCode.__init__  sh    
 	)(,%$&!--/	!+LL!!588>>#?#?"
rA   c                    U R                   c  [        S5      eU R                  (       a  U R                  U5      $ U R                   " U6 $ )z$Execute the regional compiled graph.zURegionalOutputCode has no graph module loaded. Did you forget to call post_compile()?)r  r  r  r:   s     r=   r>   RegionalOutputCode.__call__  sM    %9 
 !!%%f--!!6**rA   c                >    U R                  5       u  pUR                  $ r6   )r  r  )r;   r   r  s      r=   r  RegionalOutputCode.graph  s    --/	||rA   c                   U R                   n/ n[        U[        R                  R                  5      (       a  UR
                  n[        U[        R                  R                  R                  5      (       d   eUR                  [        R                  R                  UR                  UR                  S.45        UR                  n[        U[        R                  R                  5      (       d   eX!4$ )N)reasonwrapping)r  rf   rg   _dynamoOptimizedModule
dynamo_ctx
eval_frameDisableContextr   r   msgr  	_orig_modr  r  )r;   r  serialized_wrappersr  s       r=   r  'RegionalOutputCode._unwrap_graph_module  s     ## femm;;<<**Jj%--*B*B*Q*QRRRR&&MM)))~~:;N;NO %%F&%(("6"67777"**rA   c                   U R                   b  gU R                  c   eSSKJn  U" U5      nUc  [	        S5      eSSKJn  UR                  U R                  U5      n[        U[        R                  R                  5      (       d   eUR                  5         [        U R                  5       H  u  pU" U40 U	D6nM     Xpl         g)z
Post-compile processing for regional inductor.

This deserializes the GraphModule from bytes using GraphPickler,
extracting the fake_mode from example_inputs.
Nr   )detect_fake_modeziCould not detect fake mode from example inputs. Regional inductor requires fake mode for deserialization.)GraphPickler)r  r  torch._guardsr  r  torch.fx._graph_picklerr  loadsrf   rg   r  r  	recompilereversedr  )
r;   rH   rI   rJ   r  	fake_moder  r   r{  kwargss
             r=   rK   RegionalOutputCode.post_compile  s     ),,8882$^4	L  	9 = =yI"ehh223333
"4#<#<=JBB!&!B >rA   c                    g)z6Regional inductor doesn't use triton bundles directly.NrR   rN   s     r=   rP   $RegionalOutputCode.set_triton_bundle	  s    rA   c                    U R                   bI  SSKJnJn  SU l        U R                  5       u  U l        nUR                  UU" SSS9S9U l        SU l         gg)z
Prepare for serialization by converting the GraphModule to bytes.

This uses GraphPickler to serialize the graph module since it contains
special objects like FakeTensors and AOTCompiledArtifacts that need
custom pickling.
Nr   )r  Options)source_fn_stacknn_module_stackfwd_source_fn_stack)
ops_filterignore_metadata_fields)options)r  r  r  r  r  r  r  dumps)r;   r  r  r  s       r=   rD   ,RegionalOutputCode.prepare_for_serialization  sk     )E,0D)6:6O6O6Q3D%|,8,>,>#, -? 
-D) "&D% *rA   )r   r  r  r  r  )r  ztorch.nn.ModulerS   )rT   zBtuple[list[tuple[Callable, dict[str, Any]]], torch.fx.GraphModule]rW   rZ   rU   )r[   r\   r]   r^   r   r_   r`   r  ra   r  r   r>   propertyr  r  rK   rP   rD   rb   __classcell__)r  s   @r=   r  r    s    
 1<0A0A51o 
 0;/@/@50M, 

+  +	K+$! +!  ,!  '	! 
 
! FE& &rA   r  )rm   torch.TensorrT   	list[int])rm   r  rt   r  rT   r  )rm   r  rT   r  )r   r   r   Optional[BoxedDeviceIndex]rT   rV   )r   r   rH   rX   r   r  rT   rV   )rH   rX   r   r   r   r   rI   r   r   r  rT   rV   )
r   r   r   r   r   r   r   r   rT   rV   )Vr   
__future__r   r_   loggingr]  	functoolsr   typingr   r   r   r   r	   rg   r[  r
   r   torch._higher_order_ops.wrapr   r  r   r   r   r   r   r   torch._inductor.freezing_utilsr   r   torch._inductor.utilsr   r   r   r   r   r   r   r   torch.utils._ordered_setr   torch.utils._python_dispatchr   r}  r    runtime.autotune_cacher!   collectionsr"   collections.abcr#   r$   torch._inductorr%   torch._inductor.graphr&   "torch._library.fake_class_registryr'   )torch.export.pt2_archive._package_weightsr(   r   r)   triton_bundlerr*   	getLoggerr[   rl  	dataclassr,   r   rc   ra   ro   rv   r   r   r   r   r   r   rY   r   r   rr  r  r  rR   rA   r=   <module>r     sq  , #   	  A A  E ?  N	 	 	 0 B  8 #2'3CA,,! . . .>  	 M,<#< :< 
<8O#O'O !;O 
	O"@'@#@ @ 4	@
 !;@ 
@FM;'M;#M; M; 4	M;
 !;M; 
M;`;;#; #; )	;
 
;*> >.M%= M* Fj F FR b: b bJ Z  * I& I& I&rA   