
    ȅiR(                      % S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKJrJrJrJrJrJrJ r   S SK!J!r!  S SKJ"r"  S SK	J#r#  S SK$J%r%J&r&J'r'J(r(J)r)J*r*J+r+J,r,J-r-J.r.J/r/J0r0J1r1  S SK2J3r3J4r4J5r5  S S	KJ6r6  S SK7r7S SK8r8S SK9J:s  J;r<  S S
K=J>r>  S SK?J@r@  S SKAJBrB  S SKCJDrD  S SKEJFrF  S SK9JGrGJHrH  SS/rIS SKJJKrKJLrLJMrMJNrN  \-(       ah  S SKJOrOJPrPJQrQ  S SKRJSrS  S SK8JTrTJUrUJVrV  S SKWJXrX  S SKYJZrZ  S SK[J\r\  S SK]J^r^  SSK_J`r`  SSKaJbrb  SSKcJdrd  SSKeJfrf  SSKgJhrhJiriJjrjJkrkJlrlJmrm  SS KnJoro  SS!KpJqrqJrrr  / S"Qrs\0" S#5      rt\R                  GS=S$ j5       rvS S%KwJxrx  S S&KyJzrz  S S'K{J|r|  S S(K}J~r~  S S)KJr  S S*KJr  S S+KJrJrJrJrJr  S S,KJrJr  S S-KJrJr  SS.KJr  SS/KJr  \R                  S0:H  r\GR*                  " \5      r\0" S15      r\\7GR4                  \7GR4                  4   r\+\1\8GR8                  \\8R                  4      r\(       a  S2O\GR>                  " S3S45      rS5S6\ 3S7.rS8rS8rS8rS9rS:r\\S-
  -  S :X  a  \S;:  d   S<5       eGS>S= jrGS?S> jr " S? S@\7GRR                  5      r\GRV                  " SASB9 " SC SD5      5       rGS@GSASE jjr   GSB         GSCSG jjr   GSB         GSCSH jjr\R                  GSDSI j5       rGSESJ jrGSFSK jrGSGSL jrGSHSM jr      GSISN jrGSJSO jr    GSKSP jrGSLSQ jrGSMSR jr    GSNSS jrGSOST jrSU 4     GSPSV jjr        GSQSX jrGSRGSSSY jjr  GST         GSUSZ jjr     GSV             GSWS[ jjrGSXS\ jrGSYS] jrGSZS^ jrGS[S_ jrGS\S` jr\4" Sa5      r\0" SbSASc9r\\'\%\4   \4   r " Sd Se\,\(\\4   5      rGS]Sf jr    GS]Sg jr    GS^Sh jr    GS_Si jr      GS`Sj jr      GSaSk jr GSb     GScSl jjr      GSdSm jrGSeSn jrGSfSo jrGSgSp jrGShSq jrGSiSr jrGSjSs jrGSkSt jrGSlSu jrGSmSv jr\" / SwQ5      r    GSnSx jrGSoSy jrGSpSz jrS SKrGSqS{ jr/ rSW\S|'   GSrS} jrGSqS~ jr\GR                     GSs       GStS jj5       r\r\r\rSFS.GSuS jjrSFS.       GSvS jjr\RD                  " S;5      GSwS j5       r " S S\*5      r\GRV                   " S S5      5       r " S S5      r " S S\5      r\GR                  GSxS j5       r " S S5      r " S S\5      r\R                  GSyGSzS jj5       r\RD                  GS{S j5       r\RD                  GSDS j5       rGS{S jr GSb       GS|S jjr      GS}S jrGS~S jrGS~S jrSFSFSAS.         GSS jjrSSFS.       GSS jjrSFS.       GSS jjrSFS.       GSS jjr        GSS jGr \RD                  " SS9GSDS j5       Gr\RD                  " SS9GSDS j5       Gr\RD                  " SS9GSDS j5       Gr                  GSS jGrGSS jGr              GSS jGrGSS jGr\1\\7GR4                  4   GrS\S'   \R                   GS         GSS jj5       Gr	\R                  GSS j5       Gr
\R                  GSS j5       Gr\R                  GSS j5       Gr\R                  GSS j5       GrGSS jGrGSS jGrGSS jGrGSS jGrGSS jGr        GSS jGr    GS               GSS jjGrGSDS jGr " S S5      Gr        GSS jGr        GSS jGrGSS jGrGSS jGrGSS jGr        GSS jGr        GSS jGr\GR                        GSS j5       Gr GSb     GSS jjGrGSS jGr GSS jGr!GSS jGr"GSS jGr#GSS jGr$GSS jGr%\GR                  GSS j5       Gr&GS{S jGr'\R                  GS{S j5       Gr(\R                  GSS j5       Gr)\R                  GS{S j5       Gr*GS{S jGr+GS{S jGr,GSS jGr-GSS jGr.GSDS jGr/GSDS jGr0GSS jGr1GSmS jGr2 " S S\GRf                  5      Gr4          GSS jGr5GSS jGr6    GSS jGr7 GSb     GSS jjGr8GSS jGr9 GSb     GSS jjGr:GSS jGr;      GSS jGr<        GSS jGr=S 4           GSS jjGr>S 4           GSS jjGr?GSS jGr@GSS jGrA\GRV                   " S S5      5       GrB\GR                  GSS j5       GrCGSS jGrDGSS jGrEGSS jGrFGSS jGrG              GSS jGrHGSS jGrIGSS jGrJGSS jGrKGSS jGrL        GSS jGrMGSS jGrN        GSS jGrOGSS jGrP GSb       GSS jjGrQ      GSS jGrRGSS jGrS      GSS jGrTGSDS jGrUGSS jGrVSSSGS GSGSGSGS.GrWG\WGR                  5        V Vs0 s H  u  pX_M	     snn GrY\GR                  " GS5      Gr[GSGS jGr\GSGS jGr]GSGS jGr^GSGS jGr_\R                  GSGS	 j5       Gr`\GRV                   " GS
 GS5      5       Gra0 GrbGS\GS'           GSGS jGrc\F" 5       GrdGS\GS'   GSGS jGreGSbGSGS jjGrfGSGS jGrg\0" GS5      Grh\0" GS5      Gri " GS GS\G\hG\i4   5      Grj\3" SAGS9GSbSASB.GSGS jjj5       GrkGSGS jGrl " GS GS\GRf                  5      Grm\R                  GSGS j5       GrnGSDGS jGroGSGS jGrpGSGS  jGrqGSGS! jGrrGS=GS" jGrsGSGS# jGrtGSDGS$ jGruGSGS% jGrvGS&GrwGSGS' jGrxGSGS( jGryGSGS) jGrz  GS         GSGS* jjGr{GSGS+ jGr|GSGS, jGr}GSDGS- jGr~GSGS. jGrGSGS/ jGr\GRV                  " SASB9 " GS0 GS15      5       Gr\GS2\%4   Gr\G\G\/G\4   Gr " GS3 GS45      GrG\" 5       GrGSGS5 jGrGSGS6 jGrGSGS7 jGrGSGS8 jGrGSGS9 jGr\F" / GS:Q5      GrGSGS; jGr\"GSGS< j5       Grgs  snn f (      )annotationsN)Callable
Collection	GeneratorIteratorMappingMutableMapping
MutableSet)datetime)	lru_cache)StringIO)AnycastConcatenateGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKING	TypeAlias	TypeGuardTypeVarUnion)dataclass_transform	ParamSpecSelf)mock)datasheet_tops)DeviceProperties)_needs_inductor_compile)dtype_abbrs)
OrderedSet)tree_flattentree_map_only!activation_quantization_aten_passinductor_autotune_lookup_table)free_symbolsfree_unbacked_symbolsIterateExprsShapeEnv)IterableSequence
ValuesView)Path)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)Node)ScalingType   )WorkspaceArgPythonWrapperCodegen)DepGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpumtiaTc                     [          V s/ s H*  n [        [        U 5      R                  5       (       d  M(  U PM,     nn [	        U5      S::  d   e[	        U5      S:X  a  SnU$ UR                  5       nU$ s  sn f )Nr7   r   rH   )	GPU_TYPESgetattrtorchis_availablelenpop)x
avail_gpusgpu_types      O/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/utils.pyget_gpu_typerX   j   sh    &KY'%*;*H*H*J!YJKz?aZA-vHO 4>>>3CHO Ls
   'A2A2)get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesconfig)ceildivwin32_TspvTORCHINDUCTOR_XPU_KERNEL_FORMATzebinz.cubin.)rH   rJ         @      zmust be power of 2c                *    U [         -   S-
  [         * -  $ )z/Round up to the nearest multiple of ALIGN_BYTESr7   )ALIGN_BYTES)nbytess    rW   _alignrx      s    [ 1$44    c                   [        U [        R                  [        R                  45      (       a#  [	        [        [        U R                  5      5      $ [        U [        5      =(       d"    [        R                  " U [        5      [        :H  $ )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdrv   )vs    rW   r   r      sT    !eii+,,3{AFF+,,aK599Q#<#KKry   c                  4    \ rS rSrSrSrSr\SS j5       rSr	g)	r      z<Symbolically round up to the nearest multiple of ALIGN_BYTESr7   Tc                    [        U[        [        R                  45      (       a  [	        [        U5      5      $ [        U5      (       a  U$ g N)r{   intr|   Integerrx   r   )clsvalues     rW   eval
align.eval   s<    ec5==122#e*%%uL ry    N)r   
sympy.ExprreturnzOptional[sympy.Expr])
__name__
__module____qualname____firstlineno____doc__nargs
is_integerclassmethodr   __static_attributes__r   ry   rW   r   r      s!    FEJ ry   r   T)frozenc                  B    \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   S
rg)GraphPartitionMap   zH
Mapping from the partition info (e.g., input/output) to the graph info
r   idzlist[Optional[int]]input_index_mappingoutput_index_mapping	list[str]constant_namesr   Nr   r   r   r   r   __annotations__r   r   ry   rW   r   r      s$    
 	G -,-- ry   r   c           
        U " 5         [         R                  R                  5         [         R                  " [	        S5      [         R
                  SS9n[         R                  R                  SS9n[         R                  R                  SS9nUR                  5         [        S5       H  nUR                  5         U " 5         M     UR                  5         [         R                  R                  5         UR                  U5      S-  n[        S[	        X-  5      5      n[        S[	        X'-  5      5      n	[        U5       H
  nU " 5         M     [        U	5       Vs/ s H   n[         R                  R                  SS9PM"     nn[        U	5       Vs/ s H   n[         R                  R                  SS9PM"     nn[         R                  R                  [         R                  R                  R                  /S9 n
[         R                  R                  5         [        U	5       Hp  nUR                  5         XK   R                  5         [         R                  R                   R                  S	5         U " 5         S
S
S
5        X[   R                  5         Mr     [         R                  R                  5         [         R"                  " [%        XE5       VVs/ s H  u  pUR                  U5      PM     snn5      nS
S
S
5        [         R&                  " W5      R)                  5       n[*        R-                  S5        [*        R-                  W
R/                  5       R1                  SSS95        [3        U
R5                  5        Vs/ s HI  nUR6                  [8        R                  :X  d  M#  [:        R<                  " SUR>                  5      c  MG  UPMK     sn5      nU(       a#  U[@        R&                  " S U 5       5      S-  -  n[*        R-                  SU5        U$ s  snf s  snf ! , (       d  f       GN= fs  snnf ! , (       d  f       GNK= fs  snf ):  
Returns benchmark results by examining torch profiler events.
This could be more accurate as it doesn't count CPU side overhead.
However, this also requires manually excluding irrelevant event, e.g.
vectorized_elementwise_kernel which is used to fill L2 cache,
various CUDA events, etc, so could also be fragile.
    ArH   dtypedeviceTenable_timing   r7   
activitiesRunCudaModuleN
raw eventsself_device_time_totalsort_by	row_limitzfused_abs_max_\dc              3  8   #    U  H  oR                   v   M     g 7fr   device_time_total.0events     rW   	<genexpr>fp8_bench.<locals>.<genexpr>  s     Q33        @@profiling results: %s ms)!rP   rH   synchronizeemptyr   float16Eventrecordrangezero_elapsed_timemaxprofilerprofileProfilerActivityCUDAnvtxtensorzipmeanitemlogdebugkey_averagestabler\   eventsdevice_typer[   rematchname
statistics)fnwarmuprepcachestart_event	end_event_estimate_msn_warmupn_repeatpisetimesresr   filtered_eventss                     rW   	fp8_benchr      s>    D	JJKKJu}}VLE **"""6K

  t 4I1X
  	JJ**959K 1c&./0H1c#+,-H 8_
  BGxQA5::##$#7KQ?DXO!!!!5IO			NN++00
 
  
 


 xAKKMN!!#&&7 8L! ! 	

 +.{+FG+F41Q^^A+FG

" **U

 
 
"CIIlIIann$$-EQS$TU 	
#!!Z__4  HH0%**=	 #	
	O OOQQQ	

 II(#.JO RO 87
 H
 
*	
sP   'P'P!;A8P>3P&;AP>P82P>"Q) QQ&
P50P>>
QFc                4    SSK Jn  U" [        5      " XX#5      $ )Nr   )may_distort_benchmarking_result)$torch._inductor.runtime.benchmarkingr   _do_bench_using_profiling)r   r   r   is_vetted_benchmarkingr   s        rW   do_bench_using_profilingr     s     " U*+DE
C ry   c                $   U(       d  SSK Jn  U" 5         [        5       n[        U5      nU " 5         UR	                  5         [
        R                  " [        S5      [
        R                  US9nUR                  SS9nUR                  SS9n	UR                  5         [        S5       H  n
UR                  5         U " 5         M     U	R                  5         UR	                  5         UR                  U	5      S-  n[        S[        X-  5      5      n[        S[        X+-  5      5      n[        U5       H
  n
U " 5         M     UR	                  5         [
        R                  R                  [!        [
        R                  R"                  UR%                  5       5      /S	9 n[        U5       H  n
UR                  5         U " 5         M     UR	                  5         S
S
S
5        [&        R)                  S5        [&        R)                  WR+                  5       R-                  SSS95        [/        UR1                  5        Vs/ s H7  nUR2                  [4        R6                  :X  d  M#  UR8                  S:w  d  M5  UPM9     sn5      n[;        U5      U-  S:w  a  [=        SU[;        U5      U5      e[;        U5      U-  n[/        [?        U5       VVs/ s H  u  nnUU-  S:w  d  M  UPM     snn5      nURA                  5         UR+                  5       n[&        R)                  S5        [&        R)                  UR-                  SS95        [C        S U 5       5      S-  U-  n[&        R)                  SU5        U$ ! , (       d  f       GN= fs  snf s  snnf )r   r   )may_ban_benchmarkingr   r   Tr   r   r7   r   Nr   r   r   r   zContext SynczWFailed to divide all profiling events into #repeat groups. #%s events: %d, #repeats: %szprofiling time breakdown)r   c              3  8   #    U  H  oR                   v   M     g 7fr   r   r   s     rW   r   ,_do_bench_using_profiling.<locals>.<genexpr>  s     A=%%%=r   r   r   )"r   r   rX   rY   r   rP   r   r   r   r   r   r   r   r   r   r   rO   r   upperr   r   r   r   r\   r   r   r[   r   r   rR   RuntimeError	enumerate_build_treesum)r   r   r   r   r   r   device_interfacer   r   r   r   r   r   r   r   r   r   num_event_per_groupr   actual_eventsr   s                        rW   r   r   -  s    "M.K/<D  "KKJuyyME #((t(<K &&T&:I1X
    "**959K 1c&./0H1c#+,-H 8_
    "			ENN33[5F5F5HI
 
  
 
xAKKMD	 ! 	$$&
 IIlIIann$$-EQS$TU 	
#  JOO3 8=

n8T #	
O ?h&!++ 
 	
 o.9 &o6	
65&&!+ 6	
M !..0MII()IIm!!B!/0
A=A
AF
JX
UCII(#.Ja
 
$	
 	
s*   :M5="N#N5NN
 N
5
Nc                     SSK Jn   [        R                  R	                  SS5        U S L=(       a%    [        [        [        R                  SS 5      S5      $ ! [         a     g[         a  nS[        U5      ;   d   e S nAgS nAff = f)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr  Fztorchvision::nms does not exist)torchvision.opsr  rP   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrO   opsImportErrorr   str)r  r   s     rW   has_torchvision_roi_alignr    s|    -667I6R$ 
EII}d3[*
 	
   0CF:::s   AA 
B$	B-BBc                t   U c   [         R                  " S5      R                  $ [        U [        5      (       a  [         R                  " U 5      n U R
                  S;  aY  U R                  cL  [        U R
                  5      n[         R                  " U R
                  UR                  R                  5       S9$ U $ )Ng        )cpumeta)index)
rP   r   r   r{   r  typer  rY   Workercurrent_devicer   r  s     rW   decode_devicer    s    ~||C '''&#f%{{/)fll.B3FKK@||FKK/?/F/F/U/U/WXXMry   c                ~    [         R                  " [        R                  U [        R
                  R                  5      $ r   )	functoolsreduceoperatormulr|   SOne)its    rW   sympy_productr"    s#    HLL"eggkk::ry   c           	         [        U 5      [        U5      :X  d   e[        R                  " [        S [	        X5       5       5      5      $ )Nc              3  .   #    U  H  u  pX-  v   M     g 7fr   r   )r   abs      rW   r   sympy_dot.<locals>.<genexpr>  s     >odaAEos   )rR   r|   expandr  r   )seq1seq2s     rW   	sympy_dotr+    s6    t9D	!!!<<>c$o>>??ry   c                b    U  Vs0 s H  n[        U5      U_M     snR                  5       $ s  snf r   )r   values)r!  rT   s     rW   uniquer.    s+     !bBqE1Hb!((**!s   ,c           
        [        U [        R                  5      (       d  [        U[        R                  5      (       a4  [        [        R                  " U 5      [        R                  " U5      5      $ [        U [
        5      (       a  [        U[
        5      (       d$   U  S[        U 5       SU S[        U5       35       e[        X5      $ )Nz: , )r{   r|   Exprr_   sympifyr   r  runtime_ceildiv)numberdenoms     rW   rj   rj     s     &%**%%E5::)F)Fu}}V,emmE.BCC fc""z%'='= ("T&\N"UG2d5k];= 6))ry   c                t   U c  g[        U 5      R                  S5      S   n0 SS_SS_SS	_S
S_SS_SS_SS	_SS_SS_SS_SS_SS_SS_SS_SS_SS _S!S"_SS#S$S%S&.EnUR                  [        UR	                  5       5       Vs0 s H  o3U_M     sn5        [        U [         5      (       a  U $ S'X!    3$ s  snf )(Nz*i8rp   r   booli1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64u16u32u64)uint8uint16uint32uint64*)r  splitupdatelistr-  r{   )key	dtype_strtysr   s       rW   _type_ofrc    sW   
 {Cs#B'Ii 	G 	z	
 	 	 	w 	$ 	D 	6 	F 	6 	6  	!" 	#$ 	%& 	'( /C4 JJd3::<0101012S#&&3@a/?,@@ 2s   B5c                Z    U  Vs/ s H  n[         R                  " U5      PM     sn$ s  snf )z
Gets the shape and stride of a tensor. For non-symbolic tensors, this is
trivial. But for symbolic tensors, we need to map from SymIntNode into
sympy.Expr.
)r|   r2  lstr   s     rW   convert_shape_to_inductorrg    s%     '**cEMM!c***s    (c                p    [        U [        R                  5      (       a  U R                  R                  $ U $ )z
Convert SymInt to sympy.Expr, leave int as is.

Unlike sympy.sympify() which converts int to sympy.Integer,
this function preserves int as int and only converts SymInt to Expr.
)r{   rP   r2   nodeexprvals    rW   convert_symint_to_exprrm    s(     #u||$$xx}}Jry   c                    SSK Jn  [        U [        5      (       a  U $ [        U [        R
                  5      (       a  [        U 5      $ UR                  R                  R                  R                  U SS9$ )zD
Like convert_shape_to_symint, but operates on a single expression.
r7   VN)hint)
virtualizedrp  r{   r   r|   r   graphsizevars	shape_envcreate_symintnode)r   rp  s     rW   convert_to_symintrw     sk      a 	

 !U]]++ F	 !!++==ad=Kry   c                D    U  Vs/ s H  n[        U5      PM     sn$ s  snf )zn
Takes a list of shapes from Inductor and converts them into symints (or just
ints if all shapes are static).
)rw  re  s     rW   convert_shape_to_symintry    s"     +..#Qa #...s   c                N    [        S U R                  R                   5       5      $ )z%
Does this op overload have aliasing
c              3  <   #    U  H  oR                   S Lv   M     g 7fr   )
alias_infor   r%  s     rW   r   is_view.<locals>.<genexpr>  s     F1EA||4'1Es   )any_schema	argumentsops    rW   is_viewr    s     F1E1EFFFry   c                    gNFr   )r   s    rW   <lambda>r  $  s    ry   c                  ^ U R                   S:w  a  g[        U R                  [        R                  R
                  5      (       d  U R                  [        R                  L d  g[        [        R                  R
                  U R                  5      nU[        R                  L d  [        U5      (       a  [        U4S jU R                   5       5      $ [        R                  R                  UR                  ;   =(       d    T" U5      $ )z
Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

Uses in views ops will follow the views uses
call_functionFc              3  <   >#    U  H  n[        UT5      v   M     g 7fr   )is_pointwise_use)r   uis_pointwise_fns     rW   r   #is_pointwise_use.<locals>.<genexpr>5  s     KA#A77s   )r  r{   targetrP   _ops
OpOverloadr  getitemr   r  r   usersTag	pointwisetags)user  r  s    ` rW   r  r  "  s     vv 3::uzz4455xGWGW9W%**''4F!!!WV__KKKK99&++-H1HHry   	list[Any]c           	       ^^ [         R                  R                  5       m/ mSUU4S jjnTR                  " U /[	        [         R
                  X1U45      Q76 n[        U R                  R                  5      S:X  a3  [        U R                  R                  S   R                  5      S:X  a  U4nTR                  U5        [         R                  R                  0 T5      nUT4$ )Nc                `   > TR                  U 5        TR                  S[        T5       35      $ )Narg)appendplaceholderrR   )r  g
graph_argss    rW   add_tensor_arg)gen_gm_and_inputs.<locals>.add_tensor_arg@  s,    #}}s3z?"3455ry   r7   r   Tensor)r  torch.Tensorr   r5   )rP   fxGraphr  r%   r  rR   r  returnsr  r  outputr4   )r  r   kwargsr  ri  gmr  r  s         @@rW   gen_gm_and_inputsr  :  s     	A%'J6 6 ??u||^F^LD 	FNN""#q(&&q)../8;wHHTN			b!	$Bz>ry   c                t    U S:X  a  g [        U 5      nUR                  5       (       a  UR                  5         g g Nr  )rY   rQ   r   r  s     rW   r   r   R  s7    /7$$&&$$& 'ry   c                    [        U5        [        R                  " S5        [        R                  " 5       n[        U5       H  nU " U6 n[        U5        M     [        R                  " 5       nWc   eXt-
  $ )Ni9  )r   rP   manual_seedtimeperf_counterr   )modelexample_inputsr   r   t0r   resultt1s           rW   timedr  Z  sk     	d				B5\'F  
			B7Nry   c                    [         R                  " [        U5       Vs/ s H  n[        XX%5      PM     sn5      n[         R                  " U5      U-  n[        X-  S 5        UR                  5       $ s  snf )Nz.6f)rP   r   r   r  medianprintr   )	r  r  r   repeatbaseliner   r   timingstooks	            rW   print_performancer  l  se     ll>CFmLmue	4mLG << 5(D	T_S!#99;	 	Ms   A3c                F   ^ [        X5      " 5       m[        XU4S j5        g)zKReplace obj.method() with a new method that returns a precomputed constant.c                    > T $ r   r   )r  s   rW   r  #precompute_method.<locals>.<lambda>  s    ry   N)rO   setattr)objmethodr  s     @rW   precompute_methodr  |  s    S!#FC(ry   c                ,    U H  n[        X5        M     g)zFReplace methods with new methods that returns a precomputed constants.N)r  )r  methodsr  s      rW   precompute_methodsr    s    #& ry   c                8    [        X:  5      [        X:  5      -
  $ r   )r   )r%  r&  s     rW   cmpr    s    qu:AE
""ry   c                    [        U [        5      (       a  U /U-  $ [        U 5      S:X  a  [        U 5      " U S   /5      U-  $ U $ )Nr7   r   )r{   r   rR   r  )rT   sizes     rW   pad_listliker    sD    !SsTz
1v{Aw!v%%Hry   c                @    [        U 5      S:X  a  / $ SS jn[        XS9$ )Nr   c                    [        U [        5      (       a  U $ SSKJn  [        X5      (       d   eU R	                  5       $ )Nr7   )rF   )r{   r  	schedulerrF   get_name)elemrF   s     rW   	sort_functuple_sorted.<locals>.sort_func  s4    dC  K0$2222}}ry   r`  )r  rl   r   r  )rR   sorted)rT   r  s     rW   tuple_sortedr    s$    
1v{	 !##ry   PRV)	covariantc                  2    \ rS rSr\SS j5       rSS jrSrg)CachedMethodi  c                    g r   r   )r   s    rW   clear_cacheCachedMethod.clear_cache  s    ),ry   c                    g r   r   selfr   r  s      rW   __call__CachedMethod.__call__  s    ry   r   N)r   r   r   None)r   P.argsr  P.kwargsr   r  )r   r   r   r   staticmethodr  r  r   r   ry   rW   r  r    s    , ,Dry   r  c           	        ^ U R                   nSU S3mSU 0n[        SU ST ST S3R                  5       U5        [        R                  " U 5      " X! S3   5      nS
U4S	 jjnXCl        U$ )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfc                B   > [        U T5      (       a  [        U T5        g g r   r  delattrr  r`  s    rW   r  "cache_on_self.<locals>.clear_cache  s    4D# ry   r  r   r   r  r   execlstripr  wrapsr  )r   r   ctxwrapperr  r`  s        @rW   cache_on_selfr    s    ;;DtfF
C *CF  E "' (+e ,			 FH oob!#n&=">?G &Nry   c                    [        U 5      $ )zU
Variant of cache_on_self for properties. The only difference is the type signature.
)r  )r   s    rW   cache_property_on_selfr    s     ry   c                    ^      SU 4S jjnU$ )Nc           	        >^ ST SU R                    S3mSU 0n[        ST ST ST S3R                  5       U5        [        R                  " U 5      " US	   5      nSU4S
 jjnX2l        U$ )Nr  r   r  r   z            def inner(self: Any, *args: P.args, **kwargs: P.kwargs) -> RV:
                args_kwargs = (args, tuple(sorted(kwargs.items())))

                if not hasattr(self, "z2"):
                    object.__setattr__(self, "z%", {})

                cache = self.z

                try:
                    return cache[args_kwargs]
                except KeyError:
                    pass

                rv = fn(self, *args, **kwargs)

                cache[args_kwargs] = rv
                return rv
            innerc                B   > [        U T5      (       a  [        U T5        g g r   r  r  s    rW   r  <cache_on_self_and_args.<locals>.wrapper.<locals>.clear_cache  s    tS!!c" "ry   r  r  )r   r  r  r  r`  
class_names       @rW   r  'cache_on_self_and_args.<locals>.wrapper  s     :,a}F3 Rj' (+e ,//2e 4!U #$ )	
, #CL1	# (ry   )r   FN_TYPE[P, RV]r   r  r   )r  r  s   ` rW   cache_on_self_and_argsr    s     
$$	$L Nry   c           
        SSK Jn  [        U [        5      (       ay  [        R
                  " [        R                  U  Vs/ s H?  n[        US5      (       d  M  UR                  (       d  M)  UR                  R                  PMA     sn[        5       5      $ [        XR                  5      (       a  U R                  $ [        5       $ s  snf )Nr7   irri  ) r  r{   r_  r  r  r  or_r  ri  originsr#   r?   )node_scheduler  ri  s      rW   aggregate_originsr    s     -&&LL * *D4( "-1YY "		!!) L	
 		
 
M??	3	3$$$|s   C
C
+C
c                `   [        U 5      nUS:X  ag  S nU Vs/ s HA  nUR                  S:X  d  M  SUR                  ;   d  M'  UR                  S   c  M9  U" U5      PMC     nn[        [	        U5      5      nGOUS:X  a  / nU H  nUR                  S:X  d  M  S nSnSUR                  ;   a  UR                  S   S   nO$SUR                  ;   a  UR                  S   S   nS	nU(       d  Mi  [        US
   [        5      (       a  UR                  US
   U-   5        M  UR                  US
   R                  U-   5        M     [        [	        U5      5      nO:US:X  a.  U Vs/ s H   oDR                  S:X  d  M  UR                  PM"     nnO[        eSR                  S/U-   5      $ s  snf s  snf )Noriginal_atenc                .   U R                   S   nSn[        U[        R                  R                  5      (       a  UR
                  R                  nU$ [        U[        R                  R                  5      (       a  [        UR                  5       5      nU$ )Nr  r  )
r  r{   rP   r  r  _overloadpacketr   HigherOrderOperatorr  r   )originr  r`  s      rW   get_origin_meta_str2get_fused_kernel_name.<locals>.get_origin_meta_str$  su    "KK8MC-)>)>??#33<< J M5::+I+IJJ-,,./Jry   r  rP   r  source_fn_stackr   fwd_source_fn_stackbackwardr7   inductor_noder   fused)r  r  r  r  r#   r{   r  r  r   r   NotImplementedErrorjoin)r  descriptive_namesall_originsr  r  sources	source_fnsuffixs           rW   get_fused_kernel_namer!    s    $M2KO+	 &
%yyO+ (  6;;. ( O,	 ('% 	 
 G,-	g	%!FyyO+ 	$3 &,= >r BI*fkk9 &,A B2 FI'F ilC00NN9Q<&#89NN9Q<#8#86#AB "" G,-	o	-&1
&1FYY/5QKFKKk 	 
 "!88WI'((G
<
s"   F&F&F&F&!F+8F+c                  ^^ ^! [        U 5      nU Vs/ s H  o3R                  S:X  d  M  UPM     nn[        R                  " [        5      n[        R                  " [        5      nSm U(       a  [        S U 5       5      n[        U5      S:X  ac  US   R                  m [        T S5      (       d0  [        T R                  5       VV	s0 s H  u  pX_M	     n
nn	U
T l        UR                  U 4S jS9  U GHo  nS	UR                  ;   a  UR                  S	   b  UR                  S	   nSn[        U[        R                   R"                  5      (       a  [%        UR&                  5      nOB[        U[        R                   R(                  5      (       a  [%        UR+                  5       5      nU(       a  Xm   R-                  UR*                  5        S
UR                  ;   a<  UR                  S
   S   R*                  nX]   R-                  UR*                  5        GM&  UR                  R/                  S5      S:X  d  GMH  X[R*                     R-                  UR*                  5        GMr     T b  SOSnUR0                   SU SSR3                  UR5                  5       5       SSR3                  UR5                  5       5       S3nUR0                   S3/n[7        UR9                  5       5       HA  u  nnUR-                  UR0                   SU SSR3                  [7        U5      5       35        MC     T Gb  SSKJm  UR-                  UR0                   S35        [        5       n/ n[        U TR>                  5      (       Gd  SSK J!n        S)U4S jjnS*S jm!S+U!4S jjnU  GH  n	[        U	S5      (       a  U	RD                  c  M$  [        U	RD                  S5      (       a  U	RD                  RF                  b  U	RD                  RF                   H  nUR*                  U;   a  M  URI                  UR*                  5        UR                  RK                  UR*                  5      nUc  MZ  U" UUR*                  5      u  nnUR-                  UR0                   SU S U" U5       S!U S35        M     [        U	RD                  S"5      (       d  GM+  U	RD                  RL                  c  GME  U	RD                  RL                   HW  nUR                  RK                  UR*                  5      nUc  M-  U" UUR*                  5      u  nnUR-                  S#U-   5        MY     GM     U H0  nUR-                  UR0                   SURO                  S$S%9 35        M2     UR-                  UR0                   S&S'R3                  U5       35        US(R3                  U5      4$ s  snf s  sn	nf ),a  
Retrieves metadata information for a kernel.
Args:
    node_schedule (Union[Sequence[BaseSchedulerNode], ExternKernel]):
        Either a sequence of BaseSchedulerNode objects or an ExternKernel instance.
    wrapper (PythonWrapperCodegen):
        An instance of PythonWrapperCodegen, used to define the code comment format.
Returns:
    tuple[str, str]:
        A tuple containing two strings:
            - The first string represents the kernel's metadata.
            - The second string represent the kernel's detailed metadata.
r  Nc              3  8   #    U  H  oR                   v   M     g 7fr   )rs  )r   ns     rW   r   &get_kernel_metadata.<locals>.<genexpr>q  s     "CNq77Nr   r7   r   )_inductor_kernel_metadata_node_to_idx_mapc                "   > TR                   U    $ r   )r&  )r$  single_graphs    rW   r  %get_kernel_metadata.<locals>.<lambda>y  s    lTTUVWry   r  r  	from_nodepartitioner_tagis_backwardzTopologically SortedUnsorted z Source Nodes: [r0  z], Original ATen: []z" Source node to ATen node mapping:z   z => r  z Graph fragment:ro  c                R  > [        U TR                  5      (       aF  [        U R                  TR                  5      (       a!  U R                  R                  R                  nOU R                  nUc  UnOUR
                  n U R                  5       nX44$ ! [         a    S n X44$ f = fr   )r{   	TensorBoxdata
StorageBoxorigin_noder   
get_layoutr  )bufferrw_namer4  r   layoutr  s        rW   get_buffer_info,get_kernel_metadata.<locals>.get_buffer_info  s     fbll33
KK9 9 #)++"2"2">">K"("4"4K&"D&++D"#..0F |# + "!F|#"s   B B&%B&c           	     j    SSR                  U  Vs/ s H  n[        U5      PM     sn5       S3$ s  snf )N[r0  r/  )r  r  )shaperT   s     rW   stringify_shape,get_kernel_metadata.<locals>.stringify_shape  s1    499e%<ec!fe%<=>a@@%<s   0
c                   > U c  gT" U R                   5       nT" U R                  5       nU R                   nS[        U R                      U U U S3$ )Nr  ")r  strider   r"   r   )r8  shape_annotationstride_annotationdevice_annotationr>  s       rW   stringfy_layout,get_kernel_metadata.<locals>.stringfy_layout  sl    >&5fkk&B%C '6v}}'E&F!'-}}o! FLL123C2D()*;)<A?ry   read_writesreadsz   %z
 : Tensor z = PlaceHolder[target=writes%T)include_tensor_metadataz
   return ,
)r6  z2Union[ir.TensorBox, ir.Buffer, ir.TorchBindObject]r7  r  r   ztuple[str, ir.Layout | None])r=  zIterable[int]r   r  )r8  zir.Layout | Noner   r  )(r  r  collectionsdefaultdictr_  r#   rR   rs  r  r   nodesr&  sortr  r{   rP   r  r  r  r  r  r   r  getcommentr  keysr  itemsr  r  r?   rr  rp  rH  rI  addtry_get_bufferrJ  format_node)"r  r  r  r  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsidxr$  node_to_idx_mapri  r  r`  sort_strmetadatadetailed_metadataoriginal_noderQ  	all_reads
all_writesrp  r9  rF  rr6  
input_namer8  woutput_namer   r  r(  r>  s"                                  @@@rW   get_kernel_metadatarj  T  s   $ $M2K+6W;)):Vf;NW ,,T2N$006
 L""CN"CC}")!,22L<)TUU8A,BTBT8U"V8Ufc168U"VIXFW    dii'DIIo,F,R IIo6MC-)>)>??-778M5::+I+IJJ-,,./"'..tyy9$))#))K(+00C&&tyy1YY]],->99%,,TYY7   *6)A%zH??
1XJ&6tyyATATAV7W6X Y99%7%<%<%>?@	C  $OO,,NOP &~';';'= >u  s=/diiu6N5OP	
 !?   GOO#44D!EF%/\	 "
-99&$J$UX$-$(A
 #q-00AMM4I1=='22q}}7J7J7V]]0066Y.$!aff-!"!7!7!?!>$-<VQVV-L*
F)00&/tJ<z.v677Mj\YZ\ 1 AMM844,,8]]11!"!7!7!?!>$)8)HQ"))#*;< 2- #< #D$$??#3t'7'7PT'7'U&VW #
 	  GOO#4Jsxx
?S>T!UVTYY0111I X #Ws   WWWc                   [        U 5      n [        U 5      nU (       ak  U R                  5       nUR                   HB  nU(       a  U" U5      (       a  M  XB;  d  M   UR	                  U5        U R                  U5        MD     U (       a  Mk  U$ )zJReturns the set of nodes whose values depend on those within initial_queue)r_  r#   rS   r  rW  r  )initial_queueskip_filterdominated_setri  users        rW   dominated_nodesrp    sx    
 'M}-M
  "JJD{400(!!$'$$T*  - ry   c                Z  ^^	 SSK Jm  SUU	4S jjm	[        U5      u  p#U Vs/ s H  nT	" U5      (       d  M  UR                  PM      nn[        U 5      u  pcU Vs/ s H  nT	" U5      (       d  M  UR                  PM      nn[	        [
        R                  " / UQUQ76 5      $ s  snf s  snf )Nr7   r  c                l  > [        U TR                  5      (       a  T" U R                  5      $ [        U TR                  5      (       a  T" U R                  5      $ [        U TR                  5      =(       a=    [        U TR
                  TR                  TR                  TR                  45      (       + $ r   )	r{   r1  r2  r3  r@   ComputedBufferInputsKernelInputBufferTemplateBuffer)r$  r  is_unrealized_nodes    rW   rw  *gather_origins.<locals>.is_unrealized_node  s    a&&%aff--a''%aff--!RYY' 

!!!!	1
 -
 	
ry   )r$  r@   r   r7  )r  r  r$   r
  r#   	itertoolschain)
r   r  kwargs_flattenr   rl  kwargs_originsargs_flattenargs_originsr  rw  s
           @@rW   gather_originsr    s     
 
" %V,N-;W^c?QRU?Vkckk^NW"4(OL+7S<C;Mc;RKCKK<LSiooE|EnEFF XSs   B#B#B(0B(c                X   ^^^^ SS jmSUU4S jjmSUU4S jjmSU4S jjmT" U 5      $ )z
Normal sympy str is very slow, this is a lot faster.  The result are
somewhat worse, as it doesn't do as much simplification.  So don't
use this for final codegen.
c                    [        U [        R                  5      =(       a1    [        U R                  5      S:H  =(       a    U R                  S   S:H  $ )N   r   r   )r{   r|   MulrR   r   )rj  s    rW   is_neg_leadsympy_str.<locals>.is_neg_lead&  s:    tUYY'VC		Na,?VDIIaLTVDV	
ry   c                v  > [        U [        R                  5      (       a  [        U R                  5      S:X  aT  T" U R                  S   5      (       a:  T" U R                  S   5       ST" U R                  S   R                  S   5       3$ SR                  [        TU R                  5      5      $ T" U 5      $ )Nr  r7   r   z - z + )r{   r|   r}   rR   r   r  r   )rj  r  sympy_str_muls    rW   sympy_str_add sympy_str.<locals>.sympy_str_add+  s    dEII&& 499~"{499Q<'@'@'		!56c-		RSHYHYZ[H\:]9^__zz#mTYY"?@@ &&ry   c                   > [        U [        R                  5      (       aJ  T" U 5      (       a  ST" U R                  S   5       3$ SR	                  [        TU R                  5      5      $ T" U 5      $ )N-r7   z * )r{   r|   r  r   r  r   )rj  r  sympy_str_atoms    rW   r   sympy_str.<locals>.sympy_str_mul6  sa    dEII&&4   >$))A,7899zz#ndii"@AA!$''ry   c                  > [        U [        R                  5      (       a  U R                  $ [        U [        R                  [        R
                  45      (       a  ST" U 5       S3$ [        U [        [        [        [        45      (       aC  U R                  R                   SSR                  [        [        U R                  5      5       S3$ [!        U 5      $ )N()r0  )r{   r|   Symbolr   r}   r  rc   r`   ra   rb   funcr   r  r   	sympy_strr   r  )rj  r  s    rW   r  !sympy_str.<locals>.sympy_str_atomA  s    dELL))99uyy%))455}T*+1--(HMNNii(()499SDII5N+O*PPQRRt9ry   )rj  r   r   r7  rj  r   r   r  r   )rj  r  r  r  r  s    @@@@rW   r  r    s.    

	' 	'	( 	( ry   c                    SSK Jn  [        R                  (       a9  [	        UR
                  SS 5      =n(       a  UR                  S:w  a  [        U 5      $ [        R                  " 5       $ )Nr7   ro  current_node
index_expr)
rr  rp  ri   compute_all_boundsrO   interpreterr  rf   rg   unknown)r  rp  fx_nodes      rW   get_bounds_index_exprr  N  sN     	!!~tDDWDNNl*5!!""$$ry   c                    U S   S:H  $ )Nr   rf  r   )prefixs    rW   prefix_is_reductionr  \  s    !9ry   c                D    U [         R                  :w  d   e[        XSSS9$ )1
Used to generate an integer-nonnegative symbol.
Tintegernonnegative)re   SIZErd   )r  r^  s     rW   sympy_index_symbol_with_prefixr  `  s'     TYY vDdCCry   c                b    U =(       d    [         R                  =(       a    [         R                  $ r   )ri   debug_index_assertsassert_indirect_indexing)checks    rW   generate_assertr  l  s    /V//TV5T5TTry   c                D    U S   S:w  d   e[         R                  " U SSS9$ )r  r   r   Tr  )r|   r  r   s    rW   sympy_index_symbolr  p  s)     7c>> <<d==ry   c                          SS jn[         R                  " U 5      R                  UR                  5        VVs0 s H  u  p4X2" X45      _M     snn5      $ s  snnf )z
When the passed replacement symbol v is a string, it is converted to a symbol with name v that
have the same replaced expression integer and nonnegative properties.
c                    [        U [        R                  5      (       d   e[        U[        5      (       a*  [        R                  " UU R
                  U R                  S9$ U$ )Nr  )r{   r|   r1  r  r  r   is_nonnegative)replacedreplacements     rW   	to_symbolsympy_subs.<locals>.to_symbol  sV     (EJJ////k3''<< ++$33  ry   )r  r   r  zUnion[sympy.Expr, str]r   sympy.Symbol)r|   r2  xreplacerV  )rj  replacementsr  kr   s        rW   
sympy_subsr  |  sh    +A	 ==''(4(:(:(<=(<IaO	(<= =s   A
c                   [        U [        R                  5      =(       dd    [        U [        R                  5      =(       aC    [	        S [
        R                  " U R                  5       U R                  5       5       5       5      $ )Nc              3  8   #    U  H  n[        U5      v   M     g 7fr   is_symbolicr   rT   s     rW   r   is_symbolic.<locals>.<genexpr>  s     N(M1A(Mr   )	r{   rP   r2   r  r  ry  rz  r  rB  )r%  s    rW   r  r    sS    a& 1ell# 	ON	!((*(MNNry   c                 &    [        S U  5       5      $ )Nc              3  8   #    U  H  n[        U5      v   M     g 7fr   r  r}  s     rW   r   "any_is_symbolic.<locals>.<genexpr>  s     ,t!{1~~tr   r  )r   s    rW   any_is_symbolicr    s    ,t,,,ry   )z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalarc                    SSK Jn  U R                  R                   HH  n[	        U5      (       a  Us  $ UR
                  R                  S5      =nc  M7  U" U5      (       d  MF  Us  $    g )Nr   )r)   rl  )%torch.fx.experimental.symbolic_shapesr)   rs  rQ  is_cudagraph_unsafe_fx_noder  rS  )r  r)   ri  rl  s       rW   %get_first_incompatible_cudagraph_noder    sW     L&t,,K99==''C49Ns9S9SK  ry   c                    [        [        [        U R                  R                  5      5      5      nUR
                  S:X  d   eU$ )z$Get the output node from an FX graphr  )nextiterreversedrs  rQ  r  )r  	last_nodes     rW   output_noder    s6    T(288>>234I<<8###ry   c                    U R                   R                  SS9n[        S U 5       5      n[        U 5      R                  S   n[        U[        5      (       a  UOU4n[        S U 5       5      nX%-  $ )Nr  r  c              3     #    U  HX  n[        UR                  R                  S 5      [        R                  5      (       d  M=  UR                  S    R
                  v   MZ     g7frl  N)r{   r  rS  rP   r  r   )r   ri  s     rW   r   "get_all_devices.<locals>.<genexpr>  sC      9%DdiimmE*ELL9 	 		%%s   <A" A"r   c              3    #    U  H  n[        U[        R                  R                  5      (       d  M.  [        UR                  R                  S 5      [        R                  5      (       d  Mh  UR                  S    R                  v   M     g7fr  )r{   rP   r  r5   r  rS  r  r   )r   r  s     rW   r   r    s[      7Cc588==) 	 sxx||E*ELL9 	s   -B6B- B)rs  
find_nodesr#   r  r   r{   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicess         rW   get_all_devicesr    s~    ++}+=.8 9%9 /M "o""1%G$We44w7*H,6 77 -K &&ry   c                    [        [        R                  R                  5       5       GH5  n U R	                  S5      (       d  M  [        R                  U    nUR
                   H  nUR	                  S5      (       d  M  [        X5      n[        U[        R                  R                  R                  R                  5      (       d  Me  UR                   Hp  n[        U[        R                  R                  R                  R                  5      (       d  MB  UR                  R                   R"                  R%                  5         Mr     M     [        R                  U 	 GM8     S[        R                  ;   aR  [        R                  S   n['        UR(                  R*                  R,                  5      ?UR(                  R*                  ?[0        R2                  " 5         g )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)r_  sysmodulesrU  
startswith__dict__rO   r{   rP   	_inductorruntimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r  driveractiveutilsinstancegccollect)module_namem	attr_namer  r  r  s         rW   unload_xpu_triton_pydsr    sJ   CKK,,./%%&NOOKK$I##I.. .EOO33EEVV  #)"8"8%"!OO33EEYY 
 #MM--1199; #9 $ KK$# 0( #++-kk12""(()2JJ#JJLry   _registered_cachesc                    [        U S5      (       a  [        U R                  5      (       d  [        U  S35      e[        R                  U 5        U $ )z\
Use this decorator to register any caches that should be cache_clear'd
with fresh_cache().
cache_clearz# does not have a cache_clear method)r  callabler  AttributeErrorr  r  r  s    rW   clear_on_fresh_cacher     sE    
 3&&hs.G.Gu$GHIIc"Jry   c                 >    [          H  n U R                  5         M     g)z
Clear all registered caches.
N)r  r  r  s    rW   clear_cachesr    s     " "ry   c              #    ^#    [        5         SSKJn  U" [        R                  " US95      m [
        R                  R                  [        R                  ST05         [        R                  ST5        U" [        R                  R                  TS5      5      n[
        R                  R                  [        R                  SU05         Sv   [        U [        5      (       a  [        U 5      S:X  d   S	5       e[        R                  R!                  U5      (       a{  [        R"                  " U5      nU R%                  U Vs0 s HH  nS
U;  d  M  U[        R                  R'                  [        R                  R                  XF5      5      _MJ     sn5        SSS5        SSS5        U(       a^  [)        5       (       a-  [*        R,                  R/                  5       (       a
  [1        5         [2        R4                  " T[)        5       U4S jS9  [        5         gs  snf ! , (       d  f       N= f! , (       d  f       N= f! [6         a    [        R9                  ST5        e f = f! [        5         f = f7f)z
Contextmanager that provides a clean tmp cachedir for pt2 caches.

Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
generated with this cache instance.
r   )normalize_path_separator)dirTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNz!expected empty cache_entries dictz.lockc                .   > [         R                  STUS9$ )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)r  pathr
  inductor_cache_dirs      rW   r  fresh_cache.<locals>.<lambda>E  s    S[[@&% 6A 6ry   )ignore_errorsonerrorz(on error, temporary cache dir kept at %s)r  torch._inductor.cpp_builderr  tempfilemkdtempr   patchdictosenvironr   r   r  r  r{   rR   existslistdirr^  getsize
is_windowsrP   rJ   rQ   r  shutilrmtree	Exceptionr  )cache_entriesr  deleter  triton_cache_dirfilesfr  s          @rW   fresh_cacher$    s     ND1(2B2Bs2KL)ZZ__JJ24FG
 II35GH7/:  .@BR-STmT22}-2W4WW2ww~~&677 "

+; <%,, */).A#*!#3 !V277??277<<@P3T#U U). U
( ||		 6 6 8 8&(MM" )l  	5 UT
 
H  >@RS 	st   +I60I A-H1A9H 
HAHH H1#A-I I6H  
H.	*H11
H?;I "I$$I' 'I33I6)reversec                   U R                   n[        [        U 5      5      n[        [	        X2SS95      nU(       d  [        [        U5      5      $ U$ )NTr`  r%  )__getitem__r   rR   r_  r  r  )seqr%  gettera_rsort_idxs        rW   argsortr-  X  sC    __F
C/C F3D9:HHX&''Ory   c          	     F  ^  SU 4S jjn[        U5       VVs/ s H>  u  pEU[        U[        R                  5      (       a  UR                  R
                  OU4PM@     nnn[        U[        R                  " U5      US9nU VVs/ s H  u  pGUPM	     nnnU$ s  snnf s  snnf )Nc                ~   > U u  p#Uu  pESU4S jjnU" X5:  5      (       a  gU" X5:  5      (       a  gX$:  a  gX$:  a  gg)Nc                R   > [        U [        5      (       a  U $ TR                  U SS9$ )NT)size_oblivious)r{   r7  evaluate_expr)rj  ru  s    rW   evaluate*argsort_sym.<locals>.cmp.<locals>.evaluater  s+    $%%**4*EEry   r   r7   r   )rj  z%Union[bool, torch.SymInt, sympy.Expr]r   r7  r   )r%  r&  a_idxa_valb_idxb_valr3  ru  s          rW   r  argsort_sym.<locals>.cmpn  sN    	F
 EM""EM""
 ==ry   r'  )r%  tuple[int, sympy.Expr]r&  r:  r   r   )	r   r{   rP   r2   ri  rj  r  r  
cmp_to_key)	ru  r)  r%  r  r^  r   exprsr   r  s	   `        rW   argsort_symr=  h  s    4  n$FC 
Z5<<88affkka@$ 
  5i2237IE %&fccF&M
 's   ABBc                r    U [         R                  :X  a  g[         R                  " SU S9R                  5       $ )Nrt   r   r   )rP   r[  r   element_sizer?  s    rW   get_dtype_sizerA    s-     ;;r'4466ry   c                       \ rS rSr% S\S'   Srg)LineContexti  r   contextr   Nr   r   r   r   r   r   r   ry   rW   rC  rC    s    Lry   rC  c                  *    \ rS rSr% S\S'   S\S'   Srg)ValueWithLineMapi  r  r   zlist[tuple[int, LineContext]]line_mapr   NrE  r   ry   rW   rG  rG    s    J++ry   rG  c                     \ rS rSrSrSSS jjr\R                  SS j5       rSS jr	SS jr
SS jrSS jrSS	 jrSS
 jrSS jrSS jr    S S jrS!S"S jjrS!S#S jjrS!S#S jjr S$     S%S jjrS&S jrSS jrS'S jrS(S jrSrg))IndentedBufferi     c                    / U l         Xl        g r   )_lines_indent)r  initial_indents     rW   __init__IndentedBuffer.__init__  s    GI%ry   c              #  \   #    U R                   n Xl         S v   X l         g ! X l         f = f7fr   )tabwidth)r  rS  prevs      rW   set_tabwidthIndentedBuffer.set_tabwidth  s%     }}	!$M MDMs   ,
! ,),c                   [        5       nSn/ nU R                   H  n[        U[        5      (       a  U" 5       nUc  M$  O5[        U[        5      (       a  UR                  X$R                  45        MX  Un[        U[        5      (       d   eUR                  U5        UR                  S5        USUR                  S5      -   -  nM     [        UR                  5       U5      $ )Nr7   rN  )r   rM  r{   DeferredLineBaserC  r  rD  r  writecountrG  getvalue)r  bufr   linemaplilines         rW   getvaluewithlinemap"IndentedBuffer.getvaluewithlinemap  s    j13++B".//t<  B,,::/dC((((IIdOIIdOTZZ%%%A   88ry   c                6    U R                  5       R                  $ r   )r`  r   r  s    rW   r[  IndentedBuffer.getvalue  s    '')///ry   c                   [        5       nU R                   H  n[        U[        5      (       a  U" 5       nUc  M$  O[        U[        5      (       a  M<  Un[        U[
        5      (       d   eUR                  S5      (       a  UR                  US S 5        M  UR                  U5        UR                  S5        M     UR                  5       $ )N\r   rN  )	r   rM  r{   rX  rC  r  endswithrY  r[  )r  r\  r^  r_  s       rW   getrawvalueIndentedBuffer.getrawvalue  s    j++B".//t<  B,,dC((((}}T""		$s)$		$		$   ||~ry   c                8    U R                   R                  5         g r   )rM  clearrc  s    rW   rk  IndentedBuffer.clear  s    ry   c                ,    [        U R                  5      $ r   )r7  rM  rc  s    rW   __bool__IndentedBuffer.__bool__  s    DKK  ry   c                :    SU R                   U R                  -  -  $ )Nr.  )rN  rS  rc  s    rW   r  IndentedBuffer.prefix  s    dllT]]233ry   c                &    U R                  S5        g )NrN  	writelinerc  s    rW   newlineIndentedBuffer.newline  s    try   c                   [        U[        5      (       a  U R                  R                  U5        g [        U[        5      (       a9  U R                  R                  UR                  U R                  5       5      5        g UR                  5       (       a.  U R                  R                  U R                  5        U 35        g U R                  R                  S5        g Nr  )r{   rC  rM  r  rX  with_prefixr  stripr  r_  s     rW   rt  IndentedBuffer.writeline  s    dK((KKt$.//KKt//>?ZZ\\KK$++-78KKr"ry   c                8    U H  nU R                  U5        M     g r   rs  )r  linesr_  s      rW   
writelinesIndentedBuffer.writelines  s     DNN4  ry   c                L   ^ ^ [         R                  SUU 4S jj5       nU" 5       $ )Nc               3     >#    T=R                   T -  sl          S v   T=R                   T -  sl         g ! T=R                   T -  sl         f = f7fr   rN  )offsetr  s   rW   r  "IndentedBuffer.indent.<locals>.ctx  s8     LLF"L'&&s   A4 AAAr   Iterator[None])
contextlibcontextmanager)r  r  r  s   `` rW   indentIndentedBuffer.indent  s$    		"	"	' 
#	' ury   c                .    U =R                   U-  sl         g r   r  r  r  s     rW   	do_indentIndentedBuffer.do_indent      ry   c                .    U =R                   U-  sl         g r   r  r  s     rW   do_unindentIndentedBuffer.do_unindent  r  ry   c           	        [        U[        5      (       a  [        S5      nUR                   HR  n[        U[        5      (       a  M  U(       d  M#  [        U[        U5      [        UR                  5       5      -
  5      nMT     [        R                  " U5      (       a  SnUR                   HV  n[        U[        5      (       a  U R                  R                  U5        M5  [        R                  X[        U5      S  5        MX     g [        R                  " U5      nU(       a  UR                  5       nU(       d  g UR                  5       nUR!                  S5       H  nU R                  U5        M     g )Ninfr   rN  )r{   rJ  floatrM  rC  minrR   r  mathisinfr  rt  r   textwrapdedentrstripr]  )r  
other_coderz  r  r_  r   s         rW   spliceIndentedBuffer.splice
  s    j.115\F"))!$44 TS5G)GHF * zz&!!"))dK00KK&&t,",,TF3FG	 * "4J'..0
#**,J%%d+q! ,ry   c                    [        U R                  S9nU R                   Vs/ s H
  o1" U5      PM     snUl        U$ s  snf N)rO  )rJ  rN  rM  )r  r  r   r_  s       rW   r   IndentedBuffer.map$  s8    DLL9-1[[9[Td4j[9

 :s   =c                @    [        U 5       SU R                  5        S3$ )Nr  r  )r  r[  rc  s    rW   __repr__IndentedBuffer.__repr__)  s     t*Qt}}/q11ry   c                    U R                   UR                   :X  d   e[        U R                   S9nUR                  U R                  5        UR                  UR                  5        U$ r  )rN  rJ  r  rM  )r  otherr   s      rW   __add__IndentedBuffer.__add__,  sK    ||u}},,,DLL9t{{#u||$
ry   c                    XR                   ;   $ r   )rM  )r  new_lines     rW   containsIndentedBuffer.contains4  s    ;;&&ry   )rN  rM  rS  Nr   )rO  r   r   r  )rS  r   r   r  )r   rG  r   r  r   r  r   r7  )r_  z)Union[LineContext, DeferredLineBase, str]r   r  )r~  z3Sequence[Union[LineContext, DeferredLineBase, str]]r   r  r   )r  r   r   'contextlib.AbstractContextManager[None])r  r   r   r  )F)r  zUnion[IndentedBuffer, str]rz  r7  r   r  )r  zCallable[[Any], Any]r   rJ  )r  r   r   rJ  )r  z)Union[DeferredLineBase, LineContext, str]r   r7  )r   r   r   r   rS  rP  r  r  rU  r`  r[  rh  rk  rn  r  ru  rt  r  r  r  r  r  r   r  r  r  r   r   ry   rW   rJ  rJ    s    H& ! !9(0(!4#!H!	!	 EJ"4"=A"	"4
2'ry   rJ  c                  6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )FakeIndentedBufferi8  c                "   > [         TU ]  5         g r   )superrP  )r  	__class__s    rW   rP  FakeIndentedBuffer.__init__9  s    ry   c                V    US:X  a  [         R                  X5      $ [        SU S35      e)Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )r  r   s     rW   r  #FakeIndentedBuffer.__getattribute__<  s9    ;**466!$ (= =
 	
ry   r   r  )r   r  r   r   )r   r   r   r   rP  r  r   __classcell__r  s   @rW   r  r  8  s    
 
ry   r  c               #     #    [         R                  [         R                  p S v   Xs[         l        [         l        g ! Xs[         l        [         l        f = f7fr   )r  stdoutstderr)initial_stdoutinitial_stderrs     rW   restore_stdout_stderrr  G  s9     %(ZZN@!/
CJ
CJs    A> AAAc                  h    \ rS rSrSrSS jrSS jrSS jrSS jrSS jr	SS jr
SS	 jrSS
 jrSrg)rX  iP  z.A line that can be 'unwritten' at a later timec                >    UR                  5       (       d  SnXl        g rx  )rz  r_  r{  s     rW   rP  DeferredLineBase.__init__S  s    zz||D	ry   c                    [         e)zJReturns either self.line or None to indicate the line has been 'unwritten'r  rc  s    rW   r  DeferredLineBase.__call__X      !!ry   c                    [         e)z3Returns a new deferred line with the same conditionr  r{  s     rW   	_new_lineDeferredLineBase._new_line\  r  ry   c                @    U R                  U U R                   35      $ r   r  r_  )r  r  s     rW   ry  DeferredLineBase.with_prefix`  s    ~~455ry   c                T    U R                  U R                  R                  5       5      $ r   )r  r_  r  rc  s    rW   r  DeferredLineBase.lstripc  s    ~~dii..011ry   c                >    U R                  U R                  U   5      $ r   r  )r  r  s     rW   r(  DeferredLineBase.__getitem__f  s    ~~dii.//ry   c                ,    [        U R                  5      $ r   )r7  r_  rc  s    rW   rn  DeferredLineBase.__bool__i  s    DIIry   c                ,    [        U R                  5      $ r   )rR   r_  rc  s    rW   __len__DeferredLineBase.__len__l  s    499~ry   )r_  N)r_  r  )r   zUnion[str, None])r_  r  r   r   )r  r  r   r   )r   r   )r  zUnion[int, slice]r   r   r  r   r   )r   r   r   r   r   rP  r  r  ry  r  r(  rn  r  r   r   ry   rW   rX  rX  P  s-    8
""620ry   rX  c                  D   ^  \ rS rSrSrSU 4S jjrSS jrS	S jrSrU =r	$ )
DelayReplaceLineip  z6At end of codegen call `line.replace(key, value_fn())`c                <   > [         TU ]  U5        Xl        X l        g r   )r  rP  r`  value_fn)r  r`  r  r_  r  s       rW   rP  DelayReplaceLine.__init__s  s     ry   c                j    U R                   R                  U R                  U R                  5       5      $ r   )r_  replacer`  r  rc  s    rW   r  DelayReplaceLine.__call__x  s#    yy  4==?;;ry   c                D    [        U R                  U R                  U5      $ r   )r  r`  r  r{  s     rW   r  DelayReplaceLine._new_line{  s    $-->>ry   )r`  r  )r`  r  r  zCallable[[], str]r_  r  r  )r_  r  r   r  )
r   r   r   r   r   rP  r  r  r   r  r  s   @rW   r  r  p  s    @!
<? ?ry   r  c                   [        U [        R                  5      (       a  U nO[        R                  " [        5       U 5      n[        R
                  " U5      n[        R                  R                  (       aF  UR                  c   eUR                  S:  d  UR                  S:X  a  [        R                  S5        ggUR                  S:X  a  SOSnUR                  nXC:  a  [        R                  S	X4S
.S9  gg)N	   
   z6GPU arch does not support max_autotune_gemm mode usageFTrJ   rq   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)r{   rP   r   rX   r    createversionhipmajorr   r  r  multi_processor_count)index_or_devicer   propr  r  s        rW   
is_big_gpur    s    /5<<00 lno>""6*D }}zz%%%::>TZZ2-KKPQKK5(bbG**I:%> 	 	
 ry   c                     [         R                  R                  5       (       a(  [         R                  R                  5       R                  $ [         R
                  R                  S5      R                  $ )NrH   )rP   rJ   rQ   get_device_propertiesgpu_subslice_countrH   r  r   ry   rW   get_max_num_smsr    sI    yyyy..0CCC::++F3IIIry   c                     [         R                  R                  5       (       d  g[         R                  R                  [         R                  R	                  5       5      n U R
                  S:H  $ )zEReturns true if the device is a NVIDIA B200, otherwise returns false.Fr  )rP   rH   rQ   r  r  r  )device_propertiess    rW   
using_b200r    sM     ::""$$

889R9R9TU""b((ry   c                     [         R                  R                  5       (       a
  [        5       $ [         R                  R                  5       n [        5       U b  U -
  $ S-
  $ )zFHandle experimental carveout if set otherwise return hardware SM countr   )rP   rJ   rQ   r  r
  _get_sm_carveout_experimental)carveouts    rW   get_num_smsr    sM     yy  xx557HH,@HHaHHry   c                    SSK JnJn  Uc
  [        5       nUR	                  S5      nX -  [
        -  nU" UUUUR                  " 5       S9$ )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r7   )r8   WorkspaceZeroModeF)rZ  	zero_moder   
outer_name)codegen.commonr8   r  r  	from_boolTMA_DESCRIPTOR_SIZEunique_name)num_tma_descriptorsr   num_programsr8   r  r  r  s          rW   get_tma_workspace_argr
    sU     @"}!++E2I-0CCD++-	 ry   c                   U R                   U;  a!  [        R                  SU R                   U5        [        U R                  R
                  5      =(       a+    U R                   U;   =(       a    [        U R                  5      $ )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r  r  )r8  allowed_layout_dtypess     rW   _use_template_for_gpur    sf     ||00		RLL!	
 	v}}!!" 	&LL11	&v}}%ry   c                    U R                  5       [        R                  R                  5       R                  S5       Vs/ s H  oR	                  5       PM     sn;   $ s  snf NrM  )r   ri   max_autotune_gemm_backendsr]  rz  backendrT   s     rW   _use_autotune_backendr    P    ==?!<<BBDJJ3OOa	O      Ac                    U R                  5       [        R                  R                  5       R                  S5       Vs/ s H  oR	                  5       PM     sn;   $ s  snf r  )r   ri   max_autotune_conv_backendsr]  rz  r  s     rW   _use_conv_autotune_backendr    r  r  )enable_int32enable_float8check_max_autotunec                  SSK JnJn  [        R                  [        R
                  [        R                  /nU(       a>  [        R                  [        R
                  [        R                  [        R                  /nU(       a/  UR                  [        R                  [        R                  /5        [        U R                  R                  5      =(       a    [        X5      =(       d/    U R                  R                  S:H  =(       a    U R                  U;   =(       ak    [         R"                  =(       d    [         R$                  =(       d    U(       + =(       a/    ['        S5      =(       a    U" U R                  UR(                  5      $ )Nr7   )BackendFeaturehas_backend_featurer  TRITON)r  r  r  rP   r   rG  rI  rQ  extendrA  rB  r  r   r  r  r   ri   max_autotunemax_autotune_gemmr  TRITON_TEMPLATES)r8  r  r  r  r  r  layout_dtypess          rW   use_triton_templater&    s    D]]ENNEMMBMu{{Se1153D3DEF v}}))* A)&@O ""e+M0M
	P   VF$<$<VDV@V
	P "(+
	P  ~/N/NOry   output_layout
add_guardsc                   ^^^^^ SSK Jn  SSKJm  S
U4S jjmSUU4S jjnSUU4S jjm          SUUU4S jjmU" 5       =(       a$    [	        U4S	 jU 5       5      =(       a    U" U 5      $ )u  
Return True iff *all* supplied tensors satisfy the CUDA-12.9 TMA constraints
that Triton relies on today.
* https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

A tensor is accepted when:
  * 2 ≤ rank ≤ 5
  * dtype ∈ {FP16, BF16, FP8-E4M3FN}
  * Every logical size ≥ 2
  * Base pointer 16-byte aligned
  * All "outer" dims have 16-byte aligned strides
  * The “inner” dim has stride 1 (contiguous)
  * For FP8 tensors, inner dim ≥ 32
r   )has_triton_tma_devicer7   ro  c                X   > TR                   R                  R                  U [        5      $ r   )rs  rt  statically_known_multiple_ofTMA_ALIGNMENT)
expr_bytesrp  s    rW   _alignedcan_use_tma.<locals>._aligned  s     ww<<ZWWry   c                   > U c  gU R                   nU R                  nU R                  nT" U R                  5      (       d  gT" XUSS9$ )NTFallow_float32)r  rB  r   r  )r8  sizesstridesr   r0  _is_tma_compatibles       rW   _is_tma_compatible_layout.can_use_tma.<locals>._is_tma_compatible_layout  sI    >-- &&!%%tLLry   c                   > U R                  5       nU R                  5       nU R                  5       nU R                  5       TR                  R
                  ;   a  gT" XUSS9$ )NFr3  )get_size
get_stride	get_dtyper  rs  unaligned_buffers)r  r5  r6  r   rp  r7  s       rW   _is_tma_compatible_matrix.can_use_tma.<locals>._is_tma_compatible_matrix(  sP    

,,. ::<177444!%%uMMry   c                |  > [        U 5      nUR                  nUS:  d  US:  a  gU[        R                  [        R                  [        R
                  4;  a  U(       a  U[        R                  :w  a  gT(       aK  TR                  R                  R                  U 5      nTR                  R                  R                  U5      nOjU  Vs/ s H(  nTR                  R                  R                  U5      PM*     nnU V	s/ s H(  n	TR                  R                  R                  U	5      PM*     nn	[        U4S jU 5       5      (       a  g[        U5       V
V	s/ s H4  u  pTR                  R                  R                  U	S5      (       d  M2  U
PM6     nn
n	[        U5      S:w  a  gUS   n[        U5       H  u  pX:X  a  M  T" X-  5      (       a  M    g   Xl   nT" X-  5      (       d  gU[        R
                  :X  a,  TR                  R                  R                  US5      (       d  ggs  snf s  sn	f s  sn	n
f )	Nr  r   Fc              3  z   >#    U  H0  nTR                   R                  R                  US 5      (       + v   M2     g7fr  N)rs  rt  statically_known_geq)r   r   rp  s     rW   r   :can_use_tma.<locals>._is_tma_compatible.<locals>.<genexpr>N  s.     P1177##88A>>>s   8;r7   r       T)rR   itemsizerP   r   rG  rA  rI  rs  rt  guard_int_seqsymbolic_hintr  r   statically_known_equalsrD  )r5  r6  r   r4  rankrG  sizes_i	strides_ir   str   r  	inner_idx	inner_dimrp  r0  r)  s                 rW   r7  'can_use_tma.<locals>._is_tma_compatible3  s    5z>> !8tax 8K8KLL%--!7gg&&44U;G((66w?IBGH%Qqww''55a8%GHFMNg))77;gIN PPPP
 #9-
-ww77A> - 	 

 u:?!H	 y)EA~BM**	 * &		,-- E'''0@0@0U0Ur1
 1
 G IN
s   /H. /H31H8H8c              3  4   >#    U  H  nT" U5      v   M     g 7fr   r   )r   r  r?  s     rW   r   can_use_tma.<locals>.<genexpr>q  s     ?h)!,,h   )r/  Union[int, sympy.Expr]r   r7  )r8  Optional[Layout]r   r7  )r  r@   r   r7  )
r5  Sequence[sympy.Expr]r6  zSequence[_IntLike]r   torch.dtyper4  r7  r   r7  )torch.utils._tritonr+  rr  rp  r   )	r(  r)  matricesr+  r8  rp  r0  r7  r?  s	    `   @@@@rW   can_use_tmar[    s    " :XM M	N 	N:#:#: : 	:
 
: :z 	 	5?h??	5%m4ry   )r)  c                    [         R                  R                  (       a  U OS n[        S U 5       5      =(       a,    [	        X#US.6=(       a    [         R                  R
                  $ )Nc              3  Z   #    U  H!  n[        UR                  5       5      S :H  v   M#     g7frC  )rR   r;  )r   r  s     rW   r   *use_triton_tma_template.<locals>.<genexpr>{  s      5HqC

"Hs   )+r'  )ri   r  enable_template_tma_storer   r[  enable_persistent_tma_matmul)r(  r)  rZ  r8  s       rW   use_triton_tma_templatera  v  sK     %mmEE]4F5H55 	7JO	7MM66ry   c                f    [        X US.6(       d  gSSKJn  SSKJn  U" 5       =(       a    U" 5       $ )Nr'  Fr   )%has_triton_tensor_descriptor_host_tmar7   is_datacenter_blackwell_arch)ra  rY  rc  codegen.cuda.cuda_envre  )r(  r)  rZ  rc  re  s        rW   !use_triton_blackwell_tma_templaterg    s5     #	:  IC 12U7S7UUry   c                     X;   =(       a    X;   $ r   r   )scale_option_ascale_option_bscaling_typess      rW   use_triton_scaling_templaterl    s    
 *N~/NNry   )maxsizec                 f     [         R                  R                  S5      SL$ ! [         a     gf = f)zCheck if CuTeDSL is importable; cache the result for reuse.

Call ensure_cute_available.cache_clear() after installing CuTeDSL
in the same interpreter to retry the import.
cutlassNF	importlibutil	find_specr  r   ry   rW   ensure_cute_availablert    s3    ~~''	2$>>     # 
00c                 f     [         R                  R                  S5      SL$ ! [         a     gf = f)zCheck if NVIDIA Universal GEMM (cutlass_api) is importable; cache the result for reuse.

Call ensure_nv_universal_gemm_available.cache_clear() after installing cutlass_api
in the same interpreter to retry the import.
cutlass_apiNFrp  r   ry   rW   "ensure_nv_universal_gemm_availablerx    s3    ~~''6dBB ru  c                 f     [         R                  R                  S5      SL$ ! [         a     gf = f)a3  Check if nvMatmulHeuristics is importable; cache the result for reuse.

nvMatmulHeuristics provides performance model-based kernel selection
for NVIDIA GEMM operations.

Call ensure_nvmatmul_heuristics_available.cache_clear() after installing
nvMatmulHeuristics in the same interpreter to retry the import.
nvMatmulHeuristicsNFrp  r   ry   rW   $ensure_nvmatmul_heuristics_availabler{    s4    ~~''(<=TII ru  c                   [        5       (       d  g[        S5      (       d  gSSKJn  [	        UR
                  R                  5      (       d  gU" 5       (       d  g[        R                  /n	[        X)5      (       d  g[        R                  (       d  [        R                  (       d  g[        XUS9(       d  g[        S X4 5       5      (       a  gU(       a  U(       a  gUc  gUc  Ub  gg)a  
Returns True if we can use the blackwell kernel for grouped mm.
Required conditions:
    1. CuTeDSL backend is enabled
    2. CuTeDSL is available
    3. We are on a blackwell arch
    4. The dtype is bf16
    5. Max autotune or max autotune gemm is enabled
    6. A, B, and the output are 16B aligned
    7. We are not using dynamic shapes
    8. A is 2d
    9. B is 3d
    10. Offsets are provided
    11. Bias and Scale are not provided
FCUTEDSLr7   rd  )r(  c              3  8   #    U  H  n[        U5      v   M     g 7fr   
is_dynamicr  s     rW   r   3use_blackwell_cutedsl_grouped_mm.<locals>.<genexpr>       
1.Q:a==.r   T)rt  r  rf  re  r  r   r  rP   rG  r  ri   r"  r#  r[  r  )
mat_amat_br8  a_is_2db_is_2doffsbiasscale_resultre  r%  s
             rW    use_blackwell_cutedsl_grouped_mmr    s    2 !"" ++C&--$$%%'))^^$M 776#;#; u6:

15.
111g|<3ry   c                r   SSK Jn  UR                  R                  R	                  X-  U-  SS9nUS::  d  U[
        R                  R                  :  a  gSSKJ	n  [        R                  R                  (       a  g[        R                  [        R                  [        R                  /n[!        X5      =(       a9    [
        R"                  =(       d    [
        R$                  =(       a    ['        S5      nU(       a;  U" 5       (       d/  [(        R+                  S	[
        R                  R,                  5        gU$ )
Nr7   ro  r   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir %s is set correctly. Skipping CUTLASS backend for now.)rr  rp  rs  rt  	size_hintri   rH   cutlass_backend_min_gemm_sizecodegen.cuda.cutlass_utilsr  rP   r  r  r   rG  rQ  r  r"  r#  r  r   r  cutlass_dir)	r8  r  r$  r  rp  	gemm_sizer  r%  r   s	            rW   use_cutlass_templater    s      **1519r*BIA~V[[%N%NN> }} ]]ENNEKK@Mf4 	-  <F$<$<	-!),  !##KK4 ''	 Jry   _IntLikec                  ^
 [        5       (       d  g[        5       (       d  g[        S5      (       d  gSSKJm
  T
R
                  (       a  gU R                  R                  S:w  d  [        R                  R                  (       a  g[        R                  [        R                  /n[        X5      (       d  g[        R                  (       d  [        R                   (       d  g[#        S XE4 5       5      (       a  g[#        U
4S jXE4 5       5      (       a  gT
R$                  R&                  R)                  [*        R,                  " US-  S	5      5      (       d  gT
R$                  R&                  R)                  [*        R,                  " US-  S	5      5      (       d  gUR/                  5       nUR/                  5       nUR0                   HO  n	U	S:w  d  M  T
R$                  R&                  R)                  [*        R,                  " U	S-  S	5      5      (       a  MO    g   UR0                   HO  n	U	S:w  d  M  T
R$                  R&                  R)                  [*        R,                  " U	S-  S	5      5      (       a  MO    g   g
)a  
Returns True if we can use the NVIDIA Universal GEMM kernel for gemm.
Required conditions:
    1. NVGEMM backend is enabled
    2. cutlass_api is available
    3. We are on a NVIDIA GPU
    4. The dtype is fp16 or bf16
    5. Max autotune or max autotune gemm is enabled
    6. We are not using dynamic shapes
    7. A and B base pointers are 16B aligned
    8. n and k are divisible by 16
    9. Non-unit strides are divisible by 16
    10. Not in AOT Inductor mode (requires runtime JIT compilation)
FNVGEMMr7   ro  rH   c              3  8   #    U  H  n[        U5      v   M     g 7fr   r  r  s     rW   r   1use_nv_universal_gemm_template.<locals>.<genexpr>O  r  r   c              3  p   >#    U  H+  oR                  5       TR                  R                  ;   v   M-     g 7fr   )r  rs  r>  )r   r  rp  s     rW   r   r  R  s$     
Mn::<177444ns   36rq   r   T)rt  rx  r  rr  rp  aot_compilationr   r  rP   r  r  r   rG  r  ri   r"  r#  r  rs  rt  statically_known_truer|   Eqr5  rB  )r8  r  r$  r  r  r  r%  a_layoutb_layoutrB  rp  s             @rW   use_nv_universal_gemm_templater  %  s   " !""-// **}}V#u}}'8'8]]ENN3M 776#;#; 
15.
111

Mun
MMM 7711%((1r612EFF7711%((1r612EFF!H!H//Q;77##99%((6B;PQ:RSS "
 //Q;77##99%((6B;PQ:RSS "
 ry   c                    [         R                  R                  R                  5       nUS:X  a  gU R                  5       UR	                  S5       Vs/ s H  o"R                  5       PM     sn;   $ s  snf )z8Check if CUTLASS should be used for the given operation.ALLTrM  )ri   rH   cutlass_enabled_opsr   r]  rz  )op_nameenabled_opsrT   s      rW   _use_cutlass_for_opr  k  sY    ++11779Ke==?+2C2CC2HI2HQwwy2HIIIIs   A0r   c           
        SSK Jn  [        R                  R                  U-  nUR
                  R                  R                  [        R                  " [        R                  " X%U -  5      [        R                  " X%U-  5      5      5      =(       aa    UR
                  R                  (       + =(       a?    UR
                  R                  (       + =(       a    [        R                  R                  S:  $ )Nr   ro  )torch._inductor.virtualizedrp  ri   r  decompose_k_thresholdrs  rt  r  r|   AndGeaot_modecpp_wrappernum_decompose_k_splits)r  r$  r  threshold_multiplerp  r  s         rW   use_decompose_k_choicer  v  s     ."MM??BTT 	
..IIA56A56	
 	5    	5 ###	5 MM0014
ry   c           
        [         R                  R                  nSSKJn  [        [        R                  R                  5      =(       a    UR                  R                  R                  [        R                  " [        R                  " X#U -  5      [        R                  " X#U-  5      5      5      =(       a=    UR                  R                  (       + =(       a    UR                  R                   (       + $ )z
Check if we should use the contiguous subgraph transform.
This transform makes the second matrix contiguous before the matmul.
r   ro  )ri   rocmcontiguous_thresholdr  rp  r7  rP   r  r  rs  rt  r  r|   r  r  r  r  )r  r$  r  r  rp  s        rW   use_contiguousr    s     ";;;; . 	U]] 	$GG22II145145
	$    	$ ###
ry   c                6   [         R                  R                  n/ SQn[        U[        R
                  5      (       a  UR                  (       d  U$ US:X  a  / $ [        U [        R
                  5      (       a  U R                  (       a0  [        U[        R
                  5      (       a  UR                  (       d  SnO[        X -  X!-  5      nSn[        R                  " U5      nU Vs/ s H  nX::  d  M
  X:  d  M  UPM     nn/ / / pn	U H`  nX,-  nUS:  a  M  XS-
  -  S:X  a  US:  a  U	R                  U5        M3  US-  S:X  a  U
R                  U5        MO  UR                  U5        Mb     [         R                  S:X  a  X-   U-   $ X-   U-   nUS U $ s  snf )	N)rq   rF  rs   rr      r   r  r  rr   r7   rF  
EXHAUSTIVE)ri   r  r  r{   r|   r1  	is_numberr  divisorsr  max_autotune_gemm_search_space)r  r$  r  k_splits_limitdefault_k_splitsmax_k_splitmin_k_splitr  divisorpow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitss                  rW   get_k_splitsr    s    ]]99N .!UZZ  	1		1ejj!!!++1ejj!!!++!&!&)K~~a H  G! 	&-&< 	   =?B> 3; AI!#$$Q'RZ1_%%a( !!!$ " ,,< 5FF#8>IK''=s   (	F5F<Fc                T    [         R                  R                  U 5      R                  $ r   )rP   rH   r  gcnArchNamer   s    rW   _rocm_native_device_arch_namer    s    ::++F3???ry   c                      SS K n SSKJnJn  SSKJn  [        R                  R                  U R                  5      nXAX#4$ ! [         a    SS jnSS jn " S S5      nS n N&f = f)	Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationc                     / $ r   r   r   ry   rW   r  *try_import_ck_lib.<locals>.gen_ops_library      Iry   c                     / $ r   r   r   ry   rW   r  .try_import_ck_lib.<locals>.gen_ops_preselected  r  ry   c                      \ rS rSrSrg)*try_import_ck_lib.<locals>.CKGemmOperationi  r   N)r   r   r   r   r   r   ry   rW   r  r    s    ry   r  )r   r  )ck4inductor(ck4inductor.universal_gemm.gen_instancesr  r  ck4inductor.universal_gemm.opr  r  r  dirname__file__r  )r  r  r  r  package_dirnames        rW   try_import_ck_libr    sh    	
	
 ''//+*>*>? -@QQ  			 	 s   ;A  A$#A$c                J   [         R                  (       d  [         R                  (       d  g[        R                  R
                  (       d  gU R                  R                  S:w  a  g[        U R                  5      n[         R                  R                   Vs0 s H  o"R                  S5      S   U_M     sn=(       d    UR                  S5      S   U0nUR                  5       [         R                  R                  -   Vs/ s H  nX2   PM	     nnU(       d  gU R                  [        R                  [        R                   [        R"                  4;  a  g[%        5       u  n    nU(       d  [&        R)                  S5        gU[         R                  l        gs  snf s  snf )NFrH   :r   z,Please pip install Composable Kernel packageT)ri   r"  r#  rP   r  r  r   r  r  r  archr]  rU  ck_supported_archr   r   rG  rI  r  r   r  ck_dir)r8  native_archr  requested_archsrequested_supported_archsck_package_dirnamer   s          rW   use_ck_templater    s@   6#;#;==}}V# 0>K39;;3C3CD3Cawws|A)3CD #q!;IO
 !%%'&++*G*GG!GA 	G  ! %||EMM5>>5==II"3"51aBC+FKK+ E!s   FF c                    SSK Jn  [        S5      =(       a>    [        U 5      =(       a,    UR                  R
                  R                  X-  U-  SS9S:  $ )Nr7   ro  CKr   r  r   rr  rp  r  r  rs  rt  r  r8  r  r$  r  rp  s        rW   use_ck_gemm_templater  $	  sP     	d# 	CF#	CGG&&quqy2&>Bry   c                    SSK Jn  [        S5      =(       a>    [        U 5      =(       a,    UR                  R
                  R                  X-  U-  SS9S:  $ )Nr7   ro  CKTILEr   r  r   r  r  s        rW   use_ck_tile_gemm_templater  .	  sP     	h' 	CF#	CGG&&quqy2&>Bry   c                <    [        S5      =(       a    [        U 5      $ )Nr  )r  r  r8  s    rW   use_ck_conv_templater  8	  s    %d+G0GGry   c                    [         R                  =(       d    [         R                  =(       a    U R                  R                  S:H  $ r  )ri   r"  r#  r   r  r  s    rW   _use_template_for_cpur  <	  s2    7v77&
--


%&ry   c                   SSK Jn  [        UR                  U5      (       d   eUR                  R                  nUR                  R
                  n[        U 5      =(       al    UR                  5       [        R                  :H  =(       aD    [        U5      S:H  =(       a/    [        U5      S:H  =(       a    US   US   :H  =(       a    US   S:H  n[        XUSS9=(       a#    UR                  R                  5       =(       d    U$ )Nr7   )rA      r  F)require_constant_mat2)r  rA   r{   r8  r  rB  r  r=  rP   rI  rR   use_cpp_gemm_templateis_contiguous)r8  mat1mat2rA   	mat1_sizemat1_stridemat1_each_batch_is_contiguouss          rW   use_cpp_bmm_templater  B	  s     dkk6****
   I++$$Kf% 	"NN-	"^q 	" "	" ^y|+		"
 ^q  " !t5Q !!#D'Dry   c                   SSK Jn  SSKJn  SSKJn	  SSKJn
  [        U 5      (       a  [        S5      (       d  g[        R                  R                  (       d  gUR                  5       [        R                  [        R                   4;   n[        R"                  [        R$                  [        R&                  [        R                  [        R                   /nU
" UUU(       a  U R(                  OS UUS9u  ppp[+        X45      (       a  g[-        X'R.                  5      (       a  UR1                  5       nU	" UR                  5       5      u  nnU" S	UUUUR                  5       UR                  5       U[3        5       U(       + US
9
nSS jnU R(                  U;   =(       aT    US L=(       aI    U" U5      =(       a:    [-        X'R4                  5      =(       a    UR7                  5       =(       d    U(       + $ )Nr7   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtypemat2_transposeduse_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refq_group_sizec                N    U R                  5         U R                  5       S   S:H  $ )Nr   r7   )freeze_layoutr<  rT   s    rW   is_last_dim_stride12use_cpp_gemm_template.<locals>.is_last_dim_stride1	  s"    	||~b!Q&&ry   )rT   r@   r   r7  )r  r  codegen.cpp_micro_gemmr  codegen.cpp_utilsr  kernel.mm_commonr  r  r  ri   cppweight_prepackr=  rP   rX  rM  rI  rG  halfr   has_free_symbolsr{   BaseViewunwrap_viewparallel_num_threadsr3  is_module_buffer)r8  r  r  r  r  is_woq_int4r  r  r  r  r  	int8_gemmr%  r  r$  r  r  r   r  r  s                       rW   r  r  [	  s    9M) ((0Ee0L0L::$$ U[[%**$==I]]ENNEJJUZZXM")"+&,,'#A!T $$$!@AQROL!"			NN$^^%!(*!J'
 	% 	Cd"	C%	C t]]+	C ""$A,A(Ary   c                 ~    [         R                  =(       d    [         R                  (       + =(       d    [        S5      $ )NATEN)ri   r"  r#  r  r   ry   rW   use_aten_gemm_kernelsr#  	  s-    7v77 '	v	&'ry   c                  b    \ rS rSr% \R
                  " S5      rS\S'   S
S jrS
S jr	SS jr
Srg	)DebugDirManageri	  r   r  prev_debug_namec                @    [        [        R                  5      U l        g r   )r  r%  counterr   rc  s    rW   rP  DebugDirManager.__init__	  s    ../ry   c                    [         R                  R                  R                  U l        U R                   SU R
                   3U l        U R                  [         R                  R                  l        g )N_tmp_)rP   _dynamori   debug_dir_rootr&  r   new_namerc  s    rW   	__enter__DebugDirManager.__enter__	  sM    $}}33BB//0dggY?.2mm+ry   c                    [         R                  " U R                  5        U R                  [        R
                  R                  l        g r   )r  r  r.  r&  rP   r,  ri   r-  )r  r   s     rW   __exit__DebugDirManager.__exit__	  s*    dmm$.2.B.B+ry   )r   r.  r&  Nr  )r   r   r   r  )r   r   r   r   ry  rZ  r(  r   rP  r/  r2  r   r   ry   rW   r%  r%  	  s&    ooa G0<
Cry   r%  c                  ^ SSK Jn  [        5       mSU4S jjn[        R                  R                  USU5         [        R                  R                  5         U " U0 UD6nS S S 5        W[        T5      4$ ! , (       d  f       N= f)Nr7   r<   c                (   > TR                  U 5        g r   )rW  codesource_codess    rW   save_output_code*run_and_get_code.<locals>.save_output_code	  s    ry   r9  r7  r  r   r  )
rs  r=   r#   r   r  r  rP   r,  resetr_  )r   r   r  r=   r9  r  r8  s         @rW   run_and_get_coder=  	  so    
 %$.LL 
		=*<>N	OT$V$ 
P 4%%% 
P	Os   'A77
Bc                   UR                  SS5      n[        U /UQ70 UD6u  pE/ nU HU  nUR                  [        R                  " SU[        R
                  5      5        U(       d  MA  U Vs/ s H  oSS PM	     nnMW     XF4$ s  snf )Nremove_quoteFz	'''.*?'''r  )rS   r=  r!  r   findallDOTALL)	r   r   r  r?  r  r8  kernelsr7  r  s	            rW   run_and_get_kernelsrD  	  s     ::ne4L+B@@@FGrzz,bii@A<29:'a|'G:G  ? ;s   -Bc                *   ^  SU 4S jjn[        U5      $ )Nc                 R   > T" 5       n U R                  5       R                  5         U $ r   )r  r  )r  r   s    rW   run_with_backward1run_fw_bw_and_get_code.<locals>.run_with_backward	  s!    

ry   )r   r   )r=  )r   rG  s   ` rW   run_fw_bw_and_get_coderI  	  s    
 -..ry   c                t  ^^ SSK Jn  / mSU4S jjmS	U4S jjn[        R                  R	                  USU5         [        R                  R	                  UST5         [
        R                  R                  5         U " U0 UD6nSSS5        SSS5        T$ ! , (       d  f       N= f! , (       d  f       T$ = f)
zLGet the inductor-generated code, but skip any actual compilation or running.r7   r<   c                (   > TR                  U 5        g r   r  r6  s    rW   r9  "get_code.<locals>.save_output_code	  s    D!ry   c                   >  " S S5      nU R                   (       a  U R                  5       OU R                  5       u  p#T" UR                  5        U(       a  T" UR                  5        U" 5       $ )Nc                  ,    \ rS rSrSrSS jrSS jrSrg)	@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulei	  z4This is empty to replace the generated triton modulec                    g r   r   rc  s    rW   rP  Iget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__	  s    ry   c                    g r   r   r  s      rW   callEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.call	  s    ry   r   Nr  r   r   r  r   r   r  )r   r   r   r   r   rP  rT  r   r   ry   rW   DummyModulerP  	  s    Fry   rW  )r  codegen_with_cpp_wrappercodegenr   )r  rW  wrapper_codekernel_coder9  s       rW   patched_compile_to_module+get_code.<locals>.patched_compile_to_module	  s[    	 	 04/?/?D))+T\\^ 	" 	++,[../}ry   compile_to_moduler9  Nr;  )r  r=   r   r   )rs  r=   r   r  r  rP   r,  r<  )r   r   r  r=   r\  r   r9  r8  s         @@rW   get_coder_  	  s    $ L", 	

.0I	
 	

-);=MN	 	O	
  	ON	
 	
 s#   "B('BB(
B%	!B((
B7c                    [        U /UQ70 UD6nS[        U5      s=::  a  S::  d  O   S[        U5       35       eUS   $ Nr7   r  z%expected one or two code outputs got r   )r_  rR   )r   r   r  r8  s       rW   get_triton_coderb  
  sQ    B000LL!&Q& 
/L0A/BC& ?ry   c                    [        U /UQ70 UD6u  p4S[        U5      s=::  a  S::  d  O   S[        U5       35       eUS   $ ra  )r=  rR   )r   r   r  r   r8  s        rW   run_and_get_triton_coderd  
  sU     'r;D;F;OAL!&Q& 
/L0A/BC& ?ry   c                   ^^^ SSK Jm  SSKJn  UR                  m/ mSUUU4S jjn[
        R                  R                  USU5         U " U0 UD6nS S S 5        UT4$ ! , (       d  f       WT4$ = f)Nr   r<   rD   c                 h   > T" U 0 UD6  U S   n[        UT5      (       d   eTR                  U5        g )Nr  )r{   r  )r   r  rs  r=   graph_lowerings	real_inits      rW   	fake_init-run_and_get_graph_lowering.<locals>.fake_init%
  s:    4"6"Q%////u%ry   rP  rV  )torch._inductor.graphr=   torch._inductor.output_coderE   rP  r   r  r  )	r   r   r  rE   ri  r  r=   rg  rh  s	         @@@rW   run_and_get_graph_loweringrm  
  sv     4;((IO& & 
		?J		BT$V$ 
C ?"" 
C	B ?""s   		A
A/c              #     #    SSK Jn  UR                  U    n [        R                  " X5      UR                  U '   Sv   X2R                  U '   g! X2R                  U '   f = f7f)zs
Override the lowering of aten_op with override_fn.
The first argument of override_fn is the original lowering fn.
r   )loweringN)torch._inductorro  	loweringsr  partial)aten_opoverride_fnro  orig_fns       rW   override_loweringrv  1
  sY      )  )G.&/&7&7&M7#&-7#g7#s   A"'A  A"AA"c                   ^ ^^ SSK Jn  UR                  mSUUU 4S jjn[        R                  R
                  R                  USU5      $ )zf
Add hook functions to be called at the beginning and end of Scheduler.__init__.
Used for unit tests.
r   )	Schedulerc                F   > T" X5        T" X5      nT(       a  T" X5        U$ r   r   )r  rQ  outru  post_fnpre_fns      rW   r  (add_scheduler_init_hook.<locals>.wrapperN
  s%    y i'I%
ry   rP  )r  r   rQ  r   r   r   )torch._inductor.schedulerrx  rP  unittestr   r  r  )r|  r{  rx  r  ru  s   ``  @rW   add_scheduler_init_hookr  C
  s>     4  G  ==%%iWEEry   c                    [         R                  (       a  [        R                  U 5        g[        R	                  U 5        g)z
Warnings that will be actionable for PyTorch developers, but not
end users.  Allows us to easily disable them in stable releases but
keep them on for nightly builds.
N)ri   developer_warningsr   r  info)msgs    rW   developer_warningr  X
  s$       Cry   c                     [         R                  R                  S5      n U S-   [        [         R                  5      :  aV  [        [         R                  U S-      5      S:  a3  [         R                  U S-      S   S:w  a  [         R                  U S-      $ [         R                   H)  nUR                  S5      (       d  M  U[        S5      S s  $    g! [         a     NJf = f)a  
An experimental API used only when config.benchmark_kernel is true.

The benchmark name is only available at codegen time. So we can not
directly call it in benchmark_all_kernels which is run after codegen.

The function assumes the argument after --only is the benchmark name.
It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
scripts, this function may return None.

There are 2 flavors of --only argument we need handle:
1. --only model_name
2. --only=model_name
z--onlyr7   r   r  z--only=N)r  argvr  rR   
ValueErrorr  )r^  r  s     rW   get_benchmark_namer  d
  s    	hhnnX&!Gc#((m#CHHS1W%&*q!!$+88C!G$$ xx>>)$$s9~'((    s   BC 
C"!C"c                &    [        S U  5       5      $ )Nc              3  *   #    U  H	  oS :H  v   M     g7fr7   Nr   r  s     rW   r   is_ones.<locals>.<genexpr>
       %u!Avu   r   rV  s    rW   is_onesr  
      %u%%%ry   c                &    [        S U  5       5      $ )Nc              3  *   #    U  H	  oS :H  v   M     g7f)r   Nr   r  s     rW   r   is_zeros.<locals>.<genexpr>
  r  r  r  r  s    rW   is_zerosr  
  r  ry   c                &    [        S U  5       5      $ )Nc              3     #    U  HI  n[        U[        R                  5      (       d  M$  UR                  [        R                  " S 5      :H  v   MK     g7f)r  N)r{   rP   r  r   )r   r   s     rW   r    is_cpu_device.<locals>.<genexpr>
  s9      DdELL) 	+u||E**s
   #A*Ar  )inputss    rW   is_cpu_devicer  
  s       ry   c                    [        U [        R                  5      (       d   S5       eU R                  (       a  [        R
                  $ [        R                  $ )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)r{   r|   r1  r   rP   rS  rK  rk  s    rW   get_sympy_Expr_dtyper  
  s@    c5::&& B& ~~{{}}ry   c              /     #    U (       a.  [         R                  R                  " U0 UD6 nUv   S S S 5        g S v   g ! , (       d  f       g = f7fr   )rP   r   r   )should_profiler   r  r   s       rW   maybe_profiler  
  s;     ^^##T4V4G 54 	 54s   (A=A
AAc                 p    [         R                  R                  n U S:  a  [        R                  " 5       n U $ Nr7   )ri   r  threadsrP   get_num_threads)r  s    rW   r  r  
  s+    jj  G{'')Nry   c                     SSK Jn   U " 5       nUR                  S[        R                  R
                  (       a  S5      $ S5      $ )Nr7   )get_backend_options
num_stagesr  r  )runtime.triton_helpersr  rS  rP   r  r  )r  optionss     rW   get_backend_num_stagesr  
  s2    ;!#G;;|%--*;*;QCCCCry   c                L   [        U [        R                  R                  R                  R
                  S9nUb  U$ SSKJnJn  [        R                  R                  5       =(       a!    [        R                  R                  5       S:  nU [        R                  [        R                  [        R                  4;   d   e[        R                  " U5      R                   R#                  S5      (       a  SSKJn  U" 5       nU [        R                  [        R                  4;   a  U(       a  U" X5      $ [        R                  R                  R                  R
                  (       a  U" [        R                  U5      $ U" [        R                  U5      $ U [        R                  [        R                  4;   a  U(       a  U" U 5      $ [        R                  R                  R                  R
                  (       a  U" [        R                  5      $ U" [        R                  5      $ )z
We don't want to throw errors in this function. First check to see if the device is in device_info.py,
then fall back to the inaccurate triton estimation.
)is_tf32r   )get_max_simd_tflopsget_max_tensorcore_tflops)rt   r   
clock_rate)max_clock_rate)r   rP   backendsrH   matmul
allow_tf32triton.testingr  r  rQ   get_device_capabilityr   rG  rI  inspect	signature
parametersrS  torch._utils_internalr  )r   ds_topsr  r  SM80OrLaterr  sm_clocks          rW   get_device_tflopsr  
  sl    UENN,?,?,F,F,Q,QRGM**))+ 

0P0P0R W 1K
 U]]ENNEMMBBBB,-88<<\JJ8!#U]]ENN33,U==>>%%00,U]]HEE&u}}h??U]]ENN33,U33>>%%00,U]];;&u}}55ry   c                     SSK Jn   U " 5       $ )Nr   get_dram_gbps)r  r  r  s    rW   get_gpu_dram_gbpsr  
  s    ,?ry   c                 x    SSK Jn   U R                  R                  R	                  S5      R                  SS5      $ )Nr   r  max_shared_mem)triton.runtimer  r  r  r  rS  r  s    rW   get_gpu_shared_memoryr  
  s.    %==44Q7;;<LaPPry   c                     [         R                  R                  5       (       aT  [         R                  R                  5       R                  n [         R                  R                  5       R
                  nX-  $ Sn SnX-  $ )NrF  i   )rP   rH   rQ   r  	warp_sizemax_threads_per_block)r  r  s     rW   get_max_numwarpsr  
  sg    zz  JJ446@@	 %

 @ @ B X X
 !-- 	 $ --ry   c                $    U R                  S5      $ )Nwelford)r  reduction_types    rW   is_welford_reductionr  
  s    $$Y//ry   c                4    [        U 5      (       a  gU S:X  a  gg)Nr  online_softmax_reducer  r7   )r  r  s    rW   reduction_num_outputsr  
  s    N++	2	2ry   c                 2    [         R                  " 5       S:H  $ )NLinux)platformsystemr   ry   rW   is_linuxr    s    ??''ry   c                 (    [         R                  S:H  $ )Nrk   )r  r  r   ry   rW   r  r  	  s    <<7""ry   c                &    [        S U  5       5      $ )Nc              3     #    U  H7  n[        U[        R                  5      =(       a    UR                  (       + v   M9     g 7fr   )r{   r|   r1  r  r  s     rW   r   #has_free_symbols.<locals>.<genexpr>  s)     Jcz!UZZ(<_<cs   ?Ar  )itrs    rW   r  r    s    JcJJJry   c            	        SSK Jn  U  H  n[        X!R                  UR                  UR
                  UR                  UR                  45      (       aR  [        UR                  5       =(       d    S5      (       d'  [        UR                  5       =(       d    S5      (       a    gM  [        X!R                  5      (       d  M  [        S[        U5       35      e   g)Nr7   r  r   Tzunexpected type for is_dynamic F)r  r  r{   r1  r3  r  rs  r>   r  maybe_get_sizemaybe_get_strider@   	TypeErrorr  )r   r  ts      rW   r  r    s    bmmR[[":K:KRYYW
 
   0 0 2 8b99=M""$*> > > Ayy))=d1gYGHH  ry   c                      \ rS rSrSrSrSrg)Placeholderi%  KERNEL_NAMEDESCRIPTIVE_NAMEr   N)r   r   r   r   r  r  r   r   ry   rW   r  r  %  s      K *ry   r  c                v   SSK Jn  [        R                  " SSS9 n[        R
                  " 5       n[        R
                  " 5       n[        U[        U5      S9R                  " U6   [        SUR                   3US9  [        UR                  US9  [        R                  " 5       n[        X5         U " UR                  5        S S S 5        [        R                  " 5       U-
  n	U" UR                  5        UR                  R                  5         UR                  5         [        S	UR                   3US9  [        UR                  US9  UR!                  5       UR!                  5       :H  n
["        R%                  S
UUR&                  U
U	5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f)Nr7   )stable_topological_sortrh  zutf-8)modeencoding)r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr  r  NamedTemporaryFileior   r^   rZ   	propagater  rs  r   nowr]   lint	recompiler[  r   r  r   )r  r  inpr  r  r#  	before_ioafter_io
start_timetime_elapsedr  s              rW   pass_execution_and_saver  /  sE    9		$	$
 
KKM	;;=R#3C#89CCSI	"(($1-bhhY'\\^
#B,N -||~
2)


#!,bhhX& H$5$5$77hFF	
+
 
 -,
 
s%   BF*2FCF*
F'	#F**
F8c                    SSK Jn  [        XR                  5      =(       a     [        U R                  UR
                  5      $ )z:
Check if input buffer is a multi-outputs template buffer
r7   r  )r  r  r{   CppTemplateBufferr8  MultiOutputLayout	input_bufr  s     rW   is_multi_outputs_templater  R  s7     i!5!56 :"..< ry   c                    SSK Jn  [        XR                  5      =(       a7    [	        U R
                  5      S:H  =(       a    [        U R
                  S   5      $ )zD
Check if input buffer is a output of multi-outputs template buffer
r7   r  r   )r  r  r{   MultiOutputrR   r  r  r  s     rW   #is_output_of_multi_outputs_templater  ]  sJ      	9nn- 	;	  !Q&	;%i&6&6q&9:ry   c                   U c  gSSK Jn  [        XR                  5      =(       a:    [        XR                  5      (       + =(       a    US L =(       d    U R
                  UL =(       Gd_    [        U 5      UR                  L =(       Ga@    [        [        R                  R                  S5      =(       a;    U R
                  [        R                  R                  R                  R                  :H  =(       d    [        [        R                  R                  S5      =(       a;    U R
                  [        R                  R                  R                  R                  :H  =(       df    [        [        R                  R                  S5      =(       a;    U R
                  [        R                  R                  R                  R                  :H  $ )NFr7   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  r{   _CollectiveKernel_WaitKernelop_overloadr  FallbackKernelr  rP   r  torchrecr  defaultr  r  ri  r  r  s      rW   is_collectiver  l  sM    | 	4--. 	3400	34Z14++r1  	T
b''' 	
 	

 		**,?@ U$$		(:(:(L(L(T(TT
 		**,DE E$$99%%<<DDE 		**,CD Y$$		(:(:(P(P(X(XX/ry   c                <    SSK Jn  [        U 5      UR                  L $ Nr7   r  )r  r  r  r  )ri  r  s     rW   is_waitr    s    :''ry   c                    SSK Jn  [        X5      (       a  [        S U R                   5       5      $ [        U R                  5      =(       a    US L =(       d    U" U 5      $ )Nr   GroupedSchedulerNodec              3  8   #    U  H  n[        U5      v   M     g 7fr   )contains_collectiver  s     rW   r   &contains_collective.<locals>.<genexpr>  s     @<a&q))<r   )r~  r  r{   r  snodesr  ri  )snode	filter_fnr  s      rW   r  r    sJ     ?%..@5<<@@@$P)t*;*Oy?OPry   c                    SSK Jn  [        X5      (       a  [        S U R                   5       5      $ [        U R                  5      $ )Nr   r  c              3  8   #    U  H  n[        U5      v   M     g 7fr   )contains_waitr  s     rW   r    contains_wait.<locals>.<genexpr>  s     :\=##\r   )r~  r  r{   r  r  r  ri  )r  r  s     rW   r  r    s4    >%..:U\\:::uzz""ry   c                    SSK Jn  [        U[        R                  R
                  5      (       a  U/n[        XR                  5      =(       a    U R                  U;   $ r  )r  r  r{   rP   r  r  r
  r	  r  s      rW   is_fallback_opr     sF     "ejj++,,Td--.I43C3Cr3IIry   c                @    X!U    R                   R                  5          $ r   )defining_opr  )buf_namename_to_bufname_to_fused_nodes      rW   buf_name_to_fused_snoder&    s!     (3??HHJKKry   c                    gr  r   r  s    rW   r  r        ury   c           	         U" U 5      (       a  g UR                  U 5        U R                   H-  n[        UR                  X#5      nXa;   a  M   [	        UUUUUS9  M/     g )Ncriteria_cb)rW  unmet_dependenciesr&  r   find_recursive_deps_of_node)r  collected_node_setr$  r%  r,  depdefining_op_for_deps          rW   r.  r.    sf     55!''5HHk
 4##	
 (ry   c                    gr  r   r(  s    rW   r  r    r)  ry   c           
        U" U 5      (       a  g UR                  U 5        U R                  5        H  nUR                   H  nUR                  c   eUR                  R	                  5       S:X  a  M2  UR                  R	                  5       U;  a  MR  X6R                  R	                  5          nXq;   a  Mu  [        UUUUUS9  M     M     g )NOUTPUTr+  )rW  get_outputsr  ri  r  find_recursive_users_of_node)r  r/  r$  r%  r,  oro  user_ops           rW   r6  r6    s     55! GGD99(((yy!!#x/yy!!#+==(););)=>G,(""'  !ry   c                j    [         R                  R                  R                  (       a  SOSnX-
  U-
  $ )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r  r   )rP   
_functorchri   functionalize_rng_ops)dynamo_gm_num_inputsaot_fw_gm_num_inputsnum_rng_seed_offset_inputss      rW   num_fw_fixed_argumentsr?    s3     $$::   69SSSry   c                   SS jnSn/ nU R                   R                   H8  nUR                  S:X  d  M  U" U5      (       a  UR                  U5        US-  nM:     U[	        [        [        U5      5      5      :X  d   e[        U5      $ )z6
Infers which inputs are static for a backwards graph
c                    SU R                   ;  =(       a;    SU R                   ;  =(       a%    SU R                   ;  =(       a    SU R                   ;  $ )Ntangentsbwd_seedbwd_base_offsetbwd_rng_stater  r  s    rW   is_saved_tensor'count_tangents.<locals>.is_saved_tensor  sH    aff$ .!&&(.!/.  qvv-		
ry   r   r  r7   )rT   r5   r   r7  )rs  rQ  r  r  r_  r   rR   )fx_grF  	arg_countstatic_arg_idxsr$  s        rW   count_tangentsrK    s    

 IOZZ44= q!!&&y1NI	  d5_)=#>????ry   c                  >    \ rS rSr% S\S'   SS jr\S	S j5       rSrg)
	BoxedBooli  r7  r   c                    U R                   $ r   )r   rc  s    rW   rn  BoxedBool.__bool__  s    zzry   c                @    [        U [        5      (       a	  SU l        U $ gr  )r{   rM  r   r  s    rW   disableBoxedBool.disable  s    c9%%CIJry   r   Nr  )r  r   r   zUnion[BoxedBool, bool])	r   r   r   r   r   rn  r  rQ  r   r   ry   rW   rM  rM    s     K  ry   rM  c              #     ^ ^#    SSK Jn  UR                  m   S             SU U4S jjjn[        R                  R                  USU5         S v   S S S 5        g ! , (       d  f       g = f7f)Nr7   r9   c                :   > TR                  U5        T" XX#XE5      $ r   rL  )r  kernel_namer[  ra  gpucpp_definitionkernel_listorig_define_kernels         rW   define_kernel.collect_defined_kernels.<locals>.define_kernel-  s'     	;'!{c
 	
ry   rZ  )NTN)r  r:   rU  r  r[  r  ra  Optional[str]rV  r7  rW  r\  r   r   )codegen.wrapperr:   rZ  r   r  r  )rX  r:   rZ  rY  s   `  @rW   collect_defined_kernelsr^  '  s     5-;; #'(,
"

 
  	

 
 &
 

 
 
		/-	P 
Q	P	Ps   AA2A!	A2!
A/+A2c                    U S-   $ )N__original__r   r  s    rW    get_cloned_parameter_buffer_namera  >  s    .  ry   c                    U [         ;   $ r   )rN   r  s    rW   r  r  B  s    Yry   c                0    U S:g  =(       a    [        U 5      $ )NrI   )r  r  s    rW   device_need_guardrd  F  s    U?-vf~-ry   c                h   U [         R                  :X  aD  [         R                  R                  5       (       a!  [         R                  R	                  5       S:  $ U [         R                  :X  a$  [         R
                  R                  5       (       a  gU [         R                  [         R                  4;   $ )N)r  r   T)rP   rG  rH   rQ   r  rJ   rS  r7  r?  s    rW   ,needs_fallback_due_to_atomic_add_limitationsrf  J  sq    5::#:#:#<#<zz//1F::	%..	 UYY%;%;%=%=ejj111ry   c                   U R                   [        R                  R                  R                  [        R                  R                  R
                  4;   a  Uc  gU R                   [        R                  R                  R                  :X  a  SOSnUS U4;  =(       Gd&    U=(       a    [        U5      =(       a    [        U5      =(       d    U R                   [        R                  R                  R                  :H  =(       ap    US:H  =(       ad    U=(       a[    US:H  =(       aO    [        R                  R                  =(       a.    [        R                  R                  =(       d    [        5       S:g  =(       dJ    X:H  =(       a#    U[        R                  [        R                  4;   =(       d    [        R                   " 5       $ )NFrW  r  r  r7   )overloadpacketrP   r  atenscatter_reduce_scatter_reducescatter_r  rf  ri   r  fallback_scatter_reduce_sumdynamic_threadsr  r7  rS  $are_deterministic_algorithms_enabled)r	  r  
self_dtype	src_dtypesrc_device_typesrc_is_tensor	reduce_tys          rW   use_scatter_fallbackru  S  s]    	""IINN**EIINN,I,IJ	K" ++uyy~~/F/FFE 
 	tY// 	8 	8 H'H<YG		8 &&%))..*H*HH L%'LL  5(L 

66	L
 ++J/C/E/J	8 'SJ5::u{{:S,S	8 557!ry   c                   SSK JnJn  SSKJn  [        S[        U 5       S35        [        U 5       GH.  u  pE[        SUS S35        XRL a  [        S	5        M'  XQL a  [        S
5        M8  [        XS5      (       a  UR                  5       n[        U(       a  SOS S35        U(       a;  UR                  c   e[        SUR                  R                  R                   35        [        S5        UR                  R                   H  n[        U5        M     [        S5        UR                  R                   H  n[        U5        M     GM  [!        S[#        U5       35      e   g)z
An API that can be used in pdb to dump a node_schedule.
Right mainly dump the read/write dependencies but can add more as needed.
r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr.  3r  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdrw  rx  r~  ry  r  rR   r   r{   is_reductionri  r2  reduction_hintrH  rI  rJ  r   r  )r  rw  rx  ry  r^  ri  is_redr0  s           rW   dump_node_scheduler  z  s&   
 O7	M 236
:;}-	#al"$%%%&,,&&(FfU$/?@yy,,,01N1N0OPQ*''--c
 .+''..c
 / !9$t*FGG' .ry   c                z    SSK Jn  U" U R                  5       [        U R                  5      -  [
        -  S:H  5      $ )Nr   )r  )r  r  storage_offsetrA  r   GPU_ALIGN_BYTES)r   r  s     rW   tensor_is_alignedr    s:     L 				 >&,,#?	??RVWW ry   c                    [        U R                  R                  5      (       d  g[        R                  =(       d    [        U 5      $ r  )r  r   r  ri   assume_aligned_inputsr  )example_inputs    rW   should_assume_input_alignedr    s5     -&&++,,''K+<]+KKry   c                 X   [         R                  R                  R                  5       n U (       d  [        R
                  " 5       $ U R                  (       a  U R                  R                  (       d  [        R
                  " 5       $ U R                  R                  nUR                  5       $ r   )	rP   _guardsTracingContexttry_getr  nullcontextr  ru  suppress_guards)tracing_contextru  s     rW   #maybe_get_suppress_shape_guards_ctxr    sv    
 mm22::<O%%'' $$O,E,E,O,O%%''))33I$$&&ry   c                "   [         R                  R                  R                  [        SS5         [
        R                  R                  5         SS KnSS K	nUR                  " 5       nUR                  " U5      nSSKJn  UR                  U5        UR                  nUR!                  UR"                  5        U " U0 UD6n	UR%                  5       n
UR!                  U5        UR'                  U5        S S S 5        X4$ ! , (       d  f       W	W
4$ = f)Nr   Tr   )output_code_log)r  r   r  r  ri   rP   r,  r<  r  loggingr   StreamHandlertorch._inductor.codecacher  
addHandlerlevelsetLevelDEBUGr[  removeHandler)r   r   r  r  r  log_capture_stringchr  
prev_levelr  r   s              rW   run_and_get_cpp_coder    s     
			#	#FGT	:[[]""#56=""2&$**
  /T$V$'')  ,%%b) 
;  9! 
;	:  19s   CC==
Dc                :   [        U 5      nUb  UR                  $ U  H  n[        U[        R                  5      (       a  UR
                  R                  s  $ [        U[        R                  5      (       d  M[  UR                  5        H<  n[        U[        R                  5      (       d  M$  UR
                  R                  s  s  $    UR                  5        H<  n[        U[        R                  5      (       d  M$  UR
                  R                  s  s  $    M     g r   )	rZ   ru  r{   rP   r2   ri  r  r  rB  )r  r  inputr  rB  s        rW   shape_env_from_inputsr    s     (I """ eU\\**::''' eU\\**

dELL1199... %  ,,.fell33!;;000 )  ry   c                B   ^ ^^ [        T5      S:X  a  T $ SUU U4S jjnU$ )Nr   c                   > [        U TT5      u  pT" U 5      n[        U5      (       a  [        R                  " X5        U$ r   )copy_misaligned_inputsrR   rP   _foreach_copy_)
new_inputsold_tensorsnew_tensorsrz  inputs_to_checkr  mutated_input_idxss       rW   r  )align_inputs_from_check_idxs.<locals>.run  sD    #9);$
  J {  :
ry   )r  list[InputType]r   r   )rR   )r  r  r  r  s   ``` rW   align_inputs_from_check_idxsr    s(    
 ?q   Jry   c                X   SU R                  5       ;   a  SnO;[        S [        U R                  5       U R                  5       5       5       5      S-   n[        R
                  " X4S5      R                  5       n[        R
                  " X R                  5       U R                  5       5      $ )Nr   c              3  6   #    U  H  u  pUS -
  U-  v   M     g7fr  r   )r   r=  rB  s      rW   r   )clone_preserve_strides.<locals>.<genexpr>  s     T:Sf$:Ss   r7   r   )r  r  r   rB  rP   
as_stridedclone)rT   needed_sizer6  s      rW   clone_preserve_stridesr    s    AFFH} T#affh
:STTWXX 	 a6<<>FFFFHahhj99ry   c                T   / n/ nUSLnU H  nX   n[        U[        R                  5      (       d   S[        U5       35       eUR	                  5       [
        -  (       d  MW  [        U5      X'   U(       d  Mm  Xb;   d  Mt  UR                  U5        UR                  X   5        M     X44$ )z
Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
cloned tensor which is in `return_pair_idxs`.
Nz Expected tensors only, but got: )r{   rP   r  r  data_ptr	ALIGNMENTr  r  )r  check_inputs_idxsreturn_pair_idxsr  r  ret_pair_definedr   _inps           rW   r  r    s     ')K&(K (t3}$-- 	
.tDzl;	
- ==?Y&&248JMA$9""4("":=1  ##ry   c                    / nU HV  nX   n[        U[        R                  5      (       d  M(  UR                  5       [        -  S:X  d  ME  UR                  U5        MX     [        U5      [        U5      :w  a  U$ U$ )zO
We require all inputs to be aligned, so introduce a copy for any
that aren't.
r   )r{   rP   r  r  r  r  rR   )r  static_input_idxsaligned_static_input_idxsr^  r  s        rW   remove_unaligned_input_idxsr  8  sp     !# eU\\**0@90LQR/R%,,S1 ! $%->)??((ry   c                   SSK Jn  [        R                  " [        R                  5      R
                  nUR                  R                  R                  nUR                  R                  R                  R                  n[        R                  (       a&  UR                  R                  R                  X5        gUR                  R                  R                  X:*  5      (       a  gUR                  (       a.  UR                  R                  R                  U S:  5      (       a  gU" U 5      =(       a    U" U 5      U:*  $ )Nr7   ro  Tg@xDF)rr  rp  rP   iinforQ  r   rs  rt  r  ru  has_hintri   assume_32bit_indexing	check_leqr  r  )r   rp  int_maxr  r  s        rW   expr_fits_within_32bitr  J  s    kk%++&**G  **Iww))22H##	""1. 	ww--al;; 	 7711!d(;;  A;29Q<722ry   c                6  ^^^ [         R                  R                  R                  5       nUb  UR                  b  [        UR                  5      S:X  d   e[        U 5      mUR                  c   eUR                   H  nUc  UR                  R                  S 5        M#  Sm[         R                  R                  R                  5       =n(       a  UR                  mSUU4S jjmUR                  R                  [        U4S jU 5       5      5        M     g g g )Nr   Fc                r   > Tc  [        U 5      $ T(       a  TR                  U 5      $ TR                  U 5      $ r   )r   deserialize_symexprevaluate_symexpr)r   fakify_first_callru  s    rW   map_expr4set_tracing_context_output_strides.<locals>.map_expr  s7     ("1v((<<Q??$55a88ry   c              3  4   >#    U  H  nT" U5      v   M     g 7fr   r   )r   r   r  s     rW   r   5set_tracing_context_output_strides.<locals>.<genexpr>  s     5u!(1++urT  )r   r   r   z,Union[float, int, SymInt, SymFloat, SymBool])
rP   r  r  r  output_stridesrR   r  r  r  r  )r  compiled_graphrD  r<  r  r  r  ru  s        @@@rW   "set_tracing_context_output_stridesr  q  s     mm**224Gw55A7))*a///).9	,,888#22E}&&--d3$)!--66>>@@3@(+(=(=%9 9 &&--5u55 3	  Bry   c                 4   [         R                  b  [         R                  $ [         R                  " 5       (       d  g[        R                  R                  5       (       a  g SSKJn   U [        R                  R                  S5      :  $ ! [         a     gf = f)NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
ri   fx_graph_remote_cache	is_fbcoderP   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorjustknobs_getval_intr  s    rW    should_use_remote_fx_graph_cacher    s    ##/+++,,..H  5#8#8#M#M8$    s   "B
 

BBc                2    [         R                  " SSU 5      $ )Nz[^a-zA-Z0-9_]r   )r   subr  s    rW   normalize_namer    s    66"C..ry   ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2z^.*[.]c                j    [         R                  S[        U 5      5      n[        R	                  X5      $ )z"Convert torch.dtype to triton typetl.)_triton_type_rer  r  _triton_type_mappingrS  )r   triton_type_names     rW   triton_typer    s+    &**5#e*=##$4GGry   c                    [         R                  X 5      nUR                  SS5      n[        [        U5      n[        U[        R                  5      (       d   eU$ )Nr  r  )_torch_triton_mappingrS  r  rO   rP   r{   r   )r   adjusted_type	type_namer  s       rW   triton_type_to_torchr    sM    )--e;M%%eR0Iy)Ii----ry   c                   U R                   (       + =(       a    U R                  5       UR                  5       :H  =(       a    U R                  5       UR                  5       :H  =(       a    U R                  UR                  :H  =(       a    U R                  UR                  :H  =(       ae    U R                  5       R                  5       UR                  5       R                  5       :H  =(       a!    U R                  5       UR                  5       :H  $ r   )	is_mkldnnr  rB  r   r   untyped_storager  r  r2  r   s     rW   is_same_tensorr    s    NN 	<IIK5::<'	<KKMU\\^+	< JJ%++%	< KK5<<'		<
   "++-1F1F1H1Q1Q1SS	< !U%9%9%;;ry   c                   U R                   =(       a    U R                  5       UR                  5       :H  =(       a    U R                  UR                  :H  =(       as    U R                  UR                  :H  =(       aS    [        R
                  R                  R                  U 5      [        R
                  R                  R                  U5      :H  $ r   )r  r  r   r   rP   r  mkldnnr  r  s     rW   is_same_mkldnn_tensorr    s     	PIIK5::<'	PJJ%++%	P KK5<<'	P II%%d+uyy/?/?/H/H/OOry   c                     g)N)r  isnanlogical_notlogical_andsignbitand_leltgegteqner	  xorr   r   ry   rW   boolean_opsr    s    ry   c                  *    \ rS rSr% S\S'   S\S'   Srg)OpDtypeRulei  r3   type_promotion_kindOptional[torch.dtype]override_return_dtyper   NrE  r   ry   rW   r  r    s    8800ry   r  zdict[str, OpDtypeRule]op_dtype_propagation_rulesc                (    [        X5      [        U '   g r   )r  r  )r   r  r  s      rW   #register_op_dtype_propagation_rulesr
    s    
 (3(t$ry   zOrderedSet[str]op_requires_libdevice_fp64c                .    [         R                  U 5        g r   )r  rW  r  s    rW   #register_op_requires_libdevice_fp64r    s    ""4(ry   c                    SSK Jn  U (       d$  UR                  R                  5       R                  n U S:X  a  [
        R                  $ U S:X  a  gU S:X  a  [
        R                  $ [
        R                  $ )Nr   ro  r  rI   rJ   )	r  rp  rs  get_current_device_or_throwr  ri   cpu_backendxpu_backendcuda_backend)r   rp  s     rW   get_current_backendr  	  s_    -gg99;@@e!!!				!!!"""ry   c                    U [         R                  [         R                  4;   a=  [        R                  R
                  (       a  [        5       S:X  a  [         R                  $ U $ )z"Maybe upcast [b]float16 to float32r  )rP   r   rG  ri   r  codegen_upcast_to_fp32r  rI  r?  s    rW   upcast_compute_typer    s@     	%--00MM00!X-}}Lry   KeyTypeValTypec                  v    \ rS rSrSrSS jrSS jrSS jrSS jrSSS jjr	SS	 jr
SS
 jrSS jrSS jrSrg)
ScopedDicti'  z
A dictionary-like object that allows for scoped updates. It maintains
an original dictionary and a set of new items that can override
the original items within the scope.  The original dictionary is
unmodified.
c                    Xl         0 U l        g r   original_dict	new_items)r  r  s     rW   rP  ScopedDict.__init__/  s    *13ry   c                \    XR                   ;   a  U R                   U   $ U R                  U   $ r   r  r  r  s     rW   r(  ScopedDict.__getitem__3  s,    .. >>#&&!!#&&ry   c                     X R                   U'   g r   )r  )r  r`  r   s      rW   __setitem__ScopedDict.__setitem__8  s    #sry   c                H    XR                   ;   =(       d    XR                  ;   $ r   r!  r  s     rW   __contains__ScopedDict.__contains__;  s    nn$A/A/A(AAry   Nc                t    XR                   ;   a  U R                   U   $ U R                  R                  X5      $ r   )r  r  rS  )r  r`  r  s      rW   rS  ScopedDict.get>  s2    .. >>#&&!!%%c33ry   c                    [        U R                  5      nU R                   H  nX R                  ;  d  M  US-  nM     U$ r  )rR   r  r  )r  r$  r  s      rW   r  ScopedDict.__len__C  s<    ""#A***Q   ry   c              #     #    U R                    S h  vN   U R                   H  nXR                   ;  d  M  Uv   M     g  N-7fr   r  )r  r  s     rW   __iter__ScopedDict.__iter__J  s8     %%%%A***   	&s   AA  A
Ac                R    [        U R                  =(       d    U R                  5      $ r   )r7  r  r  rc  s    rW   rn  ScopedDict.__bool__P  s    D&&8$..99ry   c                    [         er   r  r  s     rW   __delitem__ScopedDict.__delitem__S  s    !!ry   r!  )r  Mapping[KeyType, ValType])r`  r  r   r  )r`  r  r   r  r   r  )r`  r  r   r7  r   )r`  r  r  Optional[ValType]r   r6  r  )r   zIterator[KeyType]r  )r`  r  r   r  )r   r   r   r   r   rP  r(  r$  r'  rS  r  r.  rn  r3  r   r   ry   rW   r  r  '  s5    4'
$B4
:"ry   r  )frozen_defaultc              .   ^ SU4S jjnU c  U$ U" U 5      $ )Nc                0   > [         R                  " U STS9$ )NT)kw_onlyr   )dataclasses	dataclass)r   r   s    rW   wrapir_dataclass.<locals>.wrapY  s    $$S$vFFry   )r   rl   r   rl   r   )r   r   r=  s    ` rW   ir_dataclassr?  W  s    G {9ry   c                     [         R                  R                  R                  5       n U b'  U R                  (       a  U R                  R
                  $ g r   )rP   r  r  r  fw_metadatabw_donated_idxs)r  s    rW   get_donated_idxsrC  a  s=    mm22::<O"'B'B**:::ry   c                  (    \ rS rSrSrSrSrSrSrSr	g)	TritonAttrsDescriptorVersionih  r   r7   r  r  rK  r   N)
r   r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTr   r   ry   rW   rE  rE  h  s     LKK	  Gry   rE  c                 f   [         R                  R                  S5      c  [        R                  $ SS Kn SS Kn [        U R                  R                  S5      (       a  [        R                  $ [        U R                  R                  S5      (       a  [        R                  $ [        R                  $ )Nr  r   AttrsDescriptor)rq  rr  rs  rE  rF  triton.backends.compilertriton.compiler.compilerr  r  compilerrH  rG  rJ  )r  s    rW   #get_triton_attrs_descriptor_versionrP  r  s    ~~)1+888##v''):;; ,777	))+<	=	=+777 ,333ry   c                 8    [        5       [        R                  :H  $ r   )rP  rE  rJ  r   ry   rW   triton_version_uses_attrs_dictrR    s    .04P4X4XXXry   c                |   SSK Jn  U R                  n[        U[        R
                  R                  5      (       d  gU[        R                  R                  R                  R                  [        R                  R                  R                  R                  [        R                  R                  R                  R                  4;   as  U" X R                  U R                  SS9nUbT  Uu  pEUS   nU HE  nUc  M  UR                  S   R                   [        R"                  [        R$                  4;   d  ME    g   g)z
Check if an FX node is cudagraph-unsafe based on its input arguments.

Some ops are only cudagraph-unsafe depending on their inputs (e.g., index_put
with boolean indices triggers .nonzero() during capture, but integer indices
are safe).
r   )normalize_functionFT)normalize_to_only_use_kwargsindicesrl  )torch.fx.operator_schemasrT  r  r{   rP   r  r  r  ri  	index_putr  
index_put__unsafe_index_putr   r  r  r   r7  rX  )r  rT  r  
normalizedr   r  rV  r^  s           rW   ,_fx_node_is_input_dependent_cudagraph_unsafer\    s     =^^Ffejj3344 		  ((		!!))		((00 
 (LL'..t

 !"IAY'G?sxx'<'<JJKKA (    ry   c                   U R                   n[        U5      [        ;   a  g[        U[        R
                  R                  5      (       a3  [        R                  R                  R                  UR                  ;   a  g[        U 5      (       a  gU R                  R                  S5      =nb]  [        U[        [        45      (       d  U/OUnU H7  n[        U[        R                   5      (       d  M$  UR"                  (       d  M7    g   g)a  
Check if an FX node is cudagraph-unsafe.

This includes:
- Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
- Ops with the cudagraph_unsafe tag
- Input-dependent unsafe ops (e.g., index_put with boolean indices)
- Ops with sparse tensor outputs
Trl  F)r  r  FORBIDDEN_CUDAGRAPH_OPSr{   rP   r  r  r
  r  cudagraph_unsafer  r\  r  rS  r_  r  r  	is_sparse)r  r  rl  valsr   s        rW   r  r    s     ^^F 6{-- 	65::0011HHLL))V[[8 4G<< ||&&3&sT5M::uA!U\\**q{{{  ry   c                    SSK Jn  [        XR                  UR                  45      (       a  g[        XR
                  UR                  45      (       d  g[        U SS5      nUb  [        U5      (       a  gg)aH  
Returns True if the node is an op that is not cudagraphable.
This includes:
- Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
- Ops with the cudagraph_unsafe tag
- index_put_ with boolean indices (triggers .nonzero() during capture)
- Control flow nodes (Conditional, WhileLoop)
- Ops with sparse tensor outputs
r7   r  TFr  N)	r  r  r{   Conditional	WhileLoopr
  r?   rO   r  )ri  r  r  s      rW   is_cudagraph_unsafe_opre    sf      $677d..@AAdIt,G:7CCry   c                 6   [         R                  R                  SS5      n [        R                  " 5       (       a^  SSKJn  U" 5       nU(       aJ  [         R                  R                  USS5      nU (       a   [         R                  R                  X0/5      OUn U $ )NLD_LIBRARY_PATHr  r   )get_runtime_pathr  lib)
r  r  rS  ri   r  libfb.py.parutilrh  r  r  pathsep)r  rh  runtime_pathlib_paths       rW   get_ld_library_pathrn    sh    ::>>+R0D5')ww||L)UCH8<2::??H#34(DKry   c                N    SSK Jn  [        X5      =(       a    U R                  S L$ )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperrp  r{   partition_signatures)r  rp  s     rW   #is_codegen_graph_partition_subgraphrs     s'    L 	79 	5((4ry   c                     [         R                  R                  R                  R                  =(       d    [
        R                  S L=(       a$    [         R                  R                  R                  $ r   )rP   r  ri   r  
cudagraphs&_unstable_customized_partition_wrapperr  graph_partitionr   ry   rW   is_using_cudagraph_partitionrx  	  sN    %%00 	F199E1 //
 
 
0
01ry   c                    SSK Jn  UR                  R                  R	                  U S5      (       a;  UR                  R                  R                  U S5      (       a  [        R                  $ [        R                  $ )Nr7   ro  l        i   )	rr  rp  rs  rt  statically_known_ltrD  rP   rQ  rS  )r  rp  s     rW   dtype_from_sizer{    sX    ww++e 
''


/
/h
?
?{{{{ry   )r  rJ   c                n    U S:X  a(  [         R                  R                  R                  5       $ SU ;   a  gg)z3
Returns True if the device supports MKL-DNN BF16.
r  rJ   TF)rP   r  r  _is_mkldnn_bf16_supportedr   s    rW   is_mkldnn_bf16_supportedr    3     eyy99;;	+	ry   c                n    U S:X  a(  [         R                  R                  R                  5       $ SU ;   a  gg)z3
Returns True if the device supports MKL-DNN FP16.
r  rJ   TF)rP   r  r  _is_mkldnn_fp16_supportedr~  s    rW   is_mkldnn_fp16_supportedr  *  r  ry   c           
     x   U Vs/ s H  n[        [        U5      5      PM     nnU  HS  n[        U5      [        U5      :X  d   e[        U5       H'  u  pR[        X5   [        [        U5      5      5      X5'   M)     MU     / nUR	                  SR                  S [        X5       5       5      5        [        U5      [        U5      S-  -   [        U5      S-
  -   nUR	                  SU-  5        U  H3  nUR	                  SR                  S [        XC5       5       5      5        M5     SR                  U5      $ s  snf )N|c              3  4   #    U  H  u  pS X  S 3v   M     g7fr.  Nr   )r   hrh  s      rW   r   tabulate_2d.<locals>.<genexpr>=  s     H3G41AaWA,3G   r  r7   r  c              3  4   #    U  H  u  pS X  S 3v   M     g7fr  r   )r   r   rh  s      rW   r   r  B  s     H7Gtq!Cl7Gr  rN  )rR   r  r   r   r  r  r   r  )elementsheadersr   widthsrowr   r~  total_widths           rW   tabulate_2dr  6  s    #*+7ac#a&k7F+3x3w<'''cNDAFIs3q6{3FI #  E	LLH3w3GHHIf+Vq1S[1_EK	LL{"#SXXHs37GHHI 99U ,s   D7c              #     #    [        U R                  5       5      [        UR                  5       5      -  nU H6  nU R                  U5      nUR                  U5      nUUb  UOUUb  UOU4v   M8     g7f)a  
Zip two dictionaries together, replacing missing keys with default values.

Args:
    dict1 (dict): The first dictionary.
    dict2 (dict): The second dictionary.
    d1_default (Any): the default value for the first dictionary
    d2_default (Any): the default value for the second dictionary

Yields:
    tuple: A tuple containing the key, the value from dict1 (or d1_default if missing),
           and the value from dict2 (or d2_default if missing).
N)r#   rU  rS  )dict1dict2
d1_default
d2_defaultall_keysr`  value1value2s           rW   	zip_dictsr  F  sp     ( %**,'*UZZ\*BBH 33 (Fj(Fj
 	
 s   A1A3c                v           SS jn        SS jnU R                  S[        R                  R                  5      nU R	                  5       n U(       aq  U" U SS5        U" U SS5        U" U S[
        R                  R                  (       + 5        U" U SS	5        U" U S
[        R                  R                  5        U" U SS5        U R                  S[        R                  R                  5      nU R                  S[        R                  R                  5      nUS:X  a  U(       a  [        S5      eU $ )a
  
Ensures the configuration is internally consistent for standalone AOTInductor.

If `aot_inductor_mode.compile_standalone` is set to True in the provided
`config_patches` (or falls back to the global config), this function ensures
that the following configs are also enabled:
    - `aot_inductor.package_cpp_only`

Args:
    config_patches (dict[str, Any]): A dictionary of user-provided config
        overrides for AOTInductor compilation.

Returns:
    dict[str, Any]: The possibly-updated `config_patches` dictionary.
c                    U R                  U[        [        U5      5      nUc  X U'   g U(       d  X2:w  a  [        SU SU S35      eg g )NzInvalid config: =z3 when aot_inductor_mode.compile_standalone is True.)rS  rO   ri   r   config_patchesconfig_nameconfig_valuer   s       rW   patch_config2maybe_aoti_standalone_config.<locals>.patch_configz  sY     "";0LM=*6;'50";-q>qr  1ry   c                    U R                  U[        [        U5      5      nX2:w  a  [        R	                  SUU5        X U'   g )NzDOverriding: %s=%s when aot_inductor_mode.compile_standalone is True.)rS  rO   ri   r   r  r  s       rW   force_patch_config8maybe_aoti_standalone_config.<locals>.force_patch_config  sB     "";0LM KKV
 '3{#ry   z$aot_inductor_mode.compile_standalonezaot_inductor.package_cpp_onlyTz aot_inductor.embed_kernel_binaryz#aot_inductor.emit_multi_arch_kernelz+aot_inductor.model_name_for_generated_files
aoti_modelzaot_inductor.link_libtorchzaot_inductor.dynamic_linkageFz"aot_inductor.cross_target_platformz$aot_inductor.package_constants_in_sowindowszconfig.aot_inductor.package_constants_in_so is not supported for windows cross-compilation. Please use config.aot_inductor.package_constants_on_disk_format = binary_blob.)r  dict[str, Any]r  r  r  r   r   r  )rS  ri   aot_inductor_modecompile_standalonecopyrP   r  r  test_configsuse_libtorchaot_inductorcross_target_platformpackage_constants_in_sor   )r  r  r  r  r  r  s         rW   maybe_aoti_standalone_configr  i  sk   "	&	58	HK			
3&
358
3HK
3	
3 (++.  33
 $((*N^%DdK^%GNAu}}GXGXCX	
 	I<	
 	(,,	

 	>+I5Q*..,11
 -00.33
 	).E]
 	

 ry   c                   [         R                  R                  (       a)  [         R                  R                  S:X  a  [	        S5      e[         R                  R                  (       a0  [         R                  R
                  S:X  a  [	        S5      eSnSnX!4$ [         R                  R                  S:X  a  SnSnX!4$ U S::  a  gSn[         R                  " 5       (       + nX!4$ )	z
Decide whether we should mmap weights, and whether to store the weights with .so.

If force_mmap_weights or package_constants_on_disk_format == "binary_blob" configs are set, respect the config.

Returns tuple (use_external_weights, use_mmap_weights).
binary_blobzconfig.aot_inductor.package_constants_on_disk_format = binary_blob and config.aot_inductor.force_mmap_weights cannot both be True.r  zKwhen cross_target_platform is windows, use_mmap_weights should not be true.TFi 5w)FF)ri   r  force_mmap_weights package_constants_on_disk_formatr   r  r  )consts_sizeuse_mmap_weightsuse_external_weightss      rW   determine_aoti_mmap_flagsr    s     	..@@MQJ
 	

 --44	A]   $#55;;}L# #55m# !++--11ry   c                     SSK Jn   U R                  R                  nUc  g[	        U[
        5      (       d  [        S5      eUS:X  a  g[        R                  " SU5      (       d  [        S5      eg)zD
Validates if a model name is suitable for use in code generation.

r   rh   Tz4Invalid AOTI model name: Model name must be a stringr  z^[a-zA-Z_][a-zA-Z0-9_]*$zVInvalid AOTI model name: Model name can only contain letters, numbers, and underscores)	rp  ri   r  model_name_for_generated_filesr{   r  r  r   r   )ri   
model_names     rW   is_valid_aoti_model_namer    sn    
 '$$CCJj#&&OPPR 88/<<d
 	
 ry   c                <    U(       a  [        U 5      $ [        U 5      $ r   )r)   r(   )rT   unbacked_onlys     rW   get_free_symbolsr    s    $Q''Ary   c                 *   0 [         R                  ES[         R                  R                  S[         R                  R	                  [
        R                  5      5      0En [        R                  " 5       (       a  [        R                  " S5      U S'   U $ )z9
Get a base environment for running Python subprocesses.

PYTHONPATHTORCH_CUSTOM_PYTHONPATHr2  
PYTHONHOME)r  r  rS  rk  r  r  r  ri   r  	sysconfigget_path)envs    rW   python_subprocess_envr  	  so    

** 	bjjnn%rzzsxx'@
	C  %..v6LJry   c                  .    \ rS rSr% SrS\S'   S\S'   Srg)CUDAGraphWrapperMetadatai$  z
Metadata for Customized CUDAGraphWrapper.

Currently assumes there is 1 dynamo graph and will extend to
multiple graphs in the future.
r   num_partitionspartition_indexr   Nr   r   ry   rW   r  r  $  s      ry   r  .c                  $    \ rS rSr% SrS\S'   Srg)CUDAGraphWrapperi;  NzOptional[CUDAGraphWrapperType]r  r   )r   r   r   r   r  r   r   r   ry   rW   r  r  ;  s    .2G+2ry   r  c                    U [         l        g r   )rv  r  )r  s    rW   !set_customized_partition_wrappersr  M  s    5<*2ry   c                H  ^ U R                   R                  nU R                   R                  / UQU R                   R                  QU R                   R                  5      nU R                   R                  n[
        R                  " X45      u  p4SS jnU Vs/ s H:  nU" U5      (       a(  [        R                  R                  R                  USS9OUPM<     nnSS jmSU4S jjnU Vs/ s H
  og" U5      PM     nn[
        R                  " X45      u  pX4$ s  snf s  snf )	Nc                    [        U [        R                  R                  R                  5      =(       a3    [        U [        R                  R                  R
                  5      (       + $ r   )r{   rP   r  r  r@   GeneratorStater  s    rW   _is_tensor_ir(snode_args_kwargs.<locals>._is_tensor_irZ  sH    !U__//667 

u!!00A
 =
 	
ry   F)guard_shapec                ,    [         R                  " XUS9$ )Nr   )rP   r   )r  r   r   s      rW   _tensor"snode_args_kwargs.<locals>._tensorf  s    {{4V<<ry   c                   > [        U [        R                  5      (       d  U $ T" U R                  5       U R                  U R
                  5      nU$ r   )r{   rP   r  r  r   r   )r   rz  r  s     rW   to_real_tensor)snode_args_kwargs.<locals>.to_real_tensori  s:    !U\\**Haffh2
ry   r  )r   r  )r   r   r   r   )ri  r  fill_non_provided_argsconstant_argsr  pytreer$   rP   r  r  ir_node_to_tensortree_unflatten)	r  r   r  	flat_argsflat_args_pytree_specr  r%  r  r  s	           @rW   snode_args_kwargsr  Q  s   ::D::,,*$*))*

D ZZF'-':':D>'J$I
 	 A  	,,QE,B	 	  = -66Iq"II6((JLD<%  7s   AD,Dc                    SSK Jn  U R                  nUR                  R                  (       a(  UR	                  UR                  R                  S-   5      nUR                  S5      $ )Nr7   ro  r   )primals_r  fwd_rng_staterE  rB  )rr  rp  r   rs  removeprefixr  )r0  rp  dep_names      rW   is_nonfreeable_buffersr  t  sN    xxH 	ww||(();<I ry   c                x    [        X S3-  5       nUR                  5       sSSS5        $ ! , (       d  f       g= f)z,Load a template file and return its content.z	.py.jinjaN)openread)r   template_dirr#  s      rW   load_templater    s+    	lvY//	0Avvx 
1	0	0s   +
9c                   U R                   n[        U[        R                  R                  [        R                  R
                  45      (       d   S[        U5       35       e[        R                  (       d  g[        [        R                  R                  R                  R                  [        R                  R                  R                  R                  /5      nX;   a  g[        [        R                  R                  R                   /5      n[        U[        R                  R
                  5      (       a  X;   $ [#        U 5      (       + $ )zLDecide whether fallback for a node. This is only used in inductor lite mode.z6Expected OpOverload or HigherOrderOperator, but found F)r  r{   rP   r  r  r  r  ri   fallback_by_defaultr#   r  ri  _assert_scalarr  lift_fresh_copyhigher_order triton_kernel_wrapper_functionalr!   )ri  r  "skip_fallback_due_to_dynamic_shapefallback_hopss       rW   should_fallback_by_defaultr    s    [[F&&

(F(FG  O	?V~NO  %% *4IINN))11IINN**22	
*& 3 				@	@AM &%**8899&&&t,,,ry   )	z-torch.ops._c10d_functional.all_reduce.defaultz.torch.ops._c10d_functional.all_reduce_.defaultz9torch.ops._c10d_functional.all_gather_into_tensor.defaultz8torch.ops._c10d_functional.reduce_scatter_tensor.defaultz4torch.ops._c10d_functional.all_to_all_single.defaultz6torch.ops._c10d_functional_autograd.all_reduce.defaultzBtorch.ops._c10d_functional_autograd.all_gather_into_tensor.defaultzAtorch.ops._c10d_functional_autograd.reduce_scatter_tensor.defaultz=torch.ops._c10d_functional_autograd.all_to_all_single.defaultc                    U [         ;   $ )z0Check if an operation is a collective operation.)COLLECTIVE_OPS)r  s    rW   is_collective_opr    s    n$$ry   c                 p    [         R                  " 5       (       a	   SSKJn   U $ / $ ! [         a    / s $ f = f)Nr   tlx_only_cuda_options)ri   r  )torch._inductor.fb.tlx_templates.registryr  r  r  s    rW   r  r    s<    	W(( 		  	I	s   & 55r  )rw   r   r   r   )r   r   r   r7  )   d   )r   Callable[[], Any]r   r   r   r   r   r  )r  r  F)
r   r  r   r   r   r   r   r7  r   r  r  )r   z"Union[Optional[torch.device], str]r   torch.device)r!  zIterable[sympy.Expr]r   r   )r)  rW  r*  rW  r   r   )r!  zIterable[_T]r   zValuesView[_T])r4  rU  r5  rU  r   rU  )r`  r  r   r  )rf  z"Iterable[Union[int, torch.SymInt]]r   zlist[sympy.Expr])rl  Union[int, torch.SymInt]r   rU  )r   rU  r   r  )rf  z Iterable[Union[int, sympy.Expr]]r   zlist[Union[int, torch.SymInt]])r  torch._ops.OpOverloadr   r7  )r  r5   r  z'Callable[[torch._ops.OpOverload], bool]r   r7  )r  r   r   r  r  r  r   z&tuple[GraphModule, list[torch.Tensor]])rH   )r   r  r   r  )r7   rH   )
r  Callable[..., Any]r  Sequence[Any]r   r   r   r  r   r  )r   r  r  g      ?rH   )r  r  r  r  r   r   r  r   r  r  r   r  r   r  )r  r   r  r  r   r  )r  r   r  r   r   r  )r%  r   r&  r   r   r   )rT   zUnion[int, Sequence[int]]r  r   r   Sequence[int])rT   ztuple[_T, ...]r   zlist[_T])r   z!Callable[Concatenate[Any, P], RV]r   zCachedMethod[P, RV])r  r  r   z*Callable[[FN_TYPE[P, RV]], FN_TYPE[P, RV]])r  0Union[Sequence[BaseSchedulerNode], ExternKernel]r   zOrderedSet[Node])r  Sequence[BaseSchedulerNode]r  z8Literal[True, 'torch', 'original_aten', 'inductor_node']r   r  )r  r
  r  r:   r   ztuple[str, str]r   )rl  zIterable[torch.fx.Node]rm  zOptional[Callable[[Any], bool]]r   OrderedSet[torch.fx.Node])r   zSequence[IRNode]r  zdict[str, IRNode]r   r  r  )r  r   r   zValueRanges[Any])r  r  r   r7  )r  re   r^  r   r   r  )r  r7  r   r7  )r   r  r   r  )rj  r   r  zdict[sympy.Expr, Any]r   r   )r%  r   r   z,TypeGuard[Union[torch.SymInt, torch.Tensor]])r   r   r   r7  )r  torch.fx.GraphModuler   zOptional[torch.fx.Node])r  r  r   r5   )r  r  r   zOrderedSet[torch.device]r  )r  r   r   r   )NNT)r  zOptional[dict[str, Any]]r  r\  r   r7  r   r  )r)  r  r%  r7  r   	list[int])ru  r+   r)  z.Sequence[Union[int, torch.SymInt, sympy.Expr]]r%  r7  r   r  )r   rX  r   r   r  r  )r  zUnion[int, torch.device]r   r7  r  )r  r   r   r  r	  Optional[int]r   r8   )r8  rA   r  zlist[torch.dtype]r   r7  )r  r  r   r7  )
r8  rA   r  r7  r  r7  r  r7  r   r7  )rZ  r@   r(  rV  r)  r7  r   r7  )rZ  r@   r(  rA   r)  r7  r   r7  )ri  r6   rj  r6   rk  zlist[ScalingType]r   r7  )r  r   r  r   r8  rA   r  r7  r  r7  r  Optional[Any]r  r  r  r  r   r7  )
r8  rA   r  r   r$  r   r  r   r   r7  )r8  rA   r  r  r$  r  r  r  r  r@   r  r@   r   r7  )r  r  r   r7  r   )
r  r  r$  r  r  r  r  r   r   r7  )r  r  r$  r  r  r  r   r7  )r  r  r$  r  r  r  r   r  )r   r  r   r  )r   zQtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]])r8  rA   r   r7  )r8  rA   r  zUnion[ReinterpretView, Buffer]r  r@   r   r7  )FTFN)r8  rA   r  r@   r  r@   r  r7  r  r7  r  r7  r  r  r   r7  )r   Callable[P, _T]r   r  r  r  r   ztuple[_T, list[str]])r   r  r   ztuple[Any, list[str]])r   r  r   r  r  r  r   r   )r   r  r   r  r  r  r   r  )r   r  r   r  r  r  r   ztuple[Any, list[GraphLowering]])rs  r  rt  r  r   r  )r|  r  r{  zOptional[Callable[..., Any]]r   r   )r  r  r   r  )r   r\  )rV  r  r   r7  )r  zSequence[torch.Tensor]r   r7  )rl  r   r   rX  )r  r7  r   r   r  r   r   zIterator[Any])r   rX  r   r  )r  r  r   r7  )r  r  r   r   )r  zIterable[Any]r   r7  )
r  r  r  r4   r  r  r  r  r   r  )r  z"Optional[Union[Buffer, Operation]]r   r7  )ri  z Optional[Union[Node, Operation]]r  z!Optional[torch._ops.OperatorBase]r   r7  )ri  z"Optional[Union[IRNode, Operation]]r   r7  )r  rF   r  z-Optional[Callable[[BaseSchedulerNode], bool]]r   r7  )r  rF   r   r7  )ri  zOptional[Operation]r  z?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]r   r7  )r#  r  r$  r  r%  r  r   r   )r  rF   r/  zMutableSet[BaseSchedulerNode]r$  zdict[str, SchedulerBuffer]r%  zdict[str, BaseSchedulerNode]r,  zCallable[[Any], bool]r   r  )r<  r   r=  r   r   r   )rH  r  r   r   )rX  r   r   r  )r   r  r   r  )r   r\  r   r7  )r   r  r   r7  )r   rX  r   r7  )r	  r  r  r\  rp  rX  rq  rX  rr  r  rs  r7  r   r7  )r  r  r   r  )r   r  r   r7  )r  r  r   r7  )r   r  )r   r  r   r  r  r  r   ztuple[_T, str])r  Sequence[InputType]r   zOptional[ShapeEnv])r  Callable[[list[InputType]], _T]r  r	  r  zOrderedSet[int]r   r  )rT   r  r   r  )r  r  r  r	  r  zOptional[OrderedSet[int]]r   z-tuple[list[torch.Tensor], list[torch.Tensor]])r  r  r  r	  r   r	  )r   r   r   r7  )r  r  r  rE   r   r  )r   rX  r   r  )r   r  r   rX  )r2  r  r   r  r   r7  )r   ztuple[str, ...])r   r  r  r3   r  r  r   r  )r   r  r   r  )r   r\  r   r  )r   rX  r   rX  )r   zOptional[type[Any]]r   r7  r   r   )r   zOptional[list[int]])r   rE  )r  torch.fx.Noder   r7  )ri  rB   r   r7  )r  r:   r   r7  )r  r   r   rX  )r   r  r   r7  )r  zSequence[Sequence[T]]r  zSequence[T]r   r  )NN)
r  r5  r  r5  r  ValType | Noner  r  r   zEGenerator[tuple[KeyType, ValType | None, ValType | None], None, None])r  r  r   r  )r  r   r   ztuple[bool, bool])rT   r*   r  r7  r   zOrderedSet[sympy.Symbol])r   zdict[str, str])r  CUDAGraphWrapperTyper   r  )r  rF   r   z tuple[list[Any], dict[str, Any]])r0  r;   r   r7  )r   r  r  r/   r   r  )ri  r  r   r7  )r   r   (  
__future__r   rO  r  r;  enumr  rq  r  r  ry  r  r  r  r  r  r   r  r   r  r  r  r  r  r  collections.abcr   r   r   r   r   r	   r
   r   r   r   typingr   r   r   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r|   rP   torch.utils._pytreer  _pytreer  $torch._inductor.analysis.device_infor   torch._inductor.runtime.hintsr    !torch.fx.passes.regional_inductorr!   torch.utils._dtype_abbrsr"   torch.utils._ordered_setr#   r$   r%   OPTIMUS_EXCLUDE_POST_GRADr  r(   r)   r*   r+   r,   r-   r.   pathlibr/   r0   r1   r2   torch._prims_commonr3   torch.fxr4   torch.fx.noder5   torch.nn.functionalr6   r  r8   r]  r:   dependenciesr;   rs  r=   r  r>   r?   r@   rA   rB   rC   output_coderE   r  rF   rG   rN   rL   r   rX   torch._dynamo.device_interfacerY   torch._dynamo.utilsrZ   torch.autogradr[   torch.autograd.profiler_utilr\   (torch.fx.passes.graph_transform_observerr]   torch.fx.passes.shape_propr^   torch.utils._sympy.functionsr_   r`   ra   rb   rc   torch.utils._sympy.symbolrd   re   torch.utils._sympy.value_rangesrf   rg   r  ri   runtime.runtime_utilsrj   r3  _IS_WINDOWS	getLoggerr   r   rl   r  r1  	VarRangesr  r   	InputTypegetenvXPU_KERNEL_FORMATGPU_KERNEL_BIN_EXTSr  r  r.  r  rv   rx   r   Functionr   r<  r   r   r   r   r  r  r"  r+  r.  rc  rg  rm  rw  ry  r  r  r  r   r  r  r  r  r  r  r  r  r  FN_TYPEr  r  r  r  r  r!  rj  rp  r  r  r  r  r  r  r  r  r  r  	frozensetr^  r  r  r  r  r  r  r   r   r  r  r$  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacher-  r=  rA  rC  rG  rJ  r  r  rX  r  r  r  r  r  r
  r  r  r  r&  r[  ra  rg  rl  rt  rx  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r#  r%  r=  rD  rI  r_  rb  rd  rm  rv  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  Enumr  r  r  r  r  r  r  r  r   r&  r.  r6  r?  rK  rM  r^  ra  r  rd  rf  ru  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rV  r  compiler  r  r  r  r  r  r  r  r
  r  r  r  r  r  r  r  r?  rC  rE  rP  rR  r\  r  re  rn  rs  rx  r{  SUPPORTED_MKLDNN_DEVICESr  r  r  r  r  r  r  r  r  r  PartitionFnTyper  r  rv  r  r  r  r  r  r  r  r  )r  r   s   00rW   <module>rF     s   "        	     	  	   
              C B    $ $ ? : E 0 / ; ($ 
  >>//C$"/,5!$TT,= +	CL
   D 0 % 2 K 0  8 D  = llg%! T]UZZ'(	U5<<ell:;<	 Ebii(I7S   (!4E3F0GH 	 {Q'A-+2B XDX XB5
LENN  d#  $"G GX #(	 
 !	
 4 #(	]]] 
] !	]
 ]@  ;@
+*"*+A**#AL+	+++	"/	)/#/G @OI	I<I 
I0 *8+0' ' 	!  	
 ( %'!  	
    )'#$  cNTT"
;sAv&*
+E8WQU^ E:)++/+\C,4).4)O4) 	4)nW2CW2!W2 W2x 48*0 (G
G$5GG:,^%	DU	>2- $ $'& 
< !# I "	 .29+9	9 9 	9 9z !5 $ "  49  ( 	$$	7$ 	$
 $N Q7 7*  , , ,
S' S'l
 
 @ @ @?' ?  8 J J ) )I #'   	(+<	  #  	
  
< :>RWpp&6pKOp	ph BG&,:>	 BGVV&,V:>V	V OOO %O 
	O Q	  	 Q	  	 Q  >>> > 	>
 > > >  > 
>BBCCC$,C19CBHCQWC	CLJ CO,) , EF!)?B	 (  . 5( 5(p @ @ R R:"JH&8@F	: ""&"&==
= = 	=
  = =  = 
=@'C C"&&& & 	&$ &2:/(V		 &	2:		## &#2:#$#* ...@.. .$ IMFF)EFF*	B&&   D D %6 %6P  Q.0(#K(*$)) * 

 
"- 
4A 
HK 
	 
F1	" -1#
*#)# 
#L( @D	Q	Q<	Q 
	Q#J
JGJ 
JLL .LDRLL *=

5
 ,
 5	

 '
 

< *=5 , 5	
 ' 
:T 2     ,!.2$&$!$ $ 	$
 $ $ 
$NH>L'  &2:2:*" ( %	0	: 37$$$$ 0$ 3	$<$ $$3N!3B	:&/ '#)* $%
  +?*D*D*FG*F$!*FG  **Y'H	  & 1 1 1
 68 2 7
8 1 
	 /9l O :)# # )

)
-" 01 -"` D)t   *499  4 42Y!H"J4
1 * 		& "&!%	 
$ 
$ 
  
 	 

 K 
  
FRj&2R66 d#  $ 38$./@ 3 3 *:); &= F
"-L 
%
  e Hs   %u