
    ȅiwE                     P   S SK r S SKrS SKrS SKJr  S SK JrJr  S SKJr  S SK	J
r
  S SKJrJrJrJr  S SKJrJrJr  S SKrS SKJs  Jr  S SKJs  Jr  S SKJrJr  S S	K J!r!  \RD                  RG                  \$S
5      r%\RL                  =(       a    \RN                  RQ                  5       r&Sr)\" S5      r*\" S5      r+S\S\4   S\S\4   4S jr,SS jr-S\\\\*4   \+4   S\\\\*4   \+4   4S jr. " S S5      r/ " S S\/5      r0 " S S\05      r1\&(       a  \1" 5       r2g\0" 5       r2g)    N)Callable)cached_propertywraps)chain)median)AnyConcatenateOptionalUnion)	ParamSpecSelfTypeVar)countersdynamo_timed)	DebugModebenchmarkingi  PTfn.returnc           
        ^ ^^ SSK Jm  TR                  R                  S:X  a  T $ S[        [
           [        [
        S4   -  [
        -  S[        [
           [        [
        S4   -  [
        -  4UU4S jjm[        R                  " T 5      S[        [           S	[        [        [        4   S[        [
           [        [
        S4   -  [
        -  4UU 4S
 jj5       nU$ )Nr   )config ms.r   c                 B  > [        U [        [        45      (       a  [        U 5      " U4S jU  5       5      $ TR                  R
                  n[        U [        5      (       d   eUS:X  a  U (       a  SU -  $ S$ US:X  a  SS KnUR                  5       $ [        SU 35      e)Nc              3   4   >#    U  H  nT" U5      v   M     g 7fN ).0valdistorts     ^/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/runtime/benchmarking.py	<genexpr>Cmay_distort_benchmarking_result.<locals>.distort.<locals>.<genexpr>(   s     7BSGCLLBs   inverseg      ?g        randomr   zUnrecognized distort method )	
isinstancelisttupletypetest_configsdistort_benchmarking_resultfloatr&   RuntimeError)r   distort_methodr&   r   r!   s      r"   r!   0may_distort_benchmarking_result.<locals>.distort$   s     b4-((87B777,,HH"e$$$$Y&!38*s*x'==?"!=n=MNOO    argskwargsc                  $   > T" U 0 UD6nT" U5      $ r   r   )r2   r3   r   r!   r   s      r"   wrapper0may_distort_benchmarking_result.<locals>.wrapper5   s       r{r1   )torch._inductorr   r+   r,   r(   r-   r)   	functoolsr   r   dictstr)r   r5   r   r!   s   ` @@r"   may_distort_benchmarking_resultr;      s    &66"<	PK%s
++e3P	euUCZ(	(5	0P P" __RCy$(cN	euUCZ(	(5	0  Nr1   c                  l    [         R                  R                  R                  (       a  [	        S5      eg )Na  In the deterministic mode of Inductor, we will avoid those
        benchmarkings that would cause non deterministic results. Only benchmarkings in the vetted
        scenarios are allowed. Example include autotuning for triton configs of pointwise kernels.

        When you see this exception, you can do one of the following two things:
        1. if the benchmarking you are doing does not introduce any non-determinism, you can just
        add is_vetted_benchmarking=True to you benchmark_gpu call. That would solve the issue.

        2. if the benchmarking you are doing indeed introduces non-determinism, you'll need to disable
        such feature in deterministic mode or find an alternative implementation that is deterministic.
        )torch	_inductorr   deterministicr.   r   r1   r"   may_ban_benchmarkingr@   @   s,    ++ 
 
 
	 ,r1   c           	         ^  [        T 5      S[        S[        R                  S[        R                  S[
        4U 4S jj5       nU$ )zWraps `fn` with `dynamo_timed` context, and increments the appropriate dynamo
counters. It is expected that `fn` is a method of `Benchmarker` or one of its
subclasses; typing limitations prevent us from declaring this directly.
selfr2   r3   r   c                    > U R                   R                   STR                   3n[        S   SU 3==   S-  ss'   [        USS9   T" U /UQ70 UD6sS S S 5        $ ! , (       d  f       g = f)N.inductorzbenchmarking.   F)log_pt2_compile_event)	__class____name__r   r   )rB   r2   r3   fn_qual_namer   s       r"   r5   time_and_count.<locals>.wrapperW   sg    ..112!BKK=A}\N;<A<,eDd,T,V, EDDs   A
A,)r   r   r   r2   r3   r   )r   r5   s   ` r"   time_and_countrL   O   sF     2Y-c -!&& -AHH - - - Nr1   c                   8   \ rS rSrSrS\SS4S jrS\S\S\R                  4S	 jr
\   SS\S
\S\4   S\\\S4      S\\\\4      S\\\\R                  4      S\S\4S jj5       r\ SS\S\/ \4   S\S\S\4
S jj5       r\S\S\S\S\4S j5       rSrg)Benchmarkera   zc
A device-agnostic benchmarking utility for measuring the runtime of
inductor generated callables.
rB   r   Nc                     g r   r   )rB   s    r"   __init__Benchmarker.__init__g   s    r1   fn_args	fn_kwargsc                 :   S n[        XR                  5       5       Hm  n[        R                  " U5       HP  n[	        U[
        R                  5      (       d  M$  Uc  UR                  nM5  UR                  U:w  d  MG  [        S5      e   Mo     Uc  [        S5      eU$ )NzcCan't safely infer the device type of `fn` with multiple device types in `fn_args` and `fn_kwargs`!zCan't safely infer the device type of `fn` with no device types in `fn_args` or `fn_kwargs`. Use a direct benchmarking method instead e.g. `Benchmarker.benchmark_cpu` or `Benchmarker.benchmark_gpu`.)	r   valuespytreetree_leavesr'   r=   Tensordevice
ValueError)rB   rS   rT   inferred_devicearg_or_kwargarg_or_kwarg_leafs         r"   infer_deviceBenchmarker.infer_devicej   s    26!'+;+;+=>L &,%7%7%E!!"3U\\BB"*&7&>&>O&--@$}  &F ? "N  r1   r   .rZ   r3   c                 :  ^^^ SnUb.  [        U[        5      (       a  [        R                  " U5      OUnOATc  Tc  [	        S5      eT=(       d
    [        5       mT=(       d    0 mU R                  " T0 TD6n[        U[        R                  5      (       d   eT=(       d
    [        5       mT=(       d    0 m[        T5      S:X  a  [        T5      S:X  a  TnOUUU4S jnUR                  S[        R                  5      nUR                  S[        R                  5      n	[        R                  " 5          U[        R                  " S5      :X  a  U R                  " U4XS.UD6sSSS5        $ U R                  " U4XS.UD6sSSS5        $ ! , (       d  f       g= f)	a  Benchmark `fn(*fn_args, *fn_kwargs)` and return the runtime, in milliseconds (the
actual runtime calculation is dictated by the benchmarking implementation, but may be
one of [mean, median, minimum, etc.]). Functions as a convenience wrapper around
device-specific implementations, like `benchmark_cpu` and `benchmark_gpu`. Raises
`ValueError(...)` if we can't safely infer the device type of `fn`; for example,
if multiple device types are found in `fn_args` and `fn_kwargs`, or if no device
types are found. To bypass device inference, provide the device to the `device`
parameter.

WARNING: if `fn` mutates `fn_args` or `fn_kwargs`, benchmarking may fail unexpectedly.
For example, if `fn` clears a mutable object, subsequent invocations of `fn` during
benchmarking will fail. In such cases, `fn` should handle cloning its arguments internally.
If device inference is required, `Benchmarker.infer_device` can be used prior to calling
this method without any arguments for `fn_args` and `fn_kwargs`.

Arguments:
- fn: The function to benchmark.
- fn_args: The function's arguments.
- fn_kwargs: The function's kwargs.

Keyword Arguments:
- device: Which device to use for benchmarking. If not provided the device will be attempted
to be inferred from `fn_args` and `fn_kwargs`.
- **kwargs: The benchmarking implementation's kwargs.

Returns:
- The runtime of `fn(*fn_args, **fn_kwargs)`, in milliseconds.
NzJ`fn_args` and `fn_kwargs` cannot both be None if `device` is not provided.r   c                     > T " T0 TD6$ r   r   )r   rS   rT   s   r"   <lambda>'Benchmarker.benchmark.<locals>.<lambda>   s    G 9y 9r1   warmuprepcpu)re   rf   )r'   r:   r=   rZ   r[   r)   r_   lenpopinductor_config inductor_default_autotune_warmupinductor_default_autotune_repr   _benchmarking_inductorbenchmark_cpubenchmark_gpu)
rB   r   rS   rT   rZ   r3   r\   	_callablere   rf   s
    ```      r"   	benchmarkBenchmarker.benchmark   sX   J 37(263(?(?V$V  9#4 `  (G!RI"//FIFO/5<<8888$UWO	 w<1Y1!4I9IHo&V&VWjj M MN --/%,,u"55)))VFVvV 0/ %%iRR6R 0//s   5/F.F
Frp   re   rf   c                 l   ^ S[         S[        [           4U4S jjnU" U5        [        U" U5      5      $ )a  Benchmark the CPU callable, `_callable`, and return the median runtime,
in milliseconds.

Arguments:
- _callable: The CPU callable to benchmark.

Keyword Arguments:
- warmup: Optionally, the duration, in milliseconds, to run `_callable`
before benchmarking starts.
- rep: Optionally, the duration, in milliseconds, to run `_callable`
during benchmarking.

Returns:
- The median runtime of `_callable`, in milliseconds.
r   r   c                    > / n[         R                  " 5       n [         R                  " 5       nT" 5         [         R                  " 5       nUR                  XC-
  [        -  5        XB-
  [        -  U :  a   U$ M_  r   )timeperf_counterappendMILLISECONDS_PER_SECOND)r   timingsrun_start_tstart_tend_trp   s        r"   run_for*Benchmarker.benchmark_cpu.<locals>.run_for   sl    G++-K++-))+3JJK(,CCrIN r1   )intr(   r-   r   )rB   rp   re   rf   r}   s    `   r"   rn   Benchmarker.benchmark_cpu   s2    (
	 
	U 
	 	gcl##r1   r2   c                     [         er   )NotImplementedError)rB   r2   r3   s      r"   ro   Benchmarker.benchmark_gpu   s    !!r1   r   )NNN)   d   )rI   
__module____qualname____firstlineno____doc__r   rQ   r   r=   rZ   r_   rL   r   r
   r)   r9   r:   r   r-   rq   r   rn   ro   __static_attributes__r   r1   r"   rN   rN   a   s[   
t  S s u|| 0  .2.259HSHSS#XHS %S/*HS DcN+	HS
 sELL012HS HS 
HS HST OR $ $'C0 $:= $IL $	 $  $D "D " " " " "r1   rN   c                   |    \ rS rSr\S\S\S\4   4S j5       r\	\
 SS\S\/ \4   S\S\S\4
S	 jj5       5       rS
rg)TritonBenchmarker   rB   r   .c                 P     SSK Jn  U$ ! [         a  n[        S5      UeSnAff = f)z"Lazily import Triton's `do_bench`.r   )do_benchzrequires TritonN)triton.testingr   ImportErrorr   )rB   r   es      r"   triton_do_bench!TritonBenchmarker.triton_do_bench   s4    	@/   	@%&78a?	@s   
 
% %rp   is_vetted_benchmarkingr3   c                 b   U(       d
  [        5         [        R                  " U R                  5      R                  n[        UR                  5       5       H  nXT;  d  M
  X5	 M     SU;   a  U R                  " U40 UD6S   $ SU;   a  U R                  " U40 UD6$ U R                  " U40 UDSS0D6$ )a  Benchmark the GPU callable, `_callable`, and return the runtime, in milliseconds.

Arguments:
- _callable: The GPU callable to benchmark.

Keyword Arguments:
- quantiles: Optionally, a tuple of floats denoting the requested quantiles.
- return_mode: Optionally, the requested return mode. Currently, Triton's
`do_bench` supports min, max, mean, and median return modes.
- **kwargs: Additional kwargs passed to Triton's `do_bench`.

Returns:
- The runtime of `callable`, in milliseconds. If `kwargs["quantiles"]` is specified,
this is the first requested quantile. Else, if `kwargs["return_mode"]` is specified,
this is the requested return mode. Otherwise, this is the median.
	quantilesr   return_moder   )r@   inspect	signaturer   
parametersr(   keys)rB   rp   r   r3   do_bench_paramskwargs         r"   ro   TritonBenchmarker.benchmark_gpu   s    2 & "!++D,@,@ALL&++-(E+M ) & ''	<V<Q??f$''	<V<<##INNXNNr1   r   N)F)rI   r   r   r   r   r   r   r   r   r;   rL   boolr-   ro   r   r   r1   r"   r   r      s    d xS'9   %
 (-!O!OBG$!O !%!O 	!O
 
!O  %!Or1   r   c                      \ rS rSr\S\S\4S j5       rS\S\S\\	\
R                  R                  \
R                  R                  4      4S jrS\S\\	\
R                  R                  \
R                  R                  4      S\4S jr\\       SS\S
\/ \4   S\S\S\S\S\S\\
R*                     S	-  S\S\S\\\   -  4S jj5       5       rSrg	)InductorBenchmarkeri&  rB   r   c                     [         R                  R                  5       n[         R                  R                  U5      nUR                  $ )z7Get the L2 cache size, in bytes, of the current device.)r=   cudacurrent_deviceget_device_propertiesL2_cache_size)rB   rZ   propss      r"   r   !InductorBenchmarker.L2_cache_size'  s6     **,

008"""r1   itersc                     [        U5       Vs/ s H=  n[        R                  R                  SS9[        R                  R                  SS94PM?     sn$ s  snf )z!Get `iters` pairs of CUDA events.T)enable_timing)ranger=   r   Event)rB   r   _s      r"   get_event_pairs#InductorBenchmarker.get_event_pairs.  s]     5\

 " 

  t 4

  t 4 "
 	
 
s   AAevent_pairsc           	      l    [        U VVs/ s H  u  p#UR                  U5      PM     snn5      $ s  snnf )zIGet the minimum timing, in milliseconds, for a group of CUDA event pairs.)minelapsed_time)rB   r   start_event	end_events       r"   get_event_pairs_min_timing.InductorBenchmarker.get_event_pairs_min_timing:  sA      /:.9*K ((3.9
 	
s   0
Nrp   estimation_itersmemory_warmup_itersbenchmark_itersmax_benchmark_durationr   grad_to_noner   r3   c	           	         U(       d
  [        5         [        R                  R                  5         U" 5         [        R                  R                  5         [        R                  " U R
                  S-  [        R                  SS9n
U
R                  5         U R                  U5      nU HO  u  pUb  U H
  nSUl	        M     U
R                  5         UR                  5         U" 5         UR                  5         MQ     [        R                  R                  5         U R                  U5      n[        [        U[        X_-  5      5      S5      n[        U5       H  nU
R                  5         M     U R                  U5      nU HO  u  pUb  U H
  nSUl	        M     U
R                  5         UR                  5         U" 5         UR                  5         MQ     [        R                  R                  5         A
US:X  a'  U VVs/ s H  u  pUR                  U5      PM     nnnU$ US:X  a  U R                  U5      n[        UU5      $ [!        SU S	35      es  snnf )
aM  Benchmark a GPU callable using a custom benchmarking implementation.

Arguments:
- _callable: The callable to benchmark.

Keyword Arguments:
- estimation_iters: Optionally, the number of iterations to run `_callable`
during runtime estimation.
- memory_warmup_iters: Optionally, the number of iterations to flush the L2
cache before starting benchmarking.
- benchmark_iters: Optionally, the number of iterations to run `_callable`
during the benchmarking.
- max_benchmark_duration: Optionally, the maximum duration of the benchmarking,
in milliseconds. An estimated duration is calculated based on the values
of `memory_warmup_iters` and `benchmark_iters`, along with the estimated
runtime of `_callable` and various other factors, and we then shrink
`benchmark_iters` to fit in the allotted maximum duration.
- return_mode: Return mode for benchmark results. Options are "min" (default),
"all" (returns all measurements).
- grad_to_none: Optionally, a list of tensors whose gradients should be cleared
before each benchmark iteration.
- is_vetted_benchmarking: in deterministic mode, we only allow
benchmarking in vetted cases.
- **kwargs: Additional kwargs that may be passed to the fallback.

Returns:
- If return_mode="min": The minimum runtime of `_callable`, in milliseconds.
- If return_mode="all": List of all runtime measurements, in milliseconds.
   r   )dtyperZ   NrF   allr   zUnsupported return_mode: z. Use 'min' or 'all'.)r@   r=   r   synchronizeemptyr   r   zero_r   gradrecordr   maxr   r   r   r[   )rB   rp   r   r   r   r   r   r   r   r3   bufferr   r   r   xestimated_timingr   all_timingsbenchmarked_timings                      r"   ro   !InductorBenchmarker.benchmark_gpuE  s   X & " 	

  	

  T//14EIIfU **+;<&1"K'%A!AF &LLN K '2 	

 ::;G %;%O!PQST

 *+ALLN , **?;&1"K'%A!AF &LLN K '2 	

   % /:.9*K ((3.9   E!!%!@!@!M ');<<+K=8MN s   >Ir   )   r   r      r   NF)rI   r   r   r   r   r   r   r   r(   r)   r=   r   r   r   r-   r   r;   rL   r   r   r:   rY   r   ro   r   r   r1   r"   r   r   &  sl   #D #S # #





	eEJJ$$ejj&6&667	8

	
	
!%eEJJ,<,<ejj>N>N,N&O!P	
		
 % !"#&"&( 26',ppBG$p p !	p
 p !$p p 5<<(4/p !%p p 
e	p  %pr1   r   )r   N)3r8   r   ru   collections.abcr   r   r   	itertoolsr   
statisticsr   typingr   r	   r
   r   typing_extensionsr   r   r   r=   torch._inductor.configr>   r   rj   torch.utils._pytreeutils_pytreerW   torch._dynamo.utilsr   r   torch.utils._debug_moder   _logginggetArtifactLoggerrI   loggeruse_experimental_benchmarkerr   is_availablerx   r   r   r;   r@   rL   rN   r   r   benchmarkerr   r1   r"   <module>r      s:      $ ,   4 4 6 6  0 0 $ $ 6 - 
	)	)(N	C00NUZZ5L5L5N 
  cNCLc(: xS?Q DS!V$a'(k#q&!1$%$Q" Q"h.O .ObQ+ Qj : ?P?R r1   