
    ȅi                    L   S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKJrJrJr  S SKJrJrJr  S SKJrJrJrJr  S SKJrJ r J!r!J"r"J#r#  S SK$r$S SK%r$S SK&J'r'  S SK(J)r)  S S	K*J+r+  S S
K,J-r-J.r.J/r/J0r0J1r1  S SK2J3r3J4r4J5r5J6r6J7r7  S SK8J9r9  S SK:J;r;  \"(       a  S SK<J=r=  S SK>J?r?J@r@JArA  SSKBJCrC  SSKDJErE  SSKFJGrG  SrH\9" \IS5      rJ " S S\K5      rL " S S5      rM " S S5      rN\#\+R                  \+R                  4   rQ\R                   " S S5      5       rS\R                   " S S5      5       rT " S  S!\T5      rU " S" S#5      rV " S$ S%5      rW " S& S'\T5      rX " S( S)\V\X5      rY " S* S+\W\X5      rZ " S, S-\T5      r[ " S. S/\V\[5      r\ " S0 S1\W\[5      r] " S2 S3\V\T5      r^ " S4 S5\W\T5      r_ " S6 S7\V\T5      r`\R                  SBS8 j5       rb    SCS9 jrc " S: S;5      rdSDS< jre    SES= jrf " S> S?5      rg " S@ SA5      rhg)F    )annotationsN)CallableIterableSequence)FutureProcessPoolExecutorThreadPoolExecutor)byrefc_size_tc_void_pCDLL)AnyIOOptionalTYPE_CHECKINGUnion)get_interface_for_device)rand_strided)ir)CppCodeCacheCUDACodeCache
DLLWrapperget_hashPyCodeCache)do_bench_using_profilingget_gpu_typeget_ld_library_pathis_gpupython_subprocess_env)getArtifactLogger)
OrderedSet)
ModuleType)ChoiceCallerPartialRenderTritonTemplateCaller   )config)benchmarker)VCUDA_VISIBLE_DEVICES
autotuningc                      \ rS rSrSrg)!NonzeroWorkspaceNotSupportedErrorB    N__name__
__module____qualname____firstlineno____static_attributes__r/       Z/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/autotune_process.pyr-   r-   B       r6   r-   c                      \ rS rSrSr\SS j5       r\ S       SS jj5       r\SS j5       rSS jr	S r
SS	 jrSSS
 jjrSSS jjrSSS jjrSS jrSS jrSS jrSS jrSrg)TuningProcessF   z>
Class to launch and interact with a benchmarking subprocess.
c                   ^ ^ [         R                  S[        R                  " 5       [        R                  R                  [        5      5        U U4S jn U" 5         g! [         a     gf = f)z$
Entry point for the child process.
z3Started autotune subprocess %s. Visible devices: %sc                    >  [         R                  T5      u  pU c  g  U(       a  [        R                  R	                  U5        U " 5       n[         R                  UT5        Ma  ! [
         a  nUn S nAN)S nAff = fN)r:   recvosenvironupdate	Exceptionsend)job	extra_envresulte	read_pipe
write_pipes       r7   workloop,TuningProcess.process_main.<locals>.workloopV   so    !.!3!3I!>; 

)))4 UF ""6:6  ! Fs   -A$ $
A:.A55A:N)autotuning_logdebugr@   getpidrA   getr*   EOFError)rI   rJ   rK   s   `` r7   process_mainTuningProcess.process_mainK   sQ    
 	AIIKJJNN/0	
	7	J 		s   A 
A+*A+Nc                T    [         R                  " X4U5        UR                  5         g r>   )pickledumpflush)objrJ   rF   s      r7   rD   TuningProcess.sendj   s!     	S$j1r6   c                .    [         R                  " U 5      $ r>   )rU   load)rI   s    r7   r?   TuningProcess.recvq   s    {{9%%r6   c                0    Xl         U R                  5         g r>   )devicestart)selfr^   s     r7   __init__TuningProcess.__init__u   s    

r6   c                   [         R                  R                  [         R                  R                  [        5      S5      n[         R
                  " 5       u  p#[         R
                  " 5       u  pE[         R                  " US5      U l        [         R                  " US5      U l        [        R                  " 5       U l        U R                  R                  U R                  [        R                  5        [        R                  US[         R                   " 5        3S[#        U5       3S[#        U5       3/n0 [%        5       ES['        5       [(        R*                  (       a  SOSS	.EnU R,                  b  [#        U R,                  5      U[.        '   [0        R2                  " UUX%4S9U l        [         R6                  " U5        [         R6                  " U5        SU l        g
)z$
Start the benchmarking subprocess.
z__autotune_main__.pywbrbz	--parent=z
--read-fd=z--write-fd=01)TORCH_WARM_POOLLD_LIBRARY_PATH3TORCHINDUCTOR_PROFILE_WITH_DO_BENCH_USING_PROFILINGN)envpass_fdsT)r@   pathjoindirname__file__pipefdopenrJ   rI   	selectorsDefaultSelectorselectorregister
EVENT_READsys
executablerO   strr   r   r'   /profile_bandwidth_with_do_bench_using_profilingr^   r*   
subprocessPopenprocesscloserunning)r`   entrysubproc_read_fdwrite_fdread_fdsubproc_write_fdcmdrk   s           r7   r_   TuningProcess.starty   so    RWW__X68NO$&GGI!$&GGI!))Hd37D1!113t~~y/C/CD NN		}%_-./#./01

#%
  #24 EE DG
 ;;"(+DKK(8C$%!''%8

 	!
!"r6   c                `    U R                   =(       a    U R                  R                  5       SL $ )z*
True if the subprocess is still running.
N)r   r~   pollr`   s    r7   aliveTuningProcess.alive   s%     ||; 1 1 3t ;;r6   c                    U R                  5       (       d  U R                  5         [        R                  XR                  US9  g)z(
Push a work item to the child process.
rF   N)r   r_   r:   rD   rJ   )r`   reqrF   s      r7   putTuningProcess.put   s/     zz||JJL39Er6   c                    U R                   R                  U5      (       d"  [        SU R                  R                   35      e[
        R                  U R                  5      u  p#[        U[        5      (       a  UeU$ ! [         a    U R                  5         e [         a    U R                  5         e [         a<    [        R                  SU R                  R                  5        U R                  5         e f = f)zs
Get a response from the child process. Raises TimeoutError on timeout;
raises EOFError if the subprocess crashes.
zTimeout in autotune subprocess z.Unexpected exception in autotune subprocess %s)ru   selectTimeoutErrorr~   pidr:   r?   rI   killrQ   r   rC   rM   	exception
isinstance)r`   timeoutrG   _s       r7   rP   TuningProcess.get   s    
	==''00"%DT\\EUEUDV#WXX%**4>>:IF fi((L!  	IIK 	JJL 	$$@$,,BRBR IIK	s   A#A> >A:C8c                    U R                  5       (       a   [        R                  SU R                  5        U(       a  U R	                  5         gg)z3
Signal the child process to shut down gracefully.
N)r   r:   rD   rJ   waitr`   r   s     r7   shutdownTuningProcess.shutdown   s4     ::<<tT__5IIK r6   c                    U R                  5       (       a  U R                  R                  5         U R                  5         g)z%
Wait for the child process to exit.
N)r   r~   r   r   r   s    r7   r   TuningProcess.wait   s(     ::<<LL

r6   c                    U R                   R                  5         U R                  R                  5         U R                  R                  5         SU l        g)z
Close resources.
FN)ru   r   rI   rJ   r   r   s    r7   r   TuningProcess.close   s;     	r6   c                    U R                  5       (       aD  [        R                  SU R                  R                  5        U R                  R                  5         U R                  5         g)z&
Send a SIGKILL to the child process.
z)Sending SIGKILL to autotune subprocess %dN)r   rM   errorr~   r   r   r   r   s    r7   r   TuningProcess.kill   sH     ::<<  ;   LL

r6   c                B    U R                  SS9  U R                  5         g)z(
Gracefully restarts the child process.
Tr   N)r   r_   r   s    r7   restartTuningProcess.restart   s     	4 

r6   )r^   r~   rI   r   ru   rJ   )rI   	IO[bytes]rJ   r   returnNoner>   )rX   r   rJ   r   rF   dict[str, str] | Noner   r   )rI   r   r   r   )r^   Optional[int]r   bool)r   r   rF   r   r   r   )g      ^@)r   floatr   r   )T)r   r   r   r   r   r   )r1   r2   r3   r4   __doc__staticmethodrR   rD   r?   ra   r_   r   r   rP   r   r   r   r   r   r5   r/   r6   r7   r:   r:   F   s      < LP'4I	  & &+Z<F6
r6   r:   c                  \    \ rS rSrSrS
S jr\SS j5       rS
S jrSS jr	    SS jr
Srg	)TuningProcessPool   z
Maintains a pool of TuningProcesses to benchmark kernels in parallel
across devices. By default, we create one TuningProcess per device and
set the sub-process environment to make only that device visible.
c                V   U R                  5       n[        R                  SU5        U Vs/ s H  n[        US9PM     snU l        [
        R                  " 5       U l        U R                   H  nU R                  R                  U5        M      [        [        U5      S9U l        gs  snf )z
Start the child processes.
z$Sub-process autotune device list: %sr^   max_workersN)get_device_listrM   rN   r:   	processesqueueQueueprocess_queuer   r	   lenexecutor)r`   devicesr^   ps       r7   ra   TuningProcessPool.__init__  s     &&(CWM FMMW6-v6WM9>A""1%   +s7|D Ns   B&c                    [         R                  (       d  S/$ [        5       n [        U 5      nUR	                  5       n[
        [        R                  ;   aR  [        R                  [
           R                  S5       Vs/ s H  n[        U5      PM     nn[        U5      U::  d   eU$ [        [        U5      5      $ s  snf )z4
Gather the list of devices to be used in the pool.
N,)r'   autotune_multi_devicer   r   device_countr*   r@   rA   splitintr   listrange)gpu_typedevice_interfacecountdr   s        r7   r   !TuningProcessPool.get_device_list  s    
 ++6M>3H= --/  2::-')zz2F'G'M'Mc'RS'R!s1v'RGSw<5(((NE%L!!	 Ts   >B<c                    U R                   R                  5         U R                   H  nUR                  SS9  M     U R                   H  nUR                  5         M     g)z%
Signal all child processes to exit.
Fr   N)r   r   r   r   )r`   r   s     r7   r   TuningProcessPool.shutdown,  sG     	 AJJEJ"  AFFH  r6   c                X   UR                   c   eSS/nU Vs0 s H,  o3[        R                  ;   d  M  U[        R                  U   _M.     nnU R                  R	                  5       nUR                  UR                   R                  US9   UR	                  [        R                  5      U R                  R                  U5        $ s  snf ! [         aC    [        R                  " SU S35        [        S5      s U R                  R                  U5        $ [         ai  n[        R                  " SU S35        S	[        U5      ;   a  UR                  5         [        S5      s SnAU R                  R                  U5        $ SnAff = f! U R                  R                  U5        f = f)
z
Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
remove it from the queue, execute the benchmark in that subprocess, and return
the TuningProcess to the queue.
NTORCHINDUCTOR_CACHE_DIRTRITON_CACHE_DIRr   zTimed out benchmarking choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.infzFailed to benchmark choice 'cudaErrorLaunchFailure)bmreqr@   rA   r   rP   r   	benchmarkr'   +max_autotune_subproc_result_timeout_secondsr   warningswarnr   rC   rz   r   )r`   choiceenv_varsvrF   r~   process_exceptions          r7   targetTuningProcessPool.target7  sz    ||'''-/AB/7Kx!

?%Q

1%x	K$$((*FLL**i@	,;;BB. ""7+7 L  	 MM1& :W W
 < ""7+  	 MM.vh 7W W (3/@+AA!<""7+	  ""7+sG   CCC /F	:F 	F	 AF#F	$F F		F F)c           	     v    [        [        XR                  R                  U R                  U5      5      5      nU$ )z.
Benchmark each choice in a separate process.
)dictzipr   mapr   )r`   choicesresultss      r7   r   TuningProcessPool.benchmark]  s-     s7MM$5$5dkk7$KLMr6   )r   r   r   Nr   )r   zSequence[Optional[int]])r   r%   r   r   r   zlist[TritonTemplateCaller]r   z!dict[TritonTemplateCaller, float])r1   r2   r3   r4   r   ra   r   r   r   r   r   r5   r/   r6   r7   r   r      sC    E& " "(	$,L+ 
+r6   r   c                  |    \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S
\S'   SrS\S'   \    SS j5       rSS jrSr	g)
TensorMetaio  ztorch.devicer^   ztorch.dtypedtypeztorch._prims_common.ShapeTypesizesztorch._prims_common.StrideTypestridesr   offsetNzOptional[str]namec                D   [        U[        5      (       a;  U Vs/ s H  o R                  U5      PM     nn[        S U 5       5      (       d   eU$ Un[        U[        R
                  5      (       a  [        R                  " SUS9nUR                  5       nUc   eUR                  5       nUc   e[        UU[        R                  R                  R                  UR                  5       [        R                   S9[        R                  R                  R                  UR#                  5       [        R                   S9[        R                  R                  R%                  UR'                  5       R(                  [        R                   S9UR+                  5       S9$ s  snf )Nc              3  B   #    U  H  n[        U[        5      v   M     g 7fr>   )r   r   .0xs     r7   	<genexpr>*TensorMeta.from_irnodes.<locals>.<genexpr>~  s     A&Qz!Z00&s   fake)r   layout)fallback)r^   r   r   r   r   r   )r   r   from_irnodesallr   LayoutBuffer	get_dtype
get_devicer   r)   graphsizevars
size_hintsget_sizer'   unbacked_symint_fallback
get_stride	size_hint
get_layoutr   get_name)clsirnodesr   rG   noder   r^   s          r7   r   TensorMeta.from_irnodesx  s]    gx((>E Fg!1!1!!4gF FA&AAAAAMdBII&&99&6D    "!!!''""--88 .  GG$$//!88 0  77##--!((88 .  
 	
 !Gs   Fc                    [        U R                  U R                  U R                  U R                  U R
                  S9$ )N)r^   r   
extra_size)r   r   r   r^   r   r   r   s    r7   	to_tensorTensorMeta.to_tensor  s2    JJLL;;**{{
 	
r6   r/   )r  z/Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]r   #Union[TensorMeta, list[TensorMeta]])r   torch.Tensor)
r1   r2   r3   r4   __annotations__r   classmethodr   r  r5   r/   r6   r7   r   r   o  sQ    ((++KD-!
E!
	,!
 !
F
r6   r   c                      \ rS rSrSr          SS jr      SS jrSS jrSS.     SS jjrSS.     SS	 jjr	S
r
g)BenchmarkRequesti  a  
Only handle triton template benchmark for now. The extern kernel benchmark
can be done inside the same process since they usually don't cause crash.

Important: Instances of this class and subclasses have to be serializable
across process boundaries. Do not put CUDA Tensors in here!
c                $  ^ Xl         [        U[        5      (       a	  U/U l        OX l        T(       aQ  [        T[        [
        45      (       a6  [        T5      S:  a  [        U4S jT 5       5      (       d   eTS   U l        OTU l        X@l	        g )Nr&   c              3  n   >#    U  H*  nS   H   n[        TS   U5      [        X5      :H  v   M"     M,     g7f))r^   r   r   r   r   r   N)getattr)r   r   attroutput_tensor_metas      r7   r   ,BenchmarkRequest.__init__.<locals>.<genexpr>  s=      / Q .q148GA<LL Q M/s   25r   )
kernel_namer   r   input_tensor_metatupler   r   r  r!  
extra_args)r`   r#  r$  r!  r&  s      ` r7   ra   BenchmarkRequest.__init__  s     ''448I7JD"7H"*-?%"O"O%&* /    
 '9&;D#2DD#$r6   c                   [         er>   NotImplementedErrorr`   outinput_tensorss      r7   make_run_fnBenchmarkRequest.make_run_fn  s
     "!r6   c                    g r>   r/   r   s    r7   cleanup_run_fnBenchmarkRequest.cleanup_run_fn  s    r6   Nr,  c                   [         er>   r)  r`   fnr,  r-  s       r7   do_benchBenchmarkRequest.do_bench  s
     "!r6   c                  [         R                  [        R                  5      nU(       a  [        R                  " 5       nUcp  U R
                  (       a  U R                  (       d   S5       e[        U5      S:X  d   e[        S U R
                   5       5      nU R                  R                  5       nU(       a-  [        R                  " 5       W-
  n[        R                  " 5       n U R                  " USU06nU(       a-  [        R                  " 5       W-
  n[        R                  " 5       nU R                  " U/UQUP76 nU(       a:  [        R                  " 5       W-
  n	[         R                  S[!        U 5      WWU	5        U R#                  5         U$ ! [         a#    [         R                  S5        [        S5      s $ f = f)NzJInput and output tensor meta must be populated when input_tensors is emptyr   c              3  @   #    U  H  oR                  5       v   M     g 7fr>   )r  r   s     r7   r   -BenchmarkRequest.benchmark.<locals>.<genexpr>  s     !P9OA++--9Os   r,  z0Skipping op due to nonzero workspace requirementr   z6InChildProcess %s: load %f, create tensor %f, bench %f)rM   isEnabledForloggingDEBUGtimer$  r!  r   r%  r  r.  r-   infor   r7  rN   rz   r1  )
r`   r,  r-  rN   start_tscreate_tensor_elapser6  load_elapseresbench_elapses
             r7   r   BenchmarkRequest.benchmark  so   
 ++GMM:yy{H ;))d.E.E \E }%***!!P9O9O!PPM))335C#'99;#9 yy{H	 !!=:c:B ))+0Kyy{HmmB44499;1L  HD	$ 	
+ 1 	  RS<	 s   (F *GG)r&  r$  r#  r!  )
r#  rz   r$  r  r!  r  r&  Iterable[Any]r   r   r-  r  r,  r  r   zCallable[[], None]r   r-  r  r,  Optional[torch.Tensor]r   r   )r1   r2   r3   r4   r   ra   r.  r1  r7  r   r5   r/   r6   r7   r  r    s    %% ?% @	%
 "% 
%:"*"1="	"
 '+	" %" $	"
 
" '+,$, $, 
	, ,r6   r  c                  ^    \ rS rSrSr     S         S	S jjrSS.     S
S jjrSrg)_TestBenchmarkRequesti  z
Supports unit testing. Defined in this file instead of the test file so the
TuningProcess sub-process can unpickle these objects.
Nc                @    Xl         X l        X0l        X@l        XPl        g r>   )rG   r^   sleepexccrash)r`   rG   r^   rN  rO  rP  s         r7   ra   _TestBenchmarkRequest.__init__  s     

r6   r3  c                  U R                   b=  [        R                  R                  [        S 5      [        U R                   5      :X  d   eU R                  (       a   [        R                  " U R                  5        U R                  (       a  U R                  eU R                  (       a  [        R                  " S5        U R                  $ )Nr&   )r^   r@   rA   rP   r*   rz   rN  r?  rO  rP  rx   exitrG   r+  s      r7   r   _TestBenchmarkRequest.benchmark   sx     ;;"::>>"6=T[[AQQQQ::JJtzz"88((N::HHQK{{r6   )rP  r^   rO  rG   rN  )        NNNF)
rG   r   r^   r   rN  zOptional[float]rO  zOptional[Exception]rP  r   rI  )r1   r2   r3   r4   r   ra   r   r5   r/   r6   r7   rL  rL    sv      $!%#'  	
 !  KO*1G	 r6   rL  c                  0    \ rS rSrSS.     SS jjrSrg)GPUDeviceBenchmarkMixini.  Nr3  c                  [        S / UQUP 5       5      n[        U5      S::  d
   SU 35       e[        S U 5       S5      n[        U5      n[        U5      S:X  a  [        [	        U5      5      nOUR                  5       nUR                  U5         [        R                  " XS9nUR                  5         S S S 5        U$ ! , (       d  f       W$ = f)Nc              3    #    U  H{  n[        U[        R                  5      (       d  M$  [        UR                  R
                  5      (       d  MJ  UR                  R                  c  Mc  UR                  R                  v   M}     g 7fr>   )r   torchTensorr   r^   typeindexr   tensors     r7   r   3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>5  s^      $
/&%,,/   v}}))*   ##	  FMM/s   #B"BB(Br&   zCan not mix devices c              3     #    U  HA  n[        UR                  R                  5      (       d  M)  UR                  R                  v   MC     g 7fr>   )r   r^   r\  r^  s     r7   r   r`  >  s5      +F&--,,- #""+s
   (AAcudar   )
r!   r   nextr   itercurrent_devicer^   r(   r   synchronize)	r`   r6  r,  r-  device_idx_setdevice_typer   
device_idxrD  s	            r7   r7   GPUDeviceBenchmarkMixin.do_bench/  s     $ $
/M/3/$
 
 >"a'P+??O)PP'+
 
 4K@~!#d>23J)88:J$$Z0''?C((* 1 
	 10 
s   %C
Cr/   rI  r1   r2   r3   r4   r7  r5   r/   r6   r7   rW  rW  .  s/    
 '+	 % $	
 
 r6   rW  c                  0    \ rS rSrSS.     SS jjrSrg)CPUDeviceBenchmarkMixiniQ  Nr3  c               .    [         R                  " U5      $ r>   )r(   benchmark_cpur5  s       r7   r7   CPUDeviceBenchmarkMixin.do_benchR  s     ((,,r6   r/   rI  rk  r/   r6   r7   rm  rm  Q  s/    
 '+	- %- $	-
 
- -r6   rm  c                     ^  \ rS rSr     S                           SU 4S jjjr      S	S jrS rS
S jrSrU =r	$ )TritonBenchmarkRequesti[  c                   > [         TU ]  XX45        XPl        X`l        Xpl        Xl        Xl        Xl        Xl        Xl	        Xl
        g r>   )superra   module_pathmodule_cache_key
num_stages	num_warpsnum_consumer_groupsnum_buffers_warp_specmatrix_instr_nonkdimwaves_per_eukpack)r`   r#  r$  r!  r&  ru  rv  rw  rx  ry  rz  r{  r|  r}  	__class__s                 r7   ra   TritonBenchmarkRequest.__init__^  sH      	9KX& 0$"#6 %:"$8!(
r6   c                  [         R                  " U R                  U R                  5      n[        R                  SU R                  U R                  5        [        X0R                  5      R                  n[        U R                  5      nSUR                  l        0 nSS KnSUR                  U5      R                  ;   a  SUS'   UR                   R"                  S:X  a  SnOPUR                   R"                  n	[%        U	5      n
U
R'                  U R(                  R                   R*                  5      n[-        [        X0R                  5      [.        R0                  R2                  R4                  R6                  5      (       a"  [8        R:                  " U/UQUPUQ70 UDSU0D6$ [8        R:                  " U/UQUPUQ70 UDUSS.D6$ )	Nz"benchmark module key: %s, path: %sFr   warmupcpustreamT)r  benchmark_run)r   load_by_key_pathrv  ru  rM   rN   r  r#  runr   r&  __self__with_bandwidth_infoinspect	signature
parametersr^   r\  r   get_raw_streamr!  r]  r   rZ  	_inductorruntimetriton_heuristicsDebugAutotuner	functoolspartial)r`   r,  r-  mod
run_methodr&  
warmup_argr  r  rh  r   s              r7   r.  "TritonBenchmarkRequest.make_run_fny  s    **4+@+@$BRBRS0!!	
 S"2"2377
$//*
27
/ 
w((4???#(Jx ::??e#F**//K7D%44''..44F C))*OO##55DD
 
 $$  	
    $$  	
  " r6   c                    [         R                  " U R                  U R                  5      n[	        XR
                  5      R                  5         g r>   )r   r  rv  ru  r  r#  
precompile)r`   r  s     r7   r  !TritonBenchmarkRequest.precompile  s7    **4+@+@$BRBRS%%&113r6   c                Z    SU R                   < SU R                  < SU R                  < 3$ )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r#  ru  rv  r   s    r7   __str__TritonBenchmarkRequest.__str__  s2    #$""$$8t'7'7&99RD<Q<Q;STTr6   )	r}  r{  rv  ru  rz  ry  rw  rx  r|  )r   r   r   r   r   )r#  rz   r$  r  r!  r  r&  rG  ru  rz   rv  rz   rw  r   rx  r   ry  r   rz  r   r{  r   r|  r   r}  r   r   r   rH  r   rz   )
r1   r2   r3   r4   ra   r.  r  r  r5   __classcell__r~  s   @r7   rr  rr  [  s     $%%&$% ? @	
 "     !  # "   
 64*41=4	4l4U Ur6   rr  c                      \ rS rSrSrg)TritonGPUBenchmarkRequesti  r/   Nr0   r/   r6   r7   r  r    r8   r6   r  c                      \ rS rSrSrg)TritonCPUBenchmarkRequesti  r/   Nr0   r/   r6   r7   r  r    r8   r6   r  c                     ^  \ rS rSrSr  S               SU 4S jjjr      SS jrSS.   SU 4S jjjrSS jrS	 r	SS
 jr
SrU =r$ )ExternKernelBenchmarkRequesti  a  
A class to handle extern kernel benchmark requests. This allows extern kernels
(like aten::mm) to be benchmarked in a subprocess, similar to Triton kernels.

Important: Instances of this class have to be serializable across
process boundaries. Do not put CUDA Tensors in here!
Nc                ^   > [         TU ]  XX45        XPl        U=(       d    0 U l        Xpl        g r>   )rt  ra   callable_pathkwargshas_out_variant)	r`   r#  r$  r!  r&  r  r  r  r~  s	           r7   ra   %ExternKernelBenchmarkRequest.__init__  s,     	9KX*l.r6   c                   U R                  5       nU R                  (       a  [        R                  " U/UQ7SU06$ [        R                  " U/UQ76 $ )Nr,  )to_callabler  r  r  )r`   r,  r-  r6  s       r7   r.  (ExternKernelBenchmarkRequest.make_run_fn  sN     $$RA-ASAA $$R8-88r6   r3  c               *  >^^ Ub  UR                  5       S:X  a  gU R                  (       d  [        T5      S:X  a  [        TU ]  " TSU06$ U R                  5       mT" T6 nUbt  [        R                  R                  R                  R                  U[        UR                  5       5      [        UR                  5       5      5        UR                  U5        [        R                   (       a  [#        UU4S j5      $ [$        R                  " TT0 5      $ )Nr   rU  r,  c                    > T " T6 $ r>   r/   )algor-  s   r7   <lambda>8ExternKernelBenchmarkRequest.benchmark.<locals>.<lambda>  s
    m8Lr6   )numelr  r   rt  r   r  rZ  _C_dynamoguardsassert_size_strider%  sizestridecopy_r'   r{   r   r(   )r`   r,  r-  out_newr  r~  s     ` @r7   r   &ExternKernelBenchmarkRequest.benchmark  s     ?syy{a/3}#5#:7$m===##%DM*G  ''::U388:.cjjl0C 		'"EE/0LMM((}bAAr6   c                    g r>   r/   r   s    r7   r  'ExternKernelBenchmarkRequest.precompile  s    r6   c                    SSK Jn  [        XR                  5      nU R                  (       a!  [
        R                  " U40 U R                  D6$ U$ )Nr   )extern_kernels) torch._inductor.select_algorithmr  r  r#  r  r  r  )r`   r  r6  s      r7   r  (ExternKernelBenchmarkRequest.to_callable  s>     	D^%5%56;;$$R74;;77	r6   c                "    SU R                    S3$ )NzExternKernelBenchmarkRequest())r  r   s    r7   r  $ExternKernelBenchmarkRequest.__str__  s    .t/A/A.B!DDr6   )r  r  r  )NT)r#  rz   r$  r  r!  r  r&  rG  r  rz   r  zOptional[dict[str, Any]]r  r   r   r   rH  )r-  r  r,  rJ  r   r  )r1   r2   r3   r4   r   ra   r.  r   r  r  r  r5   r  r  s   @r7   r  r    s     ,0 $// ?/ @	/
 "/ / )/ / 
/ /	9*	91=	9		9 KOB*B1GB B(
E Er6   r  c                      \ rS rSrSrg)ExternKernelGPUBenchmarkRequesti
  r/   Nr0   r/   r6   r7   r  r  
       	r6   r  c                      \ rS rSrSrg)ExternKernelCPUBenchmarkRequesti  r/   Nr0   r/   r6   r7   r  r    r  r6   r  c                     ^  \ rS rSrSr            SU 4S jjrS r      SS jrSS jrS r	SS jr
SS	 jrS
rU =r$ )CUDABenchmarkRequesti  aM  
A class to handle CUDA (CUTLASS) benchmark requests. This class is for
managing the lifecycle of a CUDA kernel benchmark, including compiling
the source code, managing workspace memory, and executing the kernel.

Important: Instances of this class have to be serializable across
process boundaries. Do not put CUDA Tensors in here!
c                   > [         TU ]  XX45        XPl        SU l        S U l        S U l        SU l        SU l        SU l        [        R                  " U R                  S5      u  U l        U l        g )Nr   F so)rt  ra   source_codeworkspace_size	workspaceDLL_workspace_size_updatedhash_keysource_filer   writer`   r#  r$  r!  r&  r  r~  s         r7   ra   CUDABenchmarkRequest.__init__   sk     	9KX&#$15)-',$ "*7*=*=d>N>NPT*U't'r6   c                    [         R                  SU 5        [        R                  " U R                  S5        [         R                  SU 5        g)zk
Precompile the CUDA source code to populate the CUDACodeCache.
This may happen in a separate thread pool.
Precompiling %sr  Done precompiling %sN)rM   rN   r   compiler  r   s    r7   r  CUDABenchmarkRequest.precompile2  s<    
 	.5d..53T:r6   c          	       ^ U R                  5         U R                  5         [        U5      U/-    Vs/ s H  n[        UR	                  5       5      PM     nn[
        R                  SU R                  U R                  U R                  U R                  UU R                  5        [        [        R                  R                  5       R                  5      n[!        U R                  U R                  5      n[        S5      nU R"                  S:  af  [        R$                  " U R"                  S-   S-  [        R&                  UR(                  S9U l        [        U R*                  R	                  5       5      n[,        R.                  " U/UQU R                  QSPUPUP76 n U" 5         U$ s  snf ! [0         a-  n	[3        U	5      mU4S jn
U R5                  5         U
s Sn	A	$ Sn	A	ff = f)zS
Create a function to run the CUDA kernel with the given input and output tensors.
zqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr         )r   r^   Nc                    > [        T 5      er>   )RuntimeError)err_msgs   r7   raise_runtime_error=CUDABenchmarkRequest.make_run_fn.<locals>.raise_runtime_errori  s    "7++r6   )ensure_dll_loadedupdate_workspace_sizer   r   data_ptrrM   rN   r#  r  r  r  r&  rZ  rb  current_streamcuda_streamr  r  zerosfloat64r^   r  r  r  r  rz   r1  )r`   r,  r-  r_  args
stream_ptrr  workspace_ptrretrH   r  r  s              @r7   r.   CUDABenchmarkRequest.make_run_fn;  s    	 ""$:>}:MQTPU:UV:U*+:UVMMHHOO	
 ejj779EEF
TXXt'7'78
 ""[[$$q(Q.mmzzDN
 %T^^%<%<%>?M 

 __
 	

 
 
		'E 
W WD  	'!fG, !&&	's#   #F*!F/ /
G&9"G!G&!G&c           
        U R                   (       a  g U R                  5         [        [        R	                  S U R
                   5       5      5      n[        US-   5       Vs/ s H  n[        S 5      PM     nn[        [        R                  R                  5       R                  5      n[        U R                  U R                  5      n[        5       nU" / UQU R                   Q[#        U5      PS PUP76   [        R                  R%                  5         UR&                  U l        [*        R-                  SU R(                  U R                  U R.                  U R0                  U R                  UU R                   5        SU l         g s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7fr>   )r   )r   metas     r7   r   =CUDABenchmarkRequest.update_workspace_size.<locals>.<genexpr>v  s     G0F))0Fs   r&   zupdate_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sT)r  r  r   r   fromkeysr$  r   r   rZ  rb  r  r  r  r  r#  r   r&  r
   rf  valuer  rM   rN   r  r  )r`   unique_input_countr   r  r  r  c_workspace_sizes          r7   r  *CUDABenchmarkRequest.update_workspace_sizeq  sX   ''  MMG0F0FGG
 )..@1.D(EF(E1(EFejj779EEF
TXXt'7'78
#: 	
	
__	
  	
 	
 	
 	

 .44 hMMHHOO		
 (,$7 Gs   "Fc                    U R                   c5  [        R                  " U R                  S5      u  U l         U l        U l        g g )Nr  )r  r   r[   r  r  r  r   s    r7   r  &CUDABenchmarkRequest.ensure_dll_loaded  s:    888E8J8J  $95DHdmT%5 r6   c                n    U R                   b!  U R                   R                  5         S U l         S U l        g r>   )r  r   r  r   s    r7   r1  #CUDABenchmarkRequest.cleanup_run_fn  s(    88HHNNDHr6   c                Z    SU R                   < SU R                  < SU R                  < 3$ )Nr  z, self.source_file=z, self.hash_key=)r#  r  r  r   s    r7   r  CUDABenchmarkRequest.__str__  s0    #$""$$8t'7'7&99JDMM;KLLr6   )r  r  r  r  r  r  r  r#  rz   r$  r  r!  r  r&  rG  r  rz   r   r   rH  r   r  )r1   r2   r3   r4   r   ra   r  r.  r  r  r1  r  r5   r  r  s   @r7   r  r    s    VV ?V @	V
 "V V 
V$;4*41=4	4l",HM Mr6   r  c                  j   ^  \ rS rSr            SU 4S jjrS r      SS jrS	S jrSrU =r	$ )
CppBenchmarkRequesti  c                `   > [         TU ]  XX45        XPl        [        U5      U l        S U l        g r>   )rt  ra   r  r   r  r  r  s         r7   ra   CppBenchmarkRequest.__init__  s.     	9KX& -6:r6   c                    [         R                  SU 5        [        R                  " U R                  SS9  [         R                  SU 5        g )Nr  r  rh  r  )rM   rN   r   r[   r  r   s    r7   r  CppBenchmarkRequest.precompile  s<     	.5$**>3T:r6   c               h   [         R                  " U R                  SS9U l        [	        U5      U/-    Vs/ s H  o3R                  5       PM     nn[        R                  SU R                  U R                  UU R                  5        [        U R                  U R                  5      n[        S U R                   5       5      (       d   e[        R                  /[        U5      [        [	        U R                  5      5      -   -  Ul        [         R"                  " U/UQU R                  Q76 $ s  snf )Nr  r	  zJmake_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%sc              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7fr>   )r   ctypesc_ulonglong)r   args     r7   r   2CppBenchmarkRequest.make_run_fn.<locals>.<genexpr>  s      R/3:c6#5#566/s   '))r   r[   r  r  r   r  rM   rN   r#  r&  r  r  r  r  r   argtypesr  r  )r`   r,  r-  r_  r  r  s         r7   r.  CppBenchmarkRequest.make_run_fn  s     $$T%5%55I04]0Cse0KL0Kf!0KLXHHOO	
 TXXt'7'78
R$//RRRRR%112ID122


   

 __
 	
! Ms   D/c                "    SU R                   < 3$ )Nr  )r#  r   s    r7   r  CppBenchmarkRequest.__str__  s    #$""$%%r6   )r  r  r  r  rH  r  )
r1   r2   r3   r4   ra   r  r.  r  r5   r  r  s   @r7   r  r    so    ;; ?; @	;
 "; ; 
;;
*
1=
	
6& &r6   r  c                  ^   ^  \ rS rSrSr            SU 4S jjr      SS jrSrU =r$ )CuteDSLBenchmarkRequesti  z;Benchmark request for CuteDSL (CUTLASS Python DSL) kernels.c                   > [         TU ]  XX45        UR                  5       n[        R                  " U5      u  U l        U l        g r>   )rt  ra   finalize_allr   r  rv  ru  )r`   r#  r$  r!  r&  r  finalized_coder~  s          r7   ra    CuteDSLBenchmarkRequest.__init__  s>     	9KX$1132=2C2CN2S/t/r6   c          	     n  ^^^	 [         R                  " U R                  U R                  5      nSSKJn  U R                   SU 3n[        X55      (       dG  [        U5       Vs/ s H   n[        [        X65      5      (       d  M  UPM"     nn[        SU SU 35      e[        X55      m	UU	U4S jnU$ s  snf )z
Create a function to run the CuteDSL kernel with the given input and output tensors.
Similar to TritonBenchmarkRequest.make_run_fn but for CuteDSL kernels.
r&   )MAIN_SUFFIXr   z-Could not find CuteDSL main kernel function 'z'. Available callables: c                 ~   > [        S5      n U R                  TR                  R                  5      nT" / TQTP7SU06$ )Nrb  r  )r   r  r^   r]  )r   r  r-  kernel_funcr,  s     r7   
run_kernel7CuteDSLBenchmarkRequest.make_run_fn.<locals>.run_kernel  s@    7?%44SZZ5E5EFFBBsB6BBr6   )r   r  rv  ru  codegen.cutedsl.cutedsl_kernelr  r#  hasattrdircallabler  r  )
r`   r,  r-  r  r  main_func_namer   	availabler  r  s
    ``      @r7   r.  #CuteDSLBenchmarkRequest.make_run_fn  s     **4+@+@$BRBRS 	@ ,,-Q{m<s++*-c(S($hws?Q6R(IS??OOghqgrs  c2	C
  Ts   #B2B2)rv  ru  )r#  rz   r$  r  r!  r  r&  ztuple[Any, ...]r  r$   r   r   rH  )	r1   r2   r3   r4   r   ra   r.  r5   r  r  s   @r7   r  r    so    ETT ?T @	T
 $T #T 
T*1=	 r6   r  c                 Z    [        5       n [        R                  " U R                  5        U $ r>   )r   atexitrv   r   )pools    r7   get_tuning_process_poolr+  
  s    D
OODMM"Kr6   c                4    [        5       R                  U 5      $ )zG
Do benchmarking in a subprocess and return the perf number (latency).
)r+  r   )r   s    r7   benchmark_in_sub_processr-    s     #$..w77r6   c                      \ rS rSr% SrSrS\S'   \R                  " 5       r	S\S'   S r
\S	 5       r\S
 5       rS rSS jrSS jrSS jrS r\S 5       rSrg)AutotuneProcessPooli  zf
Singleton pool manager for running autotuning (precompilation + benchmarking)
in a separate process.
NzOptional[AutotuneProcessPool]	_instancezthreading.Lock_lockc                J    U R                  5       U l        S U l        S U l        g r>   )
_init_pool_pool_warmup_future_warmup_start_timer   s    r7   ra   AutotuneProcessPool.__init__#  s     151B
2604r6   c                    U R                   c:  U R                     U R                   c  U " 5       U l         SSS5        U R                   $ U R                   $ ! , (       d  f       U R                   $ = f)z*Get or create the singleton pool instance.N)r0  r1  r  s    r7   get_instance AutotuneProcessPool.get_instance(  sV     == ==($'ECM  }}s}}	  }}s   A
A-c                ^    U R                   c  U R                  5       U l         U R                   $ )zGet the process pool.)r4  r3  r   s    r7   r*  AutotuneProcessPool.pool2  s&     ::*DJzzr6   c                    [         R                  " S5      n[        SUS9n[        R                  " U R
                  5        [        R                  S5        U$ )z
Get or create the process pool.

Uses ProcessPoolExecutor with 'spawn' context for CUDA safety.
ProcessPoolExecutor is lazily initialized - workers are not spawned
until the first submit() call, making this property non-blocking.
spawnr&   )r   
mp_contextz2AutotuneProcessPool created (workers spawn lazily))mpget_contextr   r)  rv   	_shutdownrM   r@  )r`   ctxr*  s      r7   r3  AutotuneProcessPool._init_pool9  sH     nnW%"
 	'PQr6   c                   U R                   c  U R                     U R                   cx  [        R                  " 5       U l        U R
                  R                  [        5      U l         U R                   R                  U R                  5        [        R                  S5        SSS5        U R                   $ U R                   $ ! , (       d  f       U R                   $ = f)z
Submit a warmup job to eagerly spawn workers and initialize CUDA.

This is optional - call it early to hide spawn latency.
Returns the warmup future which can be ignored or awaited.
NzWarmup job submitted)r5  r1  r?  perf_counterr6  r*  submit_warmup_autotune_subprocessadd_done_callback_on_warmup_completerM   r@  r   s    r7   warm_upAutotuneProcessPool.warm_upM  s     &&&..2.?.?.AD+*.))*:*:;V*WD'''99$:R:RS"''(>?  """t"""  """s   BC  
Cc                
   SnU R                   b"  [        R                  " 5       U R                   -
  n UR                  5       n[        R                  SUU5        g! [         a  n[        R                  SU5        UeSnAff = f)z/Callback invoked when the warmup job completes.NzEAutotuneProcessPool warmup completed successfully in %.4f seconds: %sz4AutotuneProcessPool warmup failed after %.4f seconds)r6  r?  rG  rG   rM   r   rC   )r`   futurewarmup_elapsed_timerG   rH   s        r7   rK  'AutotuneProcessPool._on_warmup_complete^  s    """."&"3"3"58O8O"O	]]_F  W#
  	  F# G	s   'A 
B%A==Bc                B    U R                   R                  " U/UQ70 UD6$ )z-Submit a job to the pool and return a Future.)r*  rH  r`   r6  r  r  s       r7   rH  AutotuneProcessPool.submitr  s!    yy4T4V44r6   c                `    U R                   b!  U R                   R                  SS9  SU l         gg)zShutdown the pool on exit.NFr   )r4  r   r   s    r7   rC  AutotuneProcessPool._shutdownv  s-    ::!JJU+DJ "r6   c                    U R                   bD  U R                     U R                   b!  U R                   R                  5         SU l         SSS5        gg! , (       d  f       g= f)z+Explicitly shutdown the singleton instance.Nr0  r1  rC  r9  s    r7   shutdown_instance%AutotuneProcessPool.shutdown_instance|  sH     ==$==,MM++-$(CM  %s   /A
A!)r4  r5  r6  )r   Future[Any])rO  r[  r   r   )r1   r2   r3   r4   r   r0  r  	threadingLockr1  ra   r  r:  propertyr*  r3  rL  rK  rH  rC  rY  r5   r/   r6   r7   r/  r/    s    
 04I,3%NN,E>,5
    (#"(5 ) )r6   r/  c                 l    SSK n U R                  R                  5       (       a  U R                  " SSS9  g)z1
Warmup function run in the autotune subprocess.
r   Nr&   rb  r   T)rZ  rb  is_availabler  )rZ  s    r7   rI  rI    s-      zz  Af%r6   c                     U R                  5       nU$ ! [         a$    [        R                  SU 5        [	        S5      s $ f = f)a
  
Run autotuning benchmarks in a subprocess.

This function is submitted to AutotuneProcessPool and runs in isolation
to prevent GPU contention with the main compilation process.

Args:
    picklable_choices: List of picklable choice information

Returns:
    timing
zFailed to benchmark choice %sr   )r   rC   rM   r   r   )benchmark_requesttimings     r7   run_autotune_in_subprocessrd    sI     ",,. +	

 U|s    +AAc                      \ rS rSr% SrSrS\S'   \R                  " 5       r	SSS jjr
\SS j5       rS rSSS	 jjr\SS
 j5       rSrg)PrecompileThreadPooli  z
Thread pool for running precompilation asynchronously.

This allows the main compilation process to continue while
precompilation happens in background threads.
NzOptional[PrecompileThreadPool]r0  c                     [        US9U l        g )Nr   )r	   	_executor)r`   r   s     r7   ra   PrecompileThreadPool.__init__  s    +Dr6   c                    SSK Jn  U R                  c@  U R                     U R                  c  U " U" 5       5      U l        S S S 5        U R                  $ U R                  $ ! , (       d  f       U R                  $ = f)Nr   )get_num_workers)r  rk  r0  r1  )r  rk  s     r7   r:  !PrecompileThreadPool.get_instance  s]    D== ==($'(9$:CM  }}s}}  }}s    A  
A9c                B    U R                   R                  " U/UQ70 UD6$ r>   )rh  rH  rS  s       r7   rH  PrecompileThreadPool.submit  s!    ~~$$R9$9&99r6   c                4    U R                   R                  US9$ )Nr   )rh  r   r   s     r7   rC  PrecompileThreadPool._shutdown  s    ~~&&D&11r6   c                    U R                   bC  U R                     U R                   b   U R                   R                  SS9  S U l         S S S 5        g g ! , (       d  f       g = f)NFr   rX  r9  s    r7   rY  &PrecompileThreadPool.shutdown_instance  sK    ==$==,MM+++7$(CM  %s   .A
A )rh  )   )r   r   )r   rf  )F)r   r   r   )r1   r2   r3   r4   r   r0  r  r\  r]  r1  ra   r  r:  rH  rC  rY  r5   r/   r6   r7   rf  rf    sX     15I-4NNEE  :2 ) )r6   rf  c                  d    \ rS rSrSr0 r\SS j5       r\S	S j5       r	\      S
S j5       r
Srg)AsyncAutotuneri  a  
Handles asynchronous autotuning of kernel choices in a separate process.

This class manages the lifecycle of autotuning:
1. Accepts precompiled choices from the main process
2. Submits benchmarking work to AutotuneProcessPool
3. Returns results via a Future

Usage:
    autotuner = AsyncAutotuner(choices)
    autotuner.start()  # Kicks off async benchmarking
    timings = autotuner.get_results()  # Blocks until complete
c                (    U R                  5       U-   $ r>   )r  )r   
inputs_keys     r7   get_choice_hashAsyncAutotuner.get_choice_hash  s     :--r6   c                    U H  n[         R                  X25      nU[         R                  ;   a  M.  [        USS5      c   S5       e[        R                  5       R                  [        UR                  5      nU[         R                  U'   M     g)z
Start asynchronous autotuning in a subprocess.

This method:
1. Extracts picklable benchmark requests from choices
2. Submits benchmarking work to AutotuneProcessPool
3. Returns immediately (non-blocking)
r   Nzbmreq is None for choice)	ru  rx  choice_hash_to_futurer  r/  r:  rH  rd  r   )r  r   rw  r   choice_hashautotune_futures         r7   r_   AsyncAutotuner.start  s     F(88LKnBBB67D1= *= 2>>@GG*O
 APN00= r6   c                    0 nU H;  n[         R                  XB5      n[         R                  U   R                  5       X4'   M=     U$ )z
Get autotuning results, blocking until complete.

Args:
    timeout: Maximum time to wait in seconds. None means wait forever.

Returns:
    Dict mapping ChoiceCaller to benchmark timing
)ru  rx  r{  rG   )r  r   rw  timingsr   r|  s         r7   get_resultsAsyncAutotuner.get_results
  sE     F(88LK,BB;OVVXGO  r6   r/   N)r   r#   rw  rz   r   rz   )r   list[ChoiceCaller]rw  rz   )r   r  rw  rz   r   zdict[ChoiceCaller, float])r1   r2   r3   r4   r   r{  r   rx  r  r_   r  r5   r/   r6   r7   ru  ru    sc     . . P P6 (69	" r6   ru  )r   r   r   r   )rb  r  r   r   )i
__future__r   r)  r  dataclassesr  r=  multiprocessingrA  r@   rU   r   rs   r|   rx   r\  r?  r   collections.abcr   r   r   concurrent.futuresr   r   r	   r
   r   r   r   typingr   r   r   r   r   rZ  torch._inductor.async_compiletorch._dynamo.device_interfacer   torch._dynamo.testingr   torch._inductorr   torch._inductor.codecacher   r   r   r   r   torch._inductor.utilsr   r   r   r   r   torch._loggingr    torch.utils._ordered_setr!   typesr"   r  r#   r$   r%   r  r'   runtime.benchmarkingr(   virtualizedr)   r*   r1   rM   rC   r-   r:   r   r  r  LayoutOrBuffer	dataclassr   r  rL  rW  rm  rr  r  r  r  r  r  r  r  r  cacher+  r-  r/  rI  rd  rf  ru  r/   r6   r7   <module>r     sT   "       	     
    8 8 N N 2 2 : :  $ C .    - /     -  . "8\:		 	t tnl l^ ryy"))+, 3
 3
 3
l b b bJ, D   F- -YU- YUx	 79O 		 79O 	HE#3 HEV	9		9	LM24D LM^4&13C 4&n+57G +\  8'8&8i) i)X
'
>$) $)NC Cr6   