
    ȅi0                       % S SK Jr  S SKrS SKrS SKJrJrJr  S SKJ	r	  S SK
r
S SKrS SKJr  S SKrS SKJr  SSKJrJr  SSKJrJrJrJrJrJrJr  SS	KJrJ r   SS
K!J"r"  SSK#J$r$J%r%  \(       a  S SK&J'r'J(r(  Sr)Sq*S\+S'   \RX                   " S S5      5       r-\RX                   " S S5      5       r.SS jr/S S jr0\Rb                  S!S j5       r2          S"S jr3 " S S\5      r4S#S jr5 " S S5      r6    S$S jr7      S%S jr8g)&    )annotationsN)AnyTYPE_CHECKINGUnion)patch)
OrderedSet   )configselect_algorithm)BufferChoiceCallerLayoutMultiTemplateBufferOperationBuffer
StorageBox	TensorBox)KernelInputsMMKernelInputs)SchedulerNode)NullHandlerV)	GeneratorSequencedistributed_autotunedist.ProcessGroup | None_AUTOTUNE_PGc                  6    \ rS rSr% SrSrS\S'   SrS\S'   Srg)	_DistributedAutotuneState'   z9
State used to track autotuning during a graph_context()
r   intautotuned_indexautotuned_local_count N)	__name__
__module____qualname____firstlineno____doc__r!   __annotations__r"   __static_attributes__r#       ^/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/distributed_autotune.pyr   r   '   s      OS "#3"r+   r   c                  *    \ rS rSr% S\S'   S\S'   Srg)_DistributedAutotuneInfo6   r    indexboollocalr#   N)r$   r%   r&   r'   r)   r*   r#   r+   r,   r.   r.   6   s    JKr+   r.   c                     [         R                  " 5       (       aD  [         R                  " 5       (       a*  [        c  [         R                  R                  SS9q[        $ g )Npt2_distributed_autotune_pg)pg_tag)distis_availableis_initializedr   distributed_c10d_new_group_with_tagr#   r+   r,   get_autotune_pgr;   <   sO    t224400DD4 E L r+   c                t    [         R                  (       d   e[        U 5      n[        U5      n[	        X5        g)z
Finish the distributed autotuning by propagating the autotuning results
between the ranks and then replacing the placeholder with the real Buffer.
N)r
   distributed_max_autotune_gemm_autotune_local_nodes_sync_autotune_remote_nodes)	schedulerautotune_resultschoices_by_indexs      r,   schedulerD   H   s2    
 ////,Y7-.97r+   c               #  &  #    [        [        R                  " SS9[        5      (       a   e[        R                  " [        5       5         Sv   [        R                  " [        5       5        g! [        R                  " [        5       5        f = f7f)zX
Wrapped around processing a graph, sets up figuring out which ranks tune
which shapes.
F)check_poisonedN)
isinstancer   get_distributed_autotune_stater   set_distributed_autotune_stater   r#   r+   r,   graph_contextrJ   S   sk      	((>!    $$%>%@A8	((7((7s   ABA. B. BBc                "   [         R                  (       d  g[        5       =n(       d  g[        U5      S::  a  g[        R
                  nUR                  nU=R                  S-  sl        XdR                  5       -  UR                  5       :H  n[        Xg5      [        R                  R                  [        '   U(       a  U=R                  S-  sl        g[        R                  R                   R"                  R%                  ['        XU5      5      $ )z
Used by an op (like `mm`) to determine if the op should be autotuned
locally (returns None) or remotely (returns a placeholder Buffer).
Nr	   )r
   r=   r;   lenr   distributed_autotune_stater!   sizerankr.   current_nodemeta_DISTRIBUTED_AUTOTUNE_KEYr"   torch	_inductorirr   create_DistributedAutotuneBuffer)namechoicesinputslayoutautotune_pgstater0   r2   s           r,   maybe_autotune_remoter^   d   s     //*,,K,
7|q((E!!E	Q$$&&+*:*:*<<E5M6ANN12 ##q(#??''.."48 r+   c                  h   ^  \ rS rSr% SrS\S'           S	U 4S jjr    S
S jrSS jrSr	U =r
$ )rW      z
A MultiTemplateBuffer which represents a kernel being autotuned on a
different rank. When `schedule` is called this will be replaced by the
"real" buffer.
str_kernel_namec           	     Z   > [         TU ]  UUU R                  / [        0 5      S9  Xl        g )N)choice_timings_fnunfiltered_choicesallowed_prologue_inps)super__init___dummy_choice_timingsr   rb   )selfkernel_namerZ   r[   	__class__s       r,   rh   #_DistributedAutotuneBuffer.__init__   s8     	"88!",R. 	 	
 (r+   c                    [         eN)NotImplementedError)rj   _hint_overrides     r,   ri   0_DistributedAutotuneBuffer._dummy_choice_timings   s
    
 "!r+   c                   SSK Jn  [        R                  " [        R
                  SS5         [        / U R                  Q5      n[        U R                  [        5      (       d   eUR                  U R                  U5      nU" U R                  U/UR                  5       U R                  5      n[        U[        5      (       d   eUsSSS5        $ ! , (       d  f       g= f)z]
Given a _SerializedChoice (autotune results from another rank)
compute the final TensorBox.
r	   )autotune_select_algorithmrA   N)r   rt   r   objectr   graphr   original_inputsrG   r[   r   
get_choicerb   nodesr   )rj   
ser_choicert   kernel_inputschoicebuffers         r,   autotune#_DistributedAutotuneBuffer.autotune   s     	@\\!'';5*+BT-A-A+BCMdkk62222**4;;FF.!!##%	F fi0000 655s   BC
C )rb   )rk   ra   rZ   list[Buffer]r[   r   returnNone)rq   z
int | Noner   zdict[ChoiceCaller, float])rz   _SerializedChoicer   r   )r$   r%   r&   r'   r(   r)   rh   ri   r~   r*   __classcell__)rl   s   @r,   rW   rW      sZ     (( ( 	(
 
( "("	"" r+   rW   c                   [        5       nU(       d   eS/UR                  5       -  n[        R                  R	                  X US9  [        S U 5       5      nS/U-  nSnU HG  nU H>  n[        U[        5      (       d   eXGR                     b   eXtUR                  '   US-  nM@     MI     X5:X  d   SU SU 35       eU$ )zL
Perform the all_gather to collect the autotune results from all the ranks.
N)groupc              3  8   #    U  H  n[        U5      v   M     g 7fro   )rL   ).0xs     r,   	<genexpr>_sync.<locals>.<genexpr>   s     0ZSVVZs   r   r	   zcount mismatch:  != )	r;   rN   rS   distributedall_gather_objectsumrG   r   r0   )rB   r\   
all_states
node_countrC   check_countother_resultsr|   s           r,   r?   r?      s    
 "#K; 269I9I9K0KJ	''
K'X0Z00J150CK##Ff&78888#LL1999-3V\\*1K	 $ $ $V(8D&VV$r+   c                  ^    \ rS rSrSrS
S jrSS jr\SS j5       r\SS j5       r	SS jr
Srg	)r      z
This is a serializer for the autotune choice. KernelTemplateChoice can't
be serialized directly (the template and inputs prevent this) so we need to
serialize it by parts and reconstruct later on.
c                    Xl         [        R                  U5      U l        U R	                  UR
                  5      U l        g ro   )r0   r   _template_uid_from_choicetemplate_uid_compute_kwargsdescriptionkwargs)rj   r0   r|   s      r,   rh   _SerializedChoice.__init__   s2    
-GGO**6+=+=>r+   c                &   U R                  5       n0 U R                  EnSU;   aF  UR                  5       S   R                  5       S   n[        R
                  " XTS   5      US   :H  US'   0 nSSKJnJn  U" U5      n	U" X9XaU5      n
U
R                  $ )z-
Deserialize the ChoiceCaller and return it.
BLOCK_Kr   r	   EVEN_K)DictKernelTemplateParamsKernelTemplateChoice)
_template_from_uidr   ry   get_sizesympygcdkernel_template_choicer   r   r|   )rj   r[   rZ   templater   kextra_kwargsr   r   paramsktcs              r,   rx   _SerializedChoice.get_choice   s    
 **, DKK
 q!**,Q/A$yy9,=>&BSSF8')	

 *&1"8\6Rzzr+   c                   U (       d  0 $ 0 nU R                  S5       H  nUR                  SS5      u  p4UR                  5       UR                  5       pCUS:X  a  SX'   MB  US:X  a  SX'   MN  UR                  5       (       a  [        U5      X'   Mr  UR	                  S5      (       a  UR                  S5      (       d   eUSS	 X'   M     U$ )
z9
Given a template description turn it into input kwargs.
,=r	   TrueTFalseF')splitstripisdigitr    
startswithendswith)r   r   cfgkeyvals        r,   r   !_SerializedChoice._compute_kwargs   s    
 I 46$$S)Cyya(HCyy{CIIKf}"#!#h~~c**s||C/@/@@@!!Bi * r+   c                *   [        U [        R                  5      (       a>  U R                  R                  S:X  a  g[        SU R                  R                  < 35      e[        U [        R                  5      (       a  g[        S[        U 5       35      e)zi
Given a ChoiceCaller figure out which template represents it. This
is reversed by _template_from_uid().
mmz!torch._inductor.kernel.mm.aten_mmzTODO: kernel z%torch._inductor.kernel.mm.mm_templatezTODO: )rG   r   ExternKernelCallerr|   rX   RuntimeErrorTritonTemplateCallertype)r|   s    r,   r   +_SerializedChoice._template_uid_from_choice  sw     f.AABB}}!!T):"]6==3E3E2H#IJJ 0 E EFF:V~677r+   c                    U R                   R                  S5      n[        5       US      nUSS  H  n[        X#5      nM     U$ )z"
See _template_uid_from_choice().
.r   r	   N)r   r   globalsgetattr)rj   partsobjr   s       r,   r   $_SerializedChoice._template_from_uid+  sH     !!'',ia!qrA#/C 
r+   )r0   r   r   N)r0   r    r|   r   r   r   )r[   r   rZ   r   r   zChoiceCaller | None)r   ra   r   z dict[str, Union[int, str, bool]])r|   r   r   ra   )r   r   )r$   r%   r&   r'   r(   rh   rx   staticmethodr   r   r   r*   r#   r+   r,   r   r      s>    ?
4  0 8 8$r+   r   c                   / nU R                    H  n[        U[        5      (       d  M  UR                  =nc  M+  [        U[        5      (       a  MB  [        U[
        5      (       d  MY  UR                  =nc  Mj  UR                  =nc  M{  UR                  [        5      nUc  M  UR                  (       d   eUR                  5       u  px[        UR                  U5      n	UR                  U	5        M     [        R                   n
[#        U5      U
R$                  :X  d!   S[#        U5       SU
R$                   S35       eU$ )zh
Go through the nodes in the scheduler and autotune the kernels which
should be autotuned by this rank.
z'incorrect local autotuned nodes found (r   ))ry   rG   r   noderW   r   origin_noderQ   getrR   r2   get_min_choicer   r0   appendr   rM   rL   r"   )rA   rB   r   
inner_noder   rQ   info
min_choice_r|   r]   s              r,   r>   r>   6  s0    13$..))#J,j"<==*&9::%111K:$$$D-xx12<zzz
 #113
"4::z:'A  D ((E E$?$?? 
1#6F2G1HUMhMhLiijk? r+   c                .   [        U R                  5       H  u  p#[        U[        5      (       d  M  [        UR                  =n[
        5      (       d  M?  UR                  c   eUR                  R                  [           nUR                  XR                     5      nUR                  n[        U[        5      (       d   eUR                  n[        U[        5      (       d   eUR                  UR                  :X  d   eU R                  XX#5        M     g)zc
Go through the nodes in the scheduler and autotune the nodes that were
autotuned on remote ranks.
N)	enumeratery   rG   r   r   rW   r   rQ   rR   r~   r0   datar   r   r[   _replace_node)	rA   rC   ir   	dist_noder   out_tensorboxout_storage
out_buffers	            r,   r@   r@   i  s     Y__-dM**z))#Y&@0
 0
 ((444((--.GHD%../?

/KLM',,Kk:6666$))Jj/::::$$	(8(8888##J1C .r+   )r   r   )rA   #torch._inductor.scheduler.Schedulerr   r   )r   zGenerator[None, None, None])
rX   ra   rY   zlist[ChoiceCaller]rZ   r   r[   r   r   zTensorBox | None)rB   list[_SerializedChoice]r   Sequence[_SerializedChoice])rA   r   r   r   )rA   r   rC   r   r   r   )9
__future__r   
contextlibdataclassestypingr   r   r   unittest.mockr   r   torch._loggingrS   torch.distributedr   r6   torch.fxtorch.utils._ordered_setr    r
   r   rU   r   r   r   r   r   r   r   r{   r   r   rA   r   virtualizedr   r   collections.abcr   r   rR   r   r)   	dataclassr   r.   r;   rD   contextmanagerrJ   r^   rW   r?   r   r>   r@   r#   r+   r,   <module>r      sG   "   , ,       / &   8 $ ' 3 3 )-& - # # #   
	8 8 8 
*4@JPB4!4 4p8Z Zz0200fD2D1D 
Dr+   