
    ȅi9                       S SK Jr  S SKrS SKJr  S SKJr  S SKJrJ	r	J
r
Jr  S SKrS SKJrJr  S SKJrJr  S SKJrJr  S S	KJr  S
SKJr  \
(       a  S SKJrJr  \R:                  R=                  \S5      r \R:                  R=                  \S5      r!\"\	\\#\RH                  4         r%\\"\   /\%4   r&\RN                  " SSS9 " S S5      5       r(\RN                  " SSS9 " S S5      5       r)\RN                  " SSS9 " S S5      5       r*    S1S jr+S2S jr,S3S jr-S4S jr.S5S jr/      S6S jr0        S7S jr1S8S jr2    S9S jr3    S9S  jr4S:S! jr5\RN                   " S" S#5      5       r6          S;S$ jr7S<S% jr8 " S& S'\5      r9            S=S( jr:      S>S) jr;\RN                  " SS*9 " S+ S,5      5       r<\RN                  " SS*9 " S- S.5      5       r=      S?S/ jr>S@S0 jr?g)A    )annotationsN)Callable)Enum)AnyOptionalTYPE_CHECKINGUnion)countersget_metrics_context)GraphPartitionMap	InputType)get_plain_tensorsis_fake)
OrderedSet   )is_using_cudagraph_partition)SequenceSet
cudagraphscudagraph_static_inputsT)frozenslotsc                  $    \ rS rSr% SrS\S'   Srg)
FunctionID    z9Unique counter of a function wrapped in cudagraphify_implintid N__name__
__module____qualname____firstlineno____doc____annotations____static_attributes__r       Y/home/james-whalen/.local/lib/python3.13/site-packages/torch/_inductor/cudagraph_utils.pyr   r       s
    ?Gr'   r   c                  B    \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   S
rg)PlaceholderInfo'   z
A serializable version of torch.fx.Node that contains information
pertinent to placeholder stack traces. We use these in logging and error messages
related to cudagraphs, and will cache these results.
strnameOptional[str]stack_tracelist[PlaceholderInfo]usersmutating_use_stack_tracer   Nr   r   r'   r(   r*   r*   '   s      I  ++r'   r*   c                  V    \ rS rSr% SrS\S'   S\S'   S\S'   S	\S
'   S\S'   S\S'   Srg)WrappedFunction6   z
Represents a function that you want to record for CUDA graph replay,
with a little more metadata so we can identify if we have an applicable
CUDA graph in our CUDA graph tree for it.
zCallable[..., Any]modelSequence[int]static_input_idxsr   r   ztuple[torch.Tensor, ...]	constantsSequence[PlaceholderInfo]placeholdersmutated_input_idxsr   Nr   r   r'   r(   r4   r4   6   s,     $$N''++%%r'   r4   c                   [        U R                  5      S:X  a8  [        [        U R                  5      5      R                  R                  SS 5      $ U R                   Hg  nUR                  [        R                  R                  R                  R                  L d  M@  UR                  R                  SS 5      =n(       d  Me  Us  $    g )Nr   r/   )lenr1   nextitermetagettargettorchopsatencopy_default)placeholder_nodeuser/   s      r(   &get_mutating_use_stack_trace_from_noderK   F   s     !!"a'D)//0166::=$OO%%::--555!hhll=$??{?"" &
 r'   c                    U R                   $ N)r2   )placeholder_infos    r(   get_mutating_use_stack_tracerO   U   s    444r'   c                    U R                   nU R                  R                  SS 5      n/ nS nU R                  S:X  a0  U R                   Vs/ s H  n[        U5      PM     nn[        U 5      n[        XX45      $ s  snf )Nr/   placeholder)r-   rA   rB   opr1   to_placeholder_inforK   r*   )rI   r-   r/   r1   r2   is         r(   rS   rS   Y   s      D"''++M4@KE#m+1A1G1GH1GA$Q'1GH#I$
  4eNN Is   A9c                z    U R                    Vs/ s H  oR                  S:X  d  M  [        U5      PM!     sn$ s  snf )NrQ   )nodesrR   rS   )graphnodes     r(   get_placeholder_inforY   h   s9    .3kk.9dWW=U!D!k  s   88c                    SU  3$ )Nzskipping cudagraphs due to r   )reasons    r(   format_default_skip_messager\   n   s    (11r'   c                    SnU H  nX   n[        U5      =n(       d  M    O   [        S[        U5       S35      nU(       a  U SU 3$ U$ )N zmutated inputs (z instances). Found from : 
 )rO   r\   r>   )r;   mutation_indicesr/   idxrQ   msgs         r(   get_mutation_stack_tracerc   r   si     "$K"'6{CC;C  
 &
3/01=C (66Jr'   c                   [         R                  R                  R                  R                  (       a?  U R
                   Vs/ s H'  nX0R                  ;   a  M  U" X   5      (       a  M%  UPM)     nnOU R
                  n[        R                  SU R                  5        [        R                  SU5        U(       a  [        U R                  U5      $ S $ s  snf )Nz'check mutation static input indices: %sz#check mutation mutation indices: %s)rD   	_inductorconfigtritoncudagraph_treesr<   r8   static_inputs_logdebugrc   r;   )funcinputsis_cuda_graph_recorded_tensorra   r`   s        r(   check_for_mutationrn      s     $$44 ..+
.---  1=	 . 	 +
  22143I3I ACST  	!!2!24DE !+
s   CC*Cc                x    U R                    H*  nUR                  R                  SS 5      =n(       d  M(  Us  $    g )Nr/   )r1   rA   rB   )rX   rJ   r/   s      r(   _get_use_stack_tracerp      s5    zz((,,}d;;;;  r'   c                V   U R                  [        R                  " S5      S 5        [        5       (       a&  U R                  [        R                  " S5      S 5        U R	                  [        R                  " S5      5      =n(       a=  SUR
                   S3n[        U5      =n(       a  [        U SU 35      $ [        U5      $ [        U 5      S:X  a1  [        [        U R                  5       5      5      R                  S:X  a  g S U  5       n[        S	S
R                  U5       35      $ )NrA   cpuzcpu device ()r_   r   cudac              3  8   #    U  H  n[        U5      v   M     g 7frM   )repr).0keys     r(   	<genexpr>:check_multiple_devices_or_any_cpu_nodes.<locals>.<genexpr>   s     :&9sc&9s   zmultiple devices: z, )poprD   devicer   rB   r-   rp   r\   r>   r?   r@   keystypejoin)device_node_mappingcpu_noderb   r/   	keys_reprs        r(   'check_multiple_devices_or_any_cpu_nodesr      s    ELL0$7 $%%U 3T:&**5<<+>??x?X]]O1-.x88;8.#6H/VWW*3// 	 A%)..012776A:&9:I&);DIIi<P;Q'RSSr'   c                    [        U 5      $ rM   )r   )r   s    r(    check_lowering_disable_cudagraphr      s     33FGGr'   c                4   [         R                  U 5        [        S   S==   S-  ss'   [        R                  R
                  R                  R                  (       a  [        U 5      e[        5       nUR                  5       (       a  UR                  SU SS9  g g )Ninductorcudagraph_skipsr   cudagraph_skip_reasonT)	overwrite)cudagraphs_logwarningr
   rD   re   rf   rg   cudagraph_or_errorRuntimeErrorr   in_progressset)rb   metrics_contexts     r(   #log_cudagraph_skip_and_bump_counterr      s|    3Z*+q0+$$773)+O""$$3SDI %r'   c                  *    \ rS rSr% S\S'   SS jrSrg)BoxedDeviceIndex   Optional[int]valuec                D    Ub  [        U[        5      (       d   eXl        g rM   )
isinstancer   r   )self
device_idxs     r(   r   BoxedDeviceIndex.set   s    !Z
C%@%@@@
r'   )r   N)r   r   returnNone)r    r!   r"   r#   r%   r   r&   r   r'   r(   r   r      s     r'   r   c                n   [        S5      n[        R                  R                  R                  R
                  (       aZ  [        U5      nU Vs/ s H  ofU;  d  M
  UPM     nn[        U5      S:g  nU(       d  g [        U R                  5      n	[        X5      $ [        U5      S:g  nU(       d  S $ U$ s  snf )Nzmutated inputsr   )r\   rD   re   rf   rg   rh   r   r>   rY   rW   rc   )
gmmutated_inputsr<   r8   default_msgunique_idxsra   r`   has_mutationr;   s
             r(   3check_for_mutation_ignore_cuda_graph_managed_tensorr      s     ..>?K $$44 !23+=X+=CKAWC+=X+,1+BHH5'GG >*a/'t8[8 Ys   	B2B2c                    U R                   (       a  U R                   $ U R                   H"  nUR                   (       d  M  UR                   s  $    g)zE
Gets the first non-empty stack trace of a placeholder or its users.
N)r/   r1   )rQ   users     r(   get_placeholder_stack_tracer      sF     &&&!!### " r'   c                  .    \ rS rSrSrSrSrSrS	S jrSr	g)
CheckInvariantStatusi	  r            c                    U R                   S:X  a  gU R                   S:X  a  gU R                   S:X  a  gU R                    SU R                   3$ )NCudagraphManagedIdxMismatchz-cudagraph managed tensor data pointer changedStaticInputIdxMismatchz!static input data pointer changed&ExpectedDeadIndicesBeforeGraphMismatchz+expected dead indices before graph are livez: )r-   r   )r   s    r(   __str__CheckInvariantStatus.__str__  sK    9955BYY226YYBB@ii[4::,//r'   r   Nr   r,   )
r    r!   r"   r#   SUCCESSr   r   r   r   r&   r   r'   r(   r   r   	  s$    G #$  ./*0r'   r   c                   [        U5      [        U5      :X  a  [        U5      [        U 5      :X  d   S5       eU Vs/ s H  oQU   PM	     nnU Vs/ s H  oRU   PM	     nnU S3n[        [        Xg5      5       Hy  u  nu  p[        U	[        R
                  5      (       d   eX5   nU	R                  5       U
:w  d  MC  X   nU SUR                   SU
 SU	R                  5        S[        U5       S3
nM{     U$ s  snf s  snf )zq
Logs the mismatch between input data pointers and recorded data pointers.
This checks only idxs in target_idxs.
zClength mismatch between inputs, recorded_data_ptr, and placeholdersz.
zinput name: z. data pointer changed from z to z. input stack trace: 
)	r>   	enumeratezipr   rD   Tensordata_ptrr-   r   )r;   rl   recorded_data_ptrtarget_idxsmismatchrT   	t_tensorst_data_ptrs	error_msgtensorr   indexrQ   s                r(   log_data_ptr_mismatchr   !  s    v;#/00S[CDU5U MU %00KqKI01<=AQ'K=*C I!*3y+F!GF&%,,////??(&-K+\+*:*:); <--5Jd6??;L:M N&&A+&N%OrS  "H  1=s   C8C=c                N  ^ [        U R                  5       5      S-   mSU4S jjn[        R                  R                  R
                  R                  (       aM  T[        R                  R                  R
                  R                  :  a  [        R                  U" 5       5        gg)Nr   c                    > ST  S3$ )NzCUDAGraph supports dynamic shapes by recording a new graph for each distinct input size. Recording too many CUDAGraphs may lead to extra overhead. We have observed a0   distinct sizes. Please consider the following options for better performance: a) padding inputs to a few fixed number of shapes; or b) set torch._inductor.config.triton.cudagraph_skip_dynamic_graphs=True. Set torch._inductor.config.triton.cudagraph_dynamic_shape_warn_limit=None to silence this warning.r   )num_cudagraphss   r(   warn_msg4maybe_warning_due_to_dynamic_shape.<locals>.warn_msgF  s    00>/? @''		
r'   TFr   )	r>   r}   rD   re   rf   rg   "cudagraph_dynamic_shape_warn_limitr   r   )fn_cachenew_int_keyr   r   s      @r(   "maybe_warning_due_to_dynamic_shaper   @  st     )A-N

 	%%HH
//
 
 
'
'
J
JK 	xz*r'   )r   c                  8    \ rS rSr% SrS\S'   S\S'   S\S'   S	rg
)CudagraphCachedInfoi]  z
Info needed to realign inputs
r:   r;   list[Optional[str]]stack_tracesz	list[str]cudagraph_fail_reasonsr   Nr   r   r'   r(   r   r   ]  s     ,+%%%%r'   r   c                  L    \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   S
\S'   Srg)CudagraphMetadataih  z&
Metadata for recording a CUDA graph.
r:   r;   OrderedSet[int]r8   r<   r   r   zdict[str, torch.Tensor]r9   r   Nr   r   r'   r(   r   r   h  s'     ,+&&''%%&&r'   r   c                v   / n[        5       n[        5       n[        U R                  5       H  u  pVXaR                  ;   a  UR	                  U5        XaR
                  ;   a  UR	                  U5        Ub  UR                  U   nO[        SU R                   SU 3S/ SS9nUR                  U5        M     / nU R                   H7  n	U	b   UR                  UR                  U	   5        M&  UR                  S5        M9     U R                   V
s0 s H  oUR                  U
   _M     nn
[        UUUUU5      $ s  sn
f )z
Convert the cudagraph metadata at the graph level to the graph partition level,
given the graph partition info (i.e., mapping from partition input/output index
to graph input/output index).
N
partition__placeholder_)r-   r/   r1   r2   )r   r   input_index_mappingr8   addr<   r;   r*   r   appendoutput_index_mappingr   constant_namesr9   r   )partition_mapmetadatapartition_placeholderspartition_static_input_idxspartition_mutated_input_idxspartition_input_idxgraph_input_idxrQ   partition_stack_tracesgraph_output_idxr-   partition_constantss               r(    get_partition_cudagraph_metadatar   u  s`     3=<4>L 09))1, 888'++,?@999(,,-@A&"//@K *!-"2"2!3=AT@UV )-	K 	%%k2'1*  )>>'"))(*?*?@P*QR"))$/	 ? 4A3O3O3O4h  &&3O   #$ 	s   D6c                   [        U [        R                  5      (       d
  [        5       $ [        5       n[	        U / S9 H~  n[        U5      [        R                  La  M!  [        U5      (       d+  UR                  (       d  UR                  R
                  S:w  a  M^   UR                  UR                  5       5        M     U$ ! [         a     M  f = f)zODebug helper that collects the data pointers of all CUDA tensors in the object.)outrt   )r   rD   r   r   r   r~   r   is_metar|   r   r   	Exception)objptrsbases      r(   collect_cuda_data_ptrsr     s    c5<<((|&LD!#2.:U\\)4==DLLDKK,<,<,F	HHT]]_% / K  		s   C
CC)rI   torch.fx.Noder   r.   )rN   r*   r   r.   )rI   r   r   r*   )rW   ztorch.fx.Graphr   r0   )r[   r,   r   r,   )r;   r:   r`   z&Union[AbstractSet[int], Sequence[int]]r   r,   )rk   r4   rl   list[InputType]rm   zCallable[[torch.Tensor], bool]r   r.   )rX   r   r   r.   )r   z!dict[torch.device, torch.fx.Node]r   r.   )rb   r,   r   r   )
r   ztorch.fx.GraphModuler   zOrderedSet[str]r<   r   r8   r7   r   r.   )rQ   r*   r   r.   )r;   r:   rl   r   r   zSequence[Optional[int]]r   r7   r   r   r   r,   )r   z)dict[tuple[int, ...], Callable[..., Any]]r   r   r   bool)r   r   r   r   r   r   )r   objectr   r   )@
__future__r   dataclassescollections.abcr   enumr   typingr   r   r   r	   rD   torch._dynamo.utilsr
   r   torch._inductor.utilsr   r   torch._subclasses.fake_tensorr   r   torch.utils._ordered_setr   utilsr   r   r   AbstractSet_logginggetArtifactLoggerr    r   ri   listr   r   
OutputType	ModelType	dataclassr   r*   r4   rK   rO   rS   rY   r\   rc   rn   rp   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r'   r(   <module>r     s   "  $  6 6  = > D / / < 11(LINN44' 
 (5ell!2345
d9o&
23	 d$/  0 d$/, , 0, d$/& & 0&#5O2+< 	(
 $B 	>T:TT8H:HH	J      99#9 (9 %	9
 9004 00+ / 	
 # 	>7 
: d#& & $& d#	' 	' $	'3$33 3lr'   