
    ȅiUZ                        % S SK r S SKrS SKrS SKrS SKrS SKJrJrJr  S SK	J
r
JrJr  S SKJr  SSKJrJr  SSKJr  \R&                  " \5      r0 r\\\\   4   \S'   S	\S
\\   4S jrS\S\
4   S
\\   4S jr\" S5       SSS.S	\S\\   S\\\\   4   S\\   S
\4
S jjj5       r\R>                  " 5       r Sr!\RD                  S\#S
\S   4S j5       r$S
\#4S jr%S\S
\
4S jr&\" S5      S\S
\
4S j5       r'g)    N)Callable	GeneratorIterable)AnyOptionalUnion)
exposed_in   )	custom_opCustomOpDef)infer_schematriton_ops_to_kernelsnamereturnc                 .    [         R                  U / 5      $ N)r   get)r   s    O/home/james-whalen/.local/lib/python3.13/site-packages/torch/_library/triton.pyget_triton_kernels_for_opr      s     $$T2..    fn.c           
         ^^ Sm  S	S[         S[        4   S[        [           S-  S[        S[        [
           4UU4S jjjmT" U 5      $ )
aO  
Inspect the source of an arbitrary callable passed to torch._library.triton_op,
and grab all of the triton kernels that are wrapped inside of it.

This function traces local variable assignments to handle patterns like:
    kernel_fn = _my_kernel  # global JITFunction
    wrapped = some_wrapper(kernel_fn)
    capture_triton(wrapped)[grid](...)

It also recursively analyzes called functions to find triton kernels hidden
behind helper function calls.

That said, it is best effort. There are cases (e.g., recursion > MAX_RECURSION_DEPTH)
that are not accounted for, so keep that in mind.
   Nr   .visited_fnsdepthr   c                   >^^^^^^^^  SSK Jm  SSKJm  Uc
  [        5       n[        U 5      nX1;   a  / $ UT:  a  [
        R                  ST5        / $ UR                  U5         [        R                  " U 5      nSSKJn  U" 5       nUR#                  USS9  [$        R&                  " UR)                  5       5      n " S	 S
[$        R*                  5      nU" 5       mTR-                  U5        [        R.                  " U 5      m0 mTR1                  TR2                  5        [5        U S5      (       a  TR1                  U R6                  5        S[$        R8                  S[:        [<           4S jmS[>        S[>        S -  4UU4S jjm SS[<        S[        [<           S -  S[:        [>           4UUUUUU4S jjjm/ n	[        5       n
TR@                   HH  nT" U5      nU H7  n[        U5      nX;  d  M  U
R                  U5        U	RC                  U5        M9     MJ     TRD                   H  nS nUT;   a  TU   nOUTRF                  ;   a  TRF                  U   nUc  SSK$J%n  UU;   a  UU   RL                  nUb  [O        U5      (       d  Mc  [5        US5      (       d  Mv   T" UXS-   5      nU H8  n[        U5      nUU
;  d  M  U
R                  U5        U	RC                  U5        M:     M     U	$ ! [         a    [
        R                  S5        / s $ f = f! [        [        4 a    / s $ f = f! [P         a    [
        R                  SUSS9   GM)  f = f)Nr   	AutotunerJITFunctionz.Triton not available, find_triton_kernels = []z7reached max recursion depth (%s) in find_triton_kernels)IndentedBufferT)stripc                   j    \ rS rSrS	S jrS\R                  SS4S jrS\R                  SS4S jr	Sr
g)
Fget_inner_triton_kernels.<locals>.find_triton_kernels.<locals>.VisitorT   r   Nc                 .    / U l         0 U l        / U l        g r   )triton_kernelsassignmentscalled_functions)selfs    r   __init__Oget_inner_triton_kernels.<locals>.find_triton_kernels.<locals>.Visitor.__init__U   s    13#>@ 35%r   nodec                    UR                    Hc  n[        U[        R                  5      (       d  M$  U R                  R                  UR                  / 5      R                  UR                  5        Me     U R                  U5        g r   )
targets
isinstanceastNamer(   
setdefaultidappendvaluegeneric_visit)r*   r-   targets      r   visit_AssignSget_inner_triton_kernels.<locals>.find_triton_kernels.<locals>.Visitor.visit_Assign\   sZ    "llF!&#((33((33FIIrBII$**U + ""4(r   c                    Sn[        UR                  [        R                  5      (       Ga)  UR                  n[        UR                  [        R                  5      (       Ga  [        UR                  R                  [        R
                  5      (       a  UR                  R                  R                  S:X  a  UR                  R                  S:X  a  UR                  U;   aq  UR                  (       a^  [        UR                  S   [        R
                  5      (       a2  U R                  R                  UR                  S   R                  5        GO[        UR                  R                  [        R                  5      (       a  [        UR                  R                  R                  [        R
                  5      (       a  UR                  R                  R                  R                  S:X  ab  UR                  R                  R                  S:X  a>  U R                  R                  UR                  R                   SUR                   35        O[        UR                  [        R
                  5      (       a  UR                  R                  U;   ap  UR                  (       a^  [        UR                  S   [        R
                  5      (       a2  U R                  R                  UR                  S   R                  5        O/U R                  R                  UR                  R                  5        U R                  U5        g )N)capture_tritonwrap_tritontorch_libraryr   opsz::)r0   funcr1   	Attributer6   r2   r4   attrargsr'   r5   r)   r7   )r*   r-   triton_func_namesrC   s       r   
visit_CallQget_inner_triton_kernels.<locals>.find_triton_kernels.<locals>.Visitor.visit_Callb   s   $E!dii7799D!$**cmm<<&tzz'7'7BB $

 0 0 3 3w > $

: = $		-> >#yyZ		!chh-O-O $ 3 3 : :499Q<?? K&tzz'7'7GG *4::+;+;+A+A388 L L $

 0 0 6 6 9 9W D $

 0 0 5 5 > 1188#'::??"32dii[ A
  		38844yy||'8899DIIaL#(()K)K //66tyy|G --44TYY\\B""4(r   )r(   r)   r'   )r   N)__name__
__module____qualname____firstlineno__r+   r1   Assignr9   CallrF   __static_attributes__ r   r   Visitorr$   T   s5    6) ) ) )sxx  )D  )r   rP   __globals__exprr   c                 p   ^ / m " U4S jS[         R                  5      nU" 5       R                  U 5        T$ )z3Extract all Name references from an AST expression.c                   h   > \ rS rSrS\R
                  SS4U 4S jjrS\R                  SS4S jrSr	g)mget_inner_triton_kernels.<locals>.find_triton_kernels.<locals>.extract_names_from_expr.<locals>.NameExtractor   r-   r   Nc                 <   > TR                  UR                  5        g r   )r5   r4   )r*   r-   namess     r   
visit_Namexget_inner_triton_kernels.<locals>.find_triton_kernels.<locals>.extract_names_from_expr.<locals>.NameExtractor.visit_Name   s    LL)r   c                 &    U R                  U5        g r   )r7   )r*   r-   s     r   rF   xget_inner_triton_kernels.<locals>.find_triton_kernels.<locals>.extract_names_from_expr.<locals>.NameExtractor.visit_Call   s    &&t,r   rO   )
rH   rI   rJ   rK   r1   r2   rY   rM   rF   rN   )rX   s   r   NameExtractorrU      s1    *388 * *-388 - -r   r]   )r1   NodeVisitorvisit)rR   r]   rX   s     @r   extract_names_from_exprVget_inner_triton_kernels.<locals>.find_triton_kernels.<locals>.extract_names_from_expr   s/    !E- - O!!$'Lr   objc                    > [        U TT45      (       a  U $ [        U 5      (       a0  [        U S5      (       a  U R                  n[        UT5      (       a  U$ g)zACheck if obj is a triton kernel or wrapper and return the kernel.r   N)r0   callablehasattrr   )rb   innerr   r    s     r   resolve_to_kernelPget_inner_triton_kernels.<locals>.find_triton_kernels.<locals>.resolve_to_kernel   sM    #Y788
}}d!3!3e[11 Lr   r   visitedc                   > Uc
  [        5       nX;   a  / $ UR                  U 5        U T;   a)  T" TU    5      nUc  [        R                  SU 5        / $ U/$ U T	R                  ;   a3  T" T	R                  U    5      nUc  [        R                  SU 5        / $ U/$ U T	R
                  ;   a3  T" T	R
                  U    5      nUc  [        R                  SU 5        / $ U/$ U T
R                  ;  a  [        R                  SU 5        / $ / nT
R                  U     H-  nT" U5      nU H  nT" Xa5      nUR                  U5        M     M/     U$ )a  
Trace a name through local assignments back to global triton kernels.

This handles patterns like:
    kernel_fn = _my_kernel  # global
    wrapped = wrapper(kernel_fn)
    autotuned = autotune(wrapped)
    capture_triton(autotuned)  # traces back to _my_kernel
z4failed to resolve all_globals[%s] to a triton kernelz?failed to resolve closure_vars.nonlocals[%s] to a triton kernelz>failed to resolve closure_vars.builtins[%s] to a triton kernelz%s not in collector.assignments)setaddloggerwarning	nonlocalsbuiltinsr(   extend)r   ri   kernelresultsrhs_expr
referencedref_nametracedall_globalsclosure_vars	collectorr`   rg   trace_to_global_kernelss           r   r{   Vget_inner_triton_kernels.<locals>.find_triton_kernels.<locals>.trace_to_global_kernels   sj    %	KK {"*;t+<=>NNNPT Ix |---*<+A+A$+GH>NNY Ix |,,,*<+@+@+FG>NNX Ix 9000@$G	 %'G%11$74X>
 *H4XGFNN6* !+ 8 Nr   )OPDEFS__code__r
   z$failed to analyze called function %s)exc_infor   ))triton.runtime.autotunerr   triton.runtime.jitr    ImportErrorrm   rn   rk   r4   debugrl   inspect	getsourceOSError	TypeErrortorch._inductor.utilsr!   splicer1   parsegetrawvaluer^   r_   getclosurevarsupdateglobalsre   rQ   rR   liststrobjectr'   r5   r)   ro   torch._library.custom_opsr}   _abstract_fnrd   	Exception)r   r   r   fn_idsourcer!   buffertreerP   resolvedseen_idsr   traced_objectsrb   obj_id	func_namefunc_objr}   nested_kernelsrr   	kernel_idr   r    rx   ry   rz   r`   rg   r{   MAX_RECURSION_DEPTHfind_triton_kernelss                        @@@@@@@@r   r   5get_inner_triton_kernels.<locals>.find_triton_kernels,   s*   
	:6 %K2I&&LLI# I	&&r*F 	9!fD)yy++-..	)coo .	)` I	--b1 ')<//02}%%r~~.	#(( 	tCy 			6 		ftm 		 		 37A	A	 #C4A	&\A	 A	H "$ U,,D4T:N%C)LL(OOC(	 & - #33IHK'&y1l444'11)<<&%i0==H x'9'9 8Z00
!4X{TUI!V,F "6
I 0 Y/ /	 -1 4D K  	NNKLI	, # 	I	T  :IPT  s;   K1 &L ! L0&L01!LLL-,L-0MM)Nr   )r   r   rk   intr   r   )r   r   r   s    @@r   get_inner_triton_kernelsr      sj    $  (,mS#XmX_m m 
f	m m^ r""r   ztorch.library)schemamutates_argsr   c               \   ^ ^ S[         S[        4   S[        4UU 4S jjnUc  U$ U" U5      $ )a  Create a custom operator whose implementation is backed by 1+ triton kernels.

This is a more structured way of using triton kernels with PyTorch.
Prefer using triton kernels with no ``torch.library`` custom operator wrappers
(like :func:`torch.library.custom_op`, :func:`torch.library.triton_op`) because
that is simpler;
only use :func:`torch.library.custom_op`/:func:`torch.library.triton_op` if you
want to create an operator that behaves like PyTorch built-in operators.
For example, you may use a ``torch.library`` wrapper API to define the
behavior of the triton kernel when passed a tensor subclass or under
a TorchDispatchMode.

Use :func:`torch.library.triton_op` instead of :func:`torch.library.custom_op`
when the implementation
consists of 1+ triton kernels. :func:`torch.library.custom_op` treats
custom operators as opaque (:func:`torch.compile` and
:func:`torch.export.export` will never trace into them), but ``triton_op``
makes the implementation visible to these subsystems, allowing them
to optimize the triton kernel(s).

Note that ``fn`` must only consist of calls to PyTorch-understood
operators and triton kernels. Any triton kernels called inside ``fn``
must be wrapped in a call to :func:`torch.library.wrap_triton`.

Args:
    name (str): A name for the custom op that looks like "{namespace}::{name}",
        e.g. "mylib::my_linear". The name is used as the op's stable identifier
        in PyTorch subsystems (e.g. torch.export, FX graphs).
        To avoid name collisions, please use your project name as the namespace;
        e.g. all custom ops in pytorch/fbgemm use "fbgemm" as the namespace.
    mutates_args (Iterable[str] or "unknown"): The names of args that the function mutates.
        This MUST be accurate, otherwise, the behavior is undefined. If "unknown",
        it pessimistically assumes that all inputs to the operator are being mutated.
    schema (str | None): A schema string for the operator. If None
        (recommended) we'll infer a schema for the operator from its type
        annotations. We recommend letting us infer a schema unless you
        have a specific reason not to.
        Example: "(Tensor x, int y) -> (Tensor, Tensor)".

Example::

    >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
    >>> import torch
    >>> from torch.library import triton_op, wrap_triton
    >>>
    >>> import triton
    >>> from triton import language as tl
    >>>
    >>> @triton.jit
    >>> def add_kernel(
    >>>     in_ptr0,
    >>>     in_ptr1,
    >>>     out_ptr,
    >>>     n_elements,
    >>>     BLOCK_SIZE: "tl.constexpr",
    >>> ):
    >>>     pid = tl.program_id(axis=0)
    >>>     block_start = pid * BLOCK_SIZE
    >>>     offsets = block_start + tl.arange(0, BLOCK_SIZE)
    >>>     mask = offsets < n_elements
    >>>     x = tl.load(in_ptr0 + offsets, mask=mask)
    >>>     y = tl.load(in_ptr1 + offsets, mask=mask)
    >>>     output = x + y
    >>>     tl.store(out_ptr + offsets, output, mask=mask)
    >>>
    >>> @triton_op("mylib::add", mutates_args={})
    >>> def add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    >>>     output = torch.empty_like(x)
    >>>     n_elements = output.numel()
    >>>
    >>>     def grid(meta):
    >>>         return (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
    >>>
    >>>     # NB: we need to wrap the triton kernel in a call to wrap_triton
    >>>     wrap_triton(add_kernel)[grid](x, y, output, n_elements, 16)
    >>>     return output
    >>>
    >>> @torch.compile
    >>> def f(x, y):
    >>>     return add(x, y)
    >>>
    >>> x = torch.randn(3, device="cuda")
    >>> y = torch.randn(3, device="cuda")
    >>>
    >>> z = f(x, y)
    >>> assert torch.allclose(z, x + y)

r   .r   c           
         >^  U 4S jn[        TUT[        T TS9S9nSSKJn  UR	                  T 5        U 4S jn[        T 5      nU[        T'   UR                  X45        U$ )Nc                  `   > [        S5         T" U 0 UD6sS S S 5        $ ! , (       d  f       g = f)NF)set_wrap_triton_enabled)rD   kwargsr   s     r   
backend_fn*triton_op.<locals>.dec.<locals>.backend_fn  s&     )/4*6* 0//s   
-)r   )r   r      )FunctionalTensorModec                   > SSK Jn  U" 5       (       a  U R                  XX45      $ SS KnU Vs/ s H]  n[	        XvR
                  R                  5      (       a  M)  UUR                  UR
                  R                  R                  4;  d  M[  UPM_     nnU(       a  [        $ U    T	" U0 UD6sS S S 5        $ s  snf ! , (       d  f       g = f)Nr   )(custom_triton_ops_decomposition_disabled)torch.export._tracer   __torch_dispatch__torch._subclasses
issubclass_subclasses
FakeTensorTensorfunctional_tensorFunctionalTensorNotImplemented)
modeoptypesrD   r   r   r>   tunrecognized_typesr   s
            r   functional_decomp1triton_op.<locals>.dec.<locals>.functional_decomp  s    & U799..r$GG ) #	&"%a):):)E)EF  ));;LL " # 	& &))t.v. T	& Ts   (B5.B5B5#B::
C)r   r   _subclasses.functional_tensorr   register_faker   r   register_torch_dispatch)r   r   resultr   r   r'   r   r   s   `     r   dectriton_op.<locals>.dec  sn    	+ %>	
 	I 	R *	/X 2"5&4d#&&';Or   )r   r   r   )r   r   r   r   r   s   ` `  r   	triton_opr     s?    DGf% G+ G GR 
z
2wr   Tenabled)NNNc              #   v   #     [        5       nU [        l        Sv   U[        l        g! W[        l        f = f7f)a;  If triton kernels annotated with @wrap_triton should dispatch via HOP
or go straight to the triton kernel execution.

We have this switch because eager-mode performance of HOP dispatch is slow
enough to matter (~1ms) and we know that wrap_triton isn't necessary in
some situations (eager-mode with regular Tensors)
N)is_wrap_triton_enabledwrap_triton_enabledr6   )r   prevs     r   r   r     s0     )%'$+!$(!D!s   9) 969c                  ,    [        [        S[        5      $ )Nr6   )getattrr   wrap_triton_enabled_defaultrO   r   r   r   r     s    &1LMMr   triton_kernelc                    [        U 5      $ )z(This API has been renamed to wrap_triton)r=   )r   s    r   r<   r<     s    }%%r   c                    SSK Jn  SSKJn  SSKJn  [        XU45      (       d  [        S5      e[        5       (       d  U $ U" U SS5      $ )ay  Allows capture of a triton kernel into a graph via make_fx or
non-strict ``torch.export``.

These technologies perform Dispatcher-based tracing (via
``__torch_dispatch__``) and cannot see calls to raw triton kernels.
The ``wrap_triton`` API wraps a triton kernel into a callable that
can actually be traced into a graph.

Please use this API together with :func:`torch.library.triton_op`.

Examples:

    >>> # xdoctest: +SKIP
    >>> import torch
    >>> import triton
    >>> from triton import language as tl
    >>> from torch.fx.experimental.proxy_tensor import make_fx
    >>> from torch.library import wrap_triton
    >>>
    >>> @triton.jit
    >>> def add_kernel(
    >>>     in_ptr0,
    >>>     in_ptr1,
    >>>     out_ptr,
    >>>     n_elements,
    >>>     BLOCK_SIZE: "tl.constexpr",
    >>> ):
    >>>     pid = tl.program_id(axis=0)
    >>>     block_start = pid * BLOCK_SIZE
    >>>     offsets = block_start + tl.arange(0, BLOCK_SIZE)
    >>>     mask = offsets < n_elements
    >>>     x = tl.load(in_ptr0 + offsets, mask=mask)
    >>>     y = tl.load(in_ptr1 + offsets, mask=mask)
    >>>     output = x + y
    >>>     tl.store(out_ptr + offsets, output, mask=mask)
    >>>
    >>> def add(x, y):
    >>>     output = torch.empty_like(x)
    >>>     n_elements = output.numel()
    >>>
    >>>     def grid_fn(meta):
    >>>         return (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
    >>>
    >>>     wrap_triton(add_kernel)[grid_fn](x, y, output, n_elements, 16)
    >>>     return output
    >>>
    >>> x = torch.randn(3, device="cuda")
    >>> y = torch.randn(3, device="cuda")
    >>> gm = make_fx(add)(x, y)
    >>> print(gm.code)
    >>> # def forward(self, x_1, y_1):
    >>> #     empty_like = torch.ops.aten.empty_like.default(x_1, pin_memory = False)
    >>> #     triton_kernel_wrapper_mutation_proxy = triton_kernel_wrapper_mutation(
    >>> #         kernel_idx = 0, constant_args_idx = 0,
    >>> #         grid = [(1, 1, 1)], kwargs = {
    >>> #             'in_ptr0': x_1, 'in_ptr1': y_1, 'out_ptr': empty_like,
    >>> #             'n_elements': 3, 'BLOCK_SIZE': 16
    >>> #         })
    >>> #     return empty_like

r   r   r   )TraceableTritonKernelWrapperzPwrap_triton only works on functions annotated with triton.jit or triton.autotuneN)	r   r   r   r    *torch._higher_order_ops.triton_kernel_wrapr   r0   RuntimeErrorr   )r   r   r    r   s       r   r=   r=     sN    ~ 3.Wm9%=>>^
 	
 "##'tTBBr   r   )(r1   
contextlibr   logging	threadingcollections.abcr   r   r   typingr   r   r   torch.utils._exposed_inr	   
custom_opsr   r   r   	getLoggerrH   rm   r   dictr   r   r   __annotations__r   r   r   localr   r   contextmanagerboolr   r   r<   r=   rO   r   r   <module>r      s   
     9 9 ' ' . . & 
		8	$13 tCf-. 3/C /DL /C##s(!3 C#V C#L O "m !m
mm
 Xc]*+m SMm m m`  oo' "  )T )i8H.I ) ) N N&( &# &
 OICx ICs IC ICr   