
    |h                        S SK rS SKrS SKrS SKrS SKrS SKJrJrJ	r	J
r
JrJr  S SKJrJr  S SKJr  S SKJr  S SKJrJr  S SKJrJr  S SKJr  S S	KJr  S S
KJrJ r   S SK!J"r"  S SK#J$r$  S SK%J&r&  S SK'J(r(J)r)  S SK*J
r+  S SK,J-r-  S SK,J.r.  S SK/J0r0  / SQr1 " S S\Rd                  5      r3 " S S\45      r5 " S S5      r6 " S S\5      r7 " S S\5      r8 " S S\\Rd                  5      r9g)     N)config	serializesigutilstypestypingutils)Cache	CacheImpl)global_compiler_lock)
Dispatcher)NumbaPerformanceWarningNumbaValueError)Purposetypeof)get_current_device)wrap_arg)compile_cudaCUDACompiler)driver)get_context)cuda_target)missing_launch_config_msgnormalize_kernel_dimensions)r   cuda)_dispatcher)warn)hsinhcoshloghlog10hlog2hexphexp10hexp2hsqrthrsqrthfloorhceilhrcphrinthtrunchdivc                   4  ^  \ rS rSrSr\   SU 4S jj5       r\S 5       r\S 5       r	S r
\S 5       r\S 5       r\U 4S	 j5       rS
 rS r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       rS rS rS rS rSS jrSS jrSS jrS rSrU =r$ )_Kernel(   zx
CUDA Kernel specialized for a given set of argument types. When called, this
object launches the kernel on the device.
c                 N  > U(       a  [        S5      e[        TU ]	  5         SU l        S U l        Xl        X l        X@l        XPl        U=(       d    / U l	        UU
(       a  SOSS.n[        5       R                  n[        U R
                  [        R                  U R                  U R                  UUUUUS9	nUR                  nU R
                  R                   nUR"                  nUR$                  nUR'                  UR(                  UR*                  XEUUUU	5      u  nnU(       d  / nSUR-                  5       ;   U l        U R.                  (       a  SUl        [2         Vs/ s H  nS	U 3UR-                  5       ;   d  M  UPM      nnU(       aq  [4        R6                  R9                  [4        R6                  R;                  [<        5      5      n[4        R6                  R?                  US
5      nURA                  U5        U H  nURC                  U5        M     URD                  U l#        URH                  U l$        URJ                  U l&        UU l'        URP                  U l(        Xl        UR*                  U l        URR                  U l)        / U l*        / U l+        / U l,        g s  snf )Nz,Cannot compile a device function as a kernelF   r   )fastmathoptdebuglineinfoinliner3   nvvm_optionscccudaCGGetIntrinsicHandleT__numba_wrapper_zcpp_function_wrappers.cu)-RuntimeErrorsuper__init__
objectmodeentry_pointpy_funcargtypesr6   r7   
extensionsr   compute_capabilityr   r   voidtarget_context__code__co_filenameco_firstlinenoprepare_cuda_kernellibraryfndescget_asm_strcooperativeneeds_cudadevrtcuda_fp16_math_funcsospathdirnameabspath__file__joinappendadd_linking_filename
entry_name	signaturetype_annotation_type_annotation_codelibrarycall_helperenvironment_referenced_environmentsliftedreload_init)selfrB   rC   linkr6   r7   r8   r3   rD   max_registersr4   devicer9   r:   crestgt_ctxcodefilenamelinenumlibkernelfnresbasedirfunctions_cu_pathfilepath	__class__s                             O/home/james-whalen/.local/lib/python3.13/site-packages/numba/cuda/dispatcher.pyr?   _Kernel.__init__.   sI   
 MNN     
 $* !1

  !44DLL%**dmm"&**%-#)%-)5!# %%||$$##%%11$,,27<2:G2?AV
 D 69JJ"&C0 B0b%bT*coo.?? 0 B ggoobggooh&?@G "W-G!IKK)*H  *  !++ $ 4 4++ &kk++(*%;Bs    J"?J"c                     U R                   $ N)r_   re   s    rv   rL   _Kernel.library   s           c                     U R                   $ ry   )r^   rz   s    rv   r]   _Kernel.type_annotation   s    $$$r|   c                     U R                   $ ry   )rb   rz   s    rv   _find_referenced_environments%_Kernel._find_referenced_environments   s    ,,,r|   c                 6    U R                   R                  5       $ ry   )rG   codegenrz   s    rv   r   _Kernel.codegen   s    ""**,,r|   c                 @    [        U R                  R                  5      $ ry   )tupler\   argsrz   s    rv   argument_types_Kernel.argument_types   s    T^^(())r|   c	                    > U R                  U 5      n	[        X	]  5         SU	l        Xl        X)l        X9l        SU	l        XIl        XYl	        Xil
        Xyl        Xl        U	$ )
Rebuild an instance.
N)__new__r>   r?   rA   rO   r[   r\   r^   r_   r6   r7   r`   rD   )clsrO   rZ   r\   codelibraryr6   r7   r`   rD   instanceru   s             rv   _rebuild_Kernel._rebuild   sb     ;;s#c%'#*"&$(! +$*(r|   c                     [        U R                  U R                  U R                  U R                  U R
                  U R                  U R                  U R                  S9$ )z
Reduce the instance for serialization.
Compiled definitions are serialized in PTX form.
Type annotation are discarded.
Thread, block and shared memory configuration are serialized.
Stream information is discarded.
)rO   rZ   r\   r   r6   r7   r`   rD   )	dictrO   r[   r\   r_   r6   r7   r`   rD   rz   s    rv   _reduce_states_Kernel._reduce_states   sL      0 0t"nn$:K:K**t}} $ 0 0T__N 	Nr|   c                 8    U R                   R                  5         g)z'
Force binding to current CUDA context
N)r_   
get_cufuncrz   s    rv   bind_Kernel.bind   s     	$$&r|   c                 ^    U R                   R                  5       R                  R                  $ )z>
The number of registers used by each thread for this kernel.
)r_   r   attrsregsrz   s    rv   regs_per_thread_Kernel.regs_per_thread   s%    
   ++-33888r|   c                 ^    U R                   R                  5       R                  R                  $ )z4
The amount of constant memory used by this kernel.
)r_   r   r   constrz   s    rv   const_mem_size_Kernel.const_mem_size   %    
   ++-33999r|   c                 ^    U R                   R                  5       R                  R                  $ )z=
The amount of shared memory used per block for this kernel.
)r_   r   r   sharedrz   s    rv   shared_mem_per_block_Kernel.shared_mem_per_block   s%    
   ++-33:::r|   c                 ^    U R                   R                  5       R                  R                  $ )z*
The maximum allowable threads per block.
)r_   r   r   
maxthreadsrz   s    rv   max_threads_per_block_Kernel.max_threads_per_block   s%    
   ++-33>>>r|   c                 ^    U R                   R                  5       R                  R                  $ )z=
The amount of local memory used per thread for this kernel.
)r_   r   r   localrz   s    rv   local_mem_per_thread_Kernel.local_mem_per_thread   r   r|   c                 6    U R                   R                  5       $ )z&
Returns the LLVM IR for this kernel.
)r_   get_llvm_strrz   s    rv   inspect_llvm_Kernel.inspect_llvm   s       --//r|   c                 4    U R                   R                  US9$ )z'
Returns the PTX code for this kernel.
)r:   )r_   rN   )re   r:   s     rv   inspect_asm_Kernel.inspect_asm   s       ,,,33r|   c                 6    U R                   R                  5       $ )z^
Returns the CFG of the SASS for this kernel.

Requires nvdisasm to be available on the PATH.
)r_   get_sass_cfgrz   s    rv   inspect_sass_cfg_Kernel.inspect_sass_cfg   s       --//r|   c                 6    U R                   R                  5       $ )zX
Returns the SASS code for this kernel.

Requires nvdisasm to be available on the PATH.
)r_   get_sassrz   s    rv   inspect_sass_Kernel.inspect_sass   s       ))++r|   c                     U R                   c  [        S5      eUc  [        R                  n[	        U R
                  < SU R                  < 3US9  [	        SUS9  [	        U R                   US9  [	        SUS9  g)
Produce a dump of the Python source of this function annotated with the
corresponding Numba IR and type information. The dump is written to
*file*, or *sys.stdout* if *file* is *None*.
Nz Type annotation is not available filezP--------------------------------------------------------------------------------zP================================================================================)r^   
ValueErrorsysstdoutprintr[   r   )re   r   s     rv   inspect_types_Kernel.inspect_types  sg       (?@@<::D$*=*=>TJhT"d##$/hT"r|   c                     [        5       nU R                  R                  5       n[        U[        5      (       a  [
        R                  " S U5      nUR                  UUU5      nUR                  R                  nXV-  $ )a  
Calculates the maximum number of blocks that can be launched for this
kernel in a cooperative grid in the current context, for the given block
and dynamic shared memory sizes.

:param blockdim: Block dimensions, either as a scalar for a 1D block, or
                 a tuple for 2D or 3D blocks.
:param dynsmemsize: Dynamic shared memory size in bytes.
:return: The maximum number of blocks in the grid.
c                 
    X-  $ ry    )xys     rv   <lambda>5_Kernel.max_cooperative_grid_blocks.<locals>.<lambda>&  s    QUr|   )
r   r_   r   
isinstancer   	functoolsreduce$get_active_blocks_per_multiprocessorrh   MULTIPROCESSOR_COUNT)re   blockdimdynsmemsizectxcufuncactive_per_smsm_counts          rv   max_cooperative_grid_blocks#_Kernel.max_cooperative_grid_blocks  sr     m""--/h&& ''(:HEH@@AIALN ::22''r|   c                 F  ^ U R                   R                  5       mU R                  (       a{  TR                  S-   nTR                  R                  U5      u  pxU[        R                  " [        R                  5      :X  d   e[        R                  " 5       n	UR                  SUS9  / n
/ n[        U R                  U5       H  u  pU R                  XXJU5        M     [        R                  (       a   [        R                  R!                  S5      nOS nU=(       a    UR"                  =(       d    Un[        R$                  " TR"                  /UQUQUPUPUP7SU R&                  06  U R                  (       Ga  [        R(                  " [        R*                  " W	5      WW5        U	R,                  S:w  a  U4S jnS Vs/ s H  nU" SU-   5      PM     nnS Vs/ s H  nU" SU-   5      PM     nnU	R,                  nU R.                  R1                  U5      u  nnnUc  S	nO4Uu  nnn[2        R4                  R7                  U5      nS
U< SU< SU< S3nU< SU< SU< 3nU(       a  U< SUS   < 34USS  -   nOU4nU" U6 eU
 H
  nU" 5         M     g s  snf s  snf )N__errcode__r   )streamrO   c                    > TR                   R                  TR                  < SU < S35      u  p[        R                  " 5       n[
        R                  " [        R                  " U5      X5        UR                  $ )N__)	moduleget_global_symbolrZ   ctypesc_intr   device_to_host	addressofvalue)rZ   memszvalr   s       rv   load_symbol#_Kernel.launch.<locals>.load_symbolS  s\    $mm==?E{{?C?E FGC !,,.C))&*:*:3*?I99$r|   zyxtidctaid zIn function z, file z, line z, ztid=z ctaid=z:    )r_   r   r6   rZ   r   r   r   sizeofr   memsetzipr   _prepare_argsr   USE_NV_BINDINGbindingCUstreamhandlelaunch_kernelrO   r   r   r   r`   get_exceptionrR   rS   rU   )re   r   griddimr   r   	sharedmemexcnameexcmemexcszexcvalretr
kernelargstvzero_streamstream_handler   ir   r   rk   excclsexc_argsloclocinfosymrt   linenoprefixwbr   s                                 @rv   launch_Kernel.launch-  sk   ""--/::kkM1G"MM;;GDMFFMM&,,7777\\^FMM!FM+ 
++T2DAqV:> 3    ..11!4KK06==?K 	V]] 	;%	;&	; '	; +		;
 (	; *.)9)9	; :::!!&"2"26":FEJ||q % 8==u!{519-u=;@A5aWq[15A||(,(8(8(F(Ft(L%#; G,/)C6!wwx8HFIFNFLOG 18eD,2HQK @B  %H  &wHh'' BD / >As   JJc                 <   [        U R                  5       H  nUR                  UUUUS9u  pM     [        U[        R
                  5      (       Ga  [        U5      R                  XC5      n[        R                  n[        R                  " S5      n	[        R                  " S5      n
U" UR                  5      nU" UR                  R                  5      n[        R                  " U5      n[        R                   (       a  [#        U5      n[        R                  " U5      nUR%                  U	5        UR%                  U
5        UR%                  U5        UR%                  U5        UR%                  U5        ['        UR(                  5       H'  nUR%                  U" UR*                  U   5      5        M)     ['        UR(                  5       H'  nUR%                  U" UR,                  U   5      5        M)     g[        U[        R.                  5      (       a+  [1        [        SU-  5      " U5      nUR%                  U5        gU[        R2                  :X  aY  [        R4                  " [6        R2                  " U5      R9                  [6        R:                  5      5      nUR%                  U5        gU[        R<                  :X  a(  [        R>                  " U5      nUR%                  U5        gU[        R@                  :X  a(  [        RB                  " U5      nUR%                  U5        gU[        RD                  :X  a1  [        RF                  " [#        U5      5      nUR%                  U5        gU[        RH                  :X  a_  UR%                  [        RB                  " URJ                  5      5        UR%                  [        RB                  " URL                  5      5        gU[        RN                  :X  a_  UR%                  [        R>                  " URJ                  5      5        UR%                  [        R>                  " URL                  5      5        g[        U[        RP                  [        RR                  45      (       aC  UR%                  [        RT                  " UR9                  [6        RV                  5      5      5        g[        U[        RX                  5      (       al  [        U5      R                  XC5      nURZ                  n[        R                   (       a  [        R                  " [#        U5      5      nUR%                  U5        g[        U[        R\                  5      (       aD  [_        U5      [_        U5      :X  d   e[a        X5       H  u  nnU Rc                  UUX4U5        M     g[        U[        Rd                  5      (       a*   U Rc                  UR                  URf                  X4U5        g[i        X5      e! [h         a    [i        X5      ef = f)z6
Convert arguments to ctypes and append to kernelargs
)r   r  r   zc_%sN)5reversedrD   prepare_argsr   r   Arrayr   	to_devicer   	c_ssize_tc_void_psizedtypeitemsizer   device_pointerr   intrX   rangendimshapestridesIntegergetattrfloat16c_uint16npviewuint16float64c_doublefloat32c_floatbooleanc_uint8	complex64realimag
complex128
NPDatetimeNPTimedeltac_int64int64Recorddevice_ctypes_pointer	BaseTuplelenr   r   
EnumMemberr   NotImplementedError)re   tyr   r   r  r  	extensiondevaryc_intpmeminfoparentnitemsr  ptrdataaxcvaldevrecr  r  s                       rv   r   _Kernel._prepare_argsu  sA    "$//2I,,	 - GB 3 b%++&&c],,T:F%%Fooa(G__Q'FFKK(Ffll334H''/C$$#h??3'Dg&f%f%h'd#FKK(!!&b)9":; )FKK(!!&);"<= ) EMM**66B;/4Dd#5== ??2::c?#7#7		#BCDd#5== ??3'Dd#5== >>#&Dd#5== >>#c(+Dd#5??"fnnSXX67fnnSXX675###foochh78foochh78U--u/@/@ABBfnnSXXbhh-?@AELL))c],,T:F..C$$ooc#h/c"EOO,,r7c#h&&&B1""1azB % E,,--3""HHciiz &b..	 ' 3)"223s   (X X)r_   rb   r^   rC   r`   rO   r6   r[   rA   ra   rD   rM   rc   r7   r@   rB   rd   r\   rG   )	NFFFFNNTFry   )r   r   r   ) __name__
__module____qualname____firstlineno____doc__r   r?   propertyrL   r]   r   r   r   classmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   __static_attributes____classcell__ru   s   @rv   r/   r/   (   s1   
 ;@JN6;Z Zx ! ! % %- - - * *  *N' 9 9 : : ; ; ? ? : :040,#"(,FP\/ \/r|   r/   c                   &    \ rS rSrS rS rS rSrg)ForAlli  c                 h    US:  a  [        SU-  5      eXl        X l        X0l        X@l        XPl        g )Nr   z0Can't create ForAll with negative task count: %s)r   
dispatcherntasksthread_per_blockr   r   )re   r[  r\  tpbr   r   s         rv   r?   ForAll.__init__  s;    A:O%& ' '$ #"r|   c                 .   U R                   S:X  a  g U R                  R                  (       a  U R                  nOU R                  R                  " U6 nU R	                  U5      nU R                   U-   S-
  U-  nX$X0R
                  U R                  4   " U6 $ )Nr   r   )r\  r[  specialized
specialize_compute_thread_per_blockr   r   )re   r   ra  r   r   s        rv   __call__ForAll.__call__  s    ;;!??&&//K//44d;K11+>;;)A-(:Hkk>>* +,02 	2r|   c                    U R                   nUS:w  a  U$ [        5       n[        [        UR                  R                  5       5      5      n[        UR                  R                  5       SU R                  SS9nUR                  " S0 UD6u  pbU$ )Nr   i   )funcb2d_funcmemsizeblocksizelimitr   )r]  r   nextiter	overloadsvaluesr   r_   r   r   get_max_potential_block_size)re   r[  r^  r   ro   kwargs_s          rv   rc   ForAll._compute_thread_per_block  s    ##!8J -C $z33::<=>F((335#	F 55??FAJr|   )r[  r\  r   r   r]  N)rN  rO  rP  rQ  r?   rd  rc  rU  r   r|   rv   rY  rY    s    #2r|   rY  c                        \ rS rSrS rS rSrg)_LaunchConfigurationi  c                     Xl         X l        X0l        X@l        XPl        [
        R                  (       a4  SnUS   US   -  US   -  nXv:  a  SU S3n[        [        U5      5        g g g )N   r   r      z
Grid size zB will likely result in GPU under-utilization due to low occupancy.)	r[  r   r   r   r   r   CUDA_LOW_OCCUPANCY_WARNINGSr   r   )	re   r[  r   r   r   r   min_grid_size	grid_sizemsgs	            rv   r?   _LaunchConfiguration.__init__  sy    $ "--  M
WQZ/'!*<I(#I; /A A,S12 ) .r|   c                     U R                   R                  XR                  U R                  U R                  U R
                  5      $ ry   )r[  callr   r   r   r   re   r   s     rv   rd  _LaunchConfiguration.__call__  s4    ##D,,$(KKA 	Ar|   )r   r[  r   r   r   N)rN  rO  rP  rQ  r?   rd  rU  r   r|   rv   rt  rt    s    3.Ar|   rt  c                   &    \ rS rSrS rS rS rSrg)CUDACacheImpli  c                 "    UR                  5       $ ry   )r   )re   ro   s     rv   r   CUDACacheImpl.reduce   s    $$&&r|   c                 .    [         R                  " S0 UD6$ )Nr   )r/   r   )re   rG   payloads      rv   rebuildCUDACacheImpl.rebuild#  s    *'**r|   c                     g)NTr   )re   ri   s     rv   check_cachableCUDACacheImpl.check_cachable&  s     r|   r   N)rN  rO  rP  rQ  r   r  r  rU  r   r|   rv   r  r    s    '+r|   r  c                   0   ^  \ rS rSrSr\rU 4S jrSrU =r	$ )	CUDACachei1  zK
Implements a cache that saves and loads CUDA kernels and compile results.
c                 t   > SSK Jn  U" S5         [        TU ]  X5      sS S S 5        $ ! , (       d  f       g = f)Nr   )target_overrider   )numba.core.target_extensionr  r>   load_overload)re   sigrG   r  ru   s       rv   r  CUDACache.load_overload7  s*     	@V$7(= %$$s   )
7r   )
rN  rO  rP  rQ  rR  r  _impl_classr  rU  rV  rW  s   @rv   r  r  1  s      K> >r|   r  c                   d  ^  \ rS rSrSrSr\r\4U 4S jjr	\
S 5       rS r\R                  " SS9S$S	 j5       rS
 rS%S jr\
S 5       rS rS rS rS rS r\
S 5       rS&S jrS&S jrS&S jrS&S jrS&S jrS rS&S jrS r S r!S&S jr"S&S jr#S&S jr$S&S jr%S&S  jr&\'S! 5       r(S" r)S#r*U =r+$ )'CUDADispatcheri@  az  
CUDA Dispatcher object. When configured and called, the dispatcher will
specialize itself for the given arguments (if no suitable specialized
version already exists) & compute capability, and launch on the device
associated with the current context.

Dispatcher objects are not to be constructed by the user, but instead are
created using the :func:`numba.cuda.jit` decorator.
Fc                 >   > [         TU ]  XUS9  SU l        0 U l        g )N)targetoptionspipeline_classF)r>   r?   _specializedspecializations)re   rB   r  r  ru   s       rv   r?   CUDADispatcher.__init__R  s.    (6 	 	8 "  "r|   c                 .    [         R                  " U 5      $ ry   )
cuda_typesr  rz   s    rv   _numba_type_CUDADispatcher._numba_type_b  s    ((..r|   c                 8    [        U R                  5      U l        g ry   )r  rB   _cacherz   s    rv   enable_cachingCUDADispatcher.enable_cachingf  s    -r|   rv  )maxsizec                 6    [        X5      u  p[        XX#U5      $ ry   )r   rt  )re   r   r   r   r   s        rv   	configureCUDADispatcher.configurei  s    7J#D8YOOr|   c                 T    [        U5      S;  a  [        S5      eU R                  " U6 $ )N)rw  r2      z.must specify at least the griddim and blockdim)r=  r   r  r  s     rv   __getitem__CUDADispatcher.__getitem__n  s)    t9I%MNN~~t$$r|   c                     [        XX#US9$ )a  Returns a 1D-configured dispatcher for a given number of tasks.

This assumes that:

- the kernel maps the Global Thread ID ``cuda.grid(1)`` to tasks on a
  1-1 basis.
- the kernel checks that the Global Thread ID is upper-bounded by
  ``ntasks``, and does nothing if it is not.

:param ntasks: The number of tasks.
:param tpb: The size of a block. An appropriate value is chosen if this
            parameter is not supplied.
:param stream: The stream on which the configured dispatcher will be
               launched.
:param sharedmem: The number of bytes of dynamic shared memory required
                  by the kernel.
:return: A configured dispatcher, ready to launch on a set of
         arguments.)r^  r   r   )rY  )re   r\  r^  r   r   s        rv   forallCUDADispatcher.foralls  s    ( diPPr|   c                 8    U R                   R                  S5      $ )a  
A list of objects that must have a `prepare_args` function. When a
specialized kernel is called, each argument will be passed through
to the `prepare_args` (from the last object in this list to the
first). The arguments to `prepare_args` are:

- `ty` the numba type of the argument
- `val` the argument value itself
- `stream` the CUDA stream used for the current call to the kernel
- `retr` a list of zero-arg functions that you may want to append
  post-call cleanup work to.

The `prepare_args` function must return a tuple `(ty, val)`, which
will be passed in turn to the next right-most `extension`. After all
the extensions have been called, the resulting `(ty, val)` will be
passed into Numba's default argument marshalling logic.
rD   )r  getrz   s    rv   rD   CUDADispatcher.extensions  s    & !!%%l33r|   c                      [        [        5      ery   )r   r   )re   r   rp  s      rv   rd  CUDADispatcher.__call__  s    233r|   c                     U R                   (       a-  [        [        U R                  R	                  5       5      5      nO![
        R                  R                  " U /UQ76 nUR                  XX4U5        g)z:
Compile if necessary and invoke this kernel with *args*.
N)	ra  rk  rl  rm  rn  r   r   
_cuda_callr  )re   r   r   r   r   r   ro   s          rv   r~  CUDADispatcher.call  sT     $t~~44678F ++66tCdCFdXyAr|   c                     U(       a   eU Vs/ s H  o0R                  U5      PM     nnU R                  [        U5      5      $ s  snf ry   )typeof_pyvalcompiler   )re   r   kwsarC   s        rv   _compile_for_args CUDADispatcher._compile_for_args  s>    w267$Q%%a($7||E(O,, 8s   Ac                      [        U[        R                  5      $ ! [        [        4 aL    [
        R                  " U5      (       a/  [        [
        R                  " USS9[        R                  5      s $ e f = f)NF)sync)r   r   argumentr   r   r   is_cuda_arrayas_cuda_array)re   r   s     rv   r  CUDADispatcher.typeof_pyval  sk    		#w//00, 	!!#&& d005A%..0 0 	s    AA86A8c                   ^  T R                   (       a  [        S5      e[        5       R                  n[	        U 4S jU 5       5      nT R
                  R                  X#45      nU(       a  U$ T R                  n[        T R                  US9nUR                  U5        UR                  5         SUl        UT R
                  X#4'   U$ )zL
Create a new instance of this dispatcher specialized for the given
*args*.
zDispatcher already specializedc              3   F   >#    U  H  nTR                  U5      v   M     g 7fry   )r  ).0r  re   s     rv   	<genexpr>,CUDADispatcher.specialize.<locals>.<genexpr>  s     <t!**1--ts   !)r  T)ra  r=   r   rE   r   r  r  r  r  rB   r  disable_compiler  )re   r   r:   rC   specializationr  s   `     rv   rb  CUDADispatcher.specialize  s    
 ?@@!44<t<<--112.A!!**'6CEx(&&(&*#-;R\*r|   c                     U R                   $ )z.
True if the Dispatcher has been specialized.
)r  rz   s    rv   ra  CUDADispatcher.specialized  s    
    r|   c                 Z   Ub#  U R                   UR                     R                  $ U R                  (       a6  [	        [        U R                   R                  5       5      5      R                  $ U R                   R                  5        VVs0 s H  u  p#X#R                  _M     snn$ s  snnf )a  
Returns the number of registers used by each thread in this kernel for
the device in the current context.

:param signature: The signature of the compiled kernel to get register
                  usage for. This may be omitted for a specialized
                  kernel.
:return: The number of registers used by the compiled variant of the
         kernel for the given signature and current device.
)rm  r   r   ra  rk  rl  rn  itemsre   r\   r  overloads       rv   get_regs_per_thread"CUDADispatcher.get_regs_per_thread  s      >>)..1AAAT^^22456FFF *.)=)=)?A)? 111)?A A A   B'c                 Z   Ub#  U R                   UR                     R                  $ U R                  (       a6  [	        [        U R                   R                  5       5      5      R                  $ U R                   R                  5        VVs0 s H  u  p#X#R                  _M     snn$ s  snnf )a  
Returns the size in bytes of constant memory used by this kernel for
the device in the current context.

:param signature: The signature of the compiled kernel to get constant
                  memory usage for. This may be omitted for a
                  specialized kernel.
:return: The size in bytes of constant memory allocated by the
         compiled variant of the kernel for the given signature and
         current device.
)rm  r   r   ra  rk  rl  rn  r  r  s       rv   get_const_mem_size!CUDADispatcher.get_const_mem_size  s      >>)..1@@@T^^22456EEE *.)=)=)?A)? 000)?A A Ar  c                 Z   Ub#  U R                   UR                     R                  $ U R                  (       a6  [	        [        U R                   R                  5       5      5      R                  $ U R                   R                  5        VVs0 s H  u  p#X#R                  _M     snn$ s  snnf )a  
Returns the size in bytes of statically allocated shared memory
for this kernel.

:param signature: The signature of the compiled kernel to get shared
                  memory usage for. This may be omitted for a
                  specialized kernel.
:return: The amount of shared memory allocated by the compiled variant
         of the kernel for the given signature and current device.
)rm  r   r   ra  rk  rl  rn  r  r  s       rv   get_shared_mem_per_block'CUDADispatcher.get_shared_mem_per_block        >>)..1FFFT^^22456KKK *.)=)=)?A)? 666)?A A Ar  c                 Z   Ub#  U R                   UR                     R                  $ U R                  (       a6  [	        [        U R                   R                  5       5      5      R                  $ U R                   R                  5        VVs0 s H  u  p#X#R                  _M     snn$ s  snnf )a  
Returns the maximum allowable number of threads per block
for this kernel. Exceeding this threshold will result in
the kernel failing to launch.

:param signature: The signature of the compiled kernel to get the max
                  threads per block for. This may be omitted for a
                  specialized kernel.
:return: The maximum allowable threads per block for the compiled
         variant of the kernel for the given signature and current
         device.
)rm  r   r   ra  rk  rl  rn  r  r  s       rv   get_max_threads_per_block(CUDADispatcher.get_max_threads_per_block  s      >>)..1GGGT^^22456LLL *.)=)=)?A)? 777)?A A Ar  c                 Z   Ub#  U R                   UR                     R                  $ U R                  (       a6  [	        [        U R                   R                  5       5      5      R                  $ U R                   R                  5        VVs0 s H  u  p#X#R                  _M     snn$ s  snnf )ay  
Returns the size in bytes of local memory per thread
for this kernel.

:param signature: The signature of the compiled kernel to get local
                  memory usage for. This may be omitted for a
                  specialized kernel.
:return: The amount of local memory allocated by the compiled variant
         of the kernel for the given signature and current device.
)rm  r   r   ra  rk  rl  rn  r  r  s       rv   get_local_mem_per_thread'CUDADispatcher.get_local_mem_per_thread/  r  r  c                 ,   U R                   (       a  U R                  [        U5      5        U R                  R                  nSR                  U5      n[        R                  " XCU R                  S9n[        R                  " U R                  5      nXVX4$ )z
Get a typing.ConcreteTemplate for this dispatcher and the given
*args* and *kws* types.  This allows resolution of the return type.

A (template, pysig, args, kws) tuple is returned.
zCallTemplate({0}))key
signatures)_can_compilecompile_devicer   rB   rN  formatr   make_concrete_templatenopython_signaturesr   pysignature)re   r   r  	func_namerZ   call_templatepysigs          rv   get_call_template CUDADispatcher.get_call_templateB  s{     d, LL))	")))455D,D,DF!!$,,/T..r|   c                    XR                   ;  Ga"  U R                     U R                  R                  S5      nU R                  R                  S5      nU R                  R                  S5      nU R                  R                  S5      nU R                  R                  S5      (       a  SOSUS.n[	        5       R
                  n[        U R                  X!UUUUUUS	9	n	XR                   U'   U	R                  R                  U	R                  U	R                  U	R                  /5        S
S
S
5        U	$ U R                   U   n	U	$ ! , (       d  f       W	$ = f)zCompile the device function for the given argument types.

Each signature is compiled once by caching the compiled function inside
this object.

Returns the `CompileResult`.
r6   r7   r8   r3   r4   r2   r   )r4   r3   r5   N)rm  _compiling_counterr  r  r   rE   r   rB   rG   insert_user_functionrA   rM   rL   )
re   r   return_typer6   r7   r8   r3   r9   r:   ri   s
             rv   r  CUDADispatcher.compile_device]  s6    ~~%((**..w7--11*=++//9--11*= !% 2 2 6 6u = =11 ( 
 ()<<#DLL+*/-5+1-51=')+ (,t$##889I9I9=:>,,I- )8  >>$'D9 )(8 s   DE
Ec                     U Vs/ s H  o3R                   PM     nnU R                  XASS9  XR                  U'   g s  snf )NTr   )_code_insertrm  )re   ro   rC   r  c_sigs        rv   add_overloadCUDADispatcher.add_overload  s9    "*+(Q(+U.#)x  ,s   ;c                    [         R                  " U5      u  p#Ub  U[        R                  :X  d   eU R                  (       a,  [        [        U R                  R                  5       5      5      $ U R                  R                  U5      nUb  U$ U R                  R                  XR                  5      nUb  U R                  U==   S-  ss'   OU R                  U==   S-  ss'   U R                  (       d  [!        S5      e[#        U R$                  U40 U R&                  D6nUR)                  5         U R                  R+                  X5        U R-                  XB5        U$ )zg
Compile and bind to the current context a version of this kernel
specialized for the given signature.
r   zCompilation disabled)r   normalize_signaturer   nonera  rk  rl  rm  rn  r  r  r  	targetctx_cache_hits_cache_missesr  r=   r/   rB   r  r   save_overloadr  )re   r  rC   r  ro   s        rv   r  CUDADispatcher.compile  s!   
 !) < <S A"kUZZ&??? T^^224566^^''1F! **3?S!Q&! s#q(#$$"#9::T\\8Jt7I7IJFKKMKK%%c2&+r|   c                    U R                   R                  S5      nUbK  U(       a'  U R                  U   R                  R	                  5       $ U R                  U   R                  5       $ U(       aG  U R                  R                  5        VVs0 s H  u  p4X4R                  R	                  5       _M!     snn$ U R                  R                  5        VVs0 s H  u  p4X4R                  5       _M     snn$ s  snnf s  snnf )z
Return the LLVM IR for this kernel.

:param signature: A tuple of argument types.
:return: The LLVM IR for the given signature, or a dict of LLVM IR
         for all previously-encountered signatures.

rh   )r  r  rm  rL   r   r   r  )re   r\   rh   r  r  s        rv   r   CUDADispatcher.inspect_llvm  s     ##''1 ~~i088EEGG~~i0==??-1^^-A-A-CE-CMC --::<<-CE E .2^^-A-A-CE-CMC 2244-CE EEEs   &C5C;c                 2   [        5       R                  nU R                  R                  S5      nUbM  U(       a(  U R                  U   R
                  R                  U5      $ U R                  U   R                  U5      $ U(       aH  U R                  R                  5        VVs0 s H   u  pEXER
                  R                  U5      _M"     snn$ U R                  R                  5        VVs0 s H  u  pEXER                  U5      _M     snn$ s  snnf s  snnf )z
Return this kernel's PTX assembly code for for the device in the
current context.

:param signature: A tuple of argument types.
:return: The PTX code for the given signature, or a dict of PTX codes
         for all previously-encountered signatures.
rh   )	r   rE   r  r  rm  rL   rN   r   r  )re   r\   r:   rh   r  r  s         rv   r   CUDADispatcher.inspect_asm  s     !44##''1 ~~i088DDRHH~~i0<<R@@-1^^-A-A-CE-CMC --99"==-CE E .2^^-A-A-CE-CMC 11"55-CE EEEs   $'D,Dc                    U R                   R                  S5      (       a  [        S5      eUb  U R                  U   R	                  5       $ U R                  R                  5        VVs0 s H  u  p#X#R	                  5       _M     snn$ s  snnf )aK  
Return this kernel's CFG for the device in the current context.

:param signature: A tuple of argument types.
:return: The CFG for the given signature, or a dict of CFGs
         for all previously-encountered signatures.

The CFG for the device in the current context is returned.

Requires nvdisasm to be available on the PATH.
rh   z'Cannot get the CFG of a device function)r  r  r=   rm  r   r  re   r\   r  defns       rv   r   CUDADispatcher.inspect_sass_cfg  s     !!(++HII >>),==?? &*^^%9%9%;=%;	 ..00%;= = =   )B	c                    U R                   R                  S5      (       a  [        S5      eUb  U R                  U   R	                  5       $ U R                  R                  5        VVs0 s H  u  p#X#R	                  5       _M     snn$ s  snnf )ag  
Return this kernel's SASS assembly code for for the device in the
current context.

:param signature: A tuple of argument types.
:return: The SASS code for the given signature, or a dict of SASS codes
         for all previously-encountered signatures.

SASS for the device in the current context is returned.

Requires nvdisasm to be available on the PATH.
rh   z(Cannot inspect SASS of a device function)r  r  r=   rm  r   r  r	  s       rv   r   CUDADispatcher.inspect_sass  s     !!(++IJJ >>),99;; &*^^%9%9%;=%;	 **,,%;= = =r  c                     Uc  [         R                  nU R                  R                  5        H  u  p#UR	                  US9  M     g)r   Nr   )r   r   rm  r  r   )re   r   rq  r
  s       rv   r   CUDADispatcher.inspect_types  s<     <::D~~++-GAD) .r|   c                     U " X5      nU$ )r   r   )r   rB   r  r   s       rv   r   CUDADispatcher._rebuild  s    
 w.r|   c                 >    [        U R                  U R                  S9$ )zL
Reduce the instance for serialization.
Compiled definitions are discarded.
)rB   r  )r   rB   r  rz   s    rv   r   CUDADispatcher._reduce_states  s     
 DLL"&"4"46 	6r|   )r  r  r  rM  )r   r   r   ry   ),rN  rO  rP  rQ  rR  
_fold_argsr   targetdescrr   r?   rS  r  r  r   	lru_cacher  r  r  rD   rd  r~  r  r  rb  ra  r  r  r  r  r  r  r  r  r  r   r   r   r   r   rT  r   r   rU  rV  rW  s   @rv   r  r  @  s    JK>J "  / /. %P &P%
Q, 4 4(4	B-0 ! !A&A(A&A*A&/6%N*
"HE.E0=*=,
*  6 6r|   r  ):numpyr)  rR   r   r   r   
numba.corer   r   r   r   r   r   numba.core.cachingr	   r
   numba.core.compiler_lockr   numba.core.dispatcherr   numba.core.errorsr   r   numba.core.typing.typeofr   r   numba.cuda.apir   numba.cuda.argsr   numba.cuda.compilerr   r   numba.cuda.cudadrvr   numba.cuda.cudadrv.devicesr   numba.cuda.descriptorr   numba.cuda.errorsr   r   
numba.cudar  numbar   r   warningsr   rQ   ReduceMixinr/   objectrY  rt  r  r  r  r   r|   rv   <module>r+     s     	 
   H H / 9 , F 4 - $ : % 2 -< *   * i/i## i/X+V +\A A:I $> >a6Z!6!6 a6r|   