
    ȅijM                        S SK r S SKrS SKJr  S SKrS SKJr  S\ R                  4S jr	S\ R                  4S jr
S\ R                  4S jrS\SS4S	 jrS\ R                  4S
 jrS\ R                  4S jrS\ R                  4S jrS\\   4S jr    S S\S\S\S-  S\S-  S\S-  S\S\\\4   4S jjr " S S5      r " S S5      r S!S\\-  S\\   S-  S\\\S4   -  4S jjr S"S\S\S\S\4S jjrg)#    N)Any)_get_device_indexreturnc                  
    SS K n [        R                  " [        U R	                  S5      S   5      5      nUR                  Ul        UR                  Ul        UR                   Ul        UR$                  Ul        UR(                  Ul        U$ ! [
        [        4 ad    [        R                  S:X  a7  [        R                  " S[        R                  R                  S    S35      n N[        R                  " S5      n Nf = f)Nr   amdhip64win32	amdhip64_.dllzlibamdhip64.so)rocm_sdkctypesCDLLstrfind_librariesImportError
IndexErrorsysplatformtorchversionhiphipGetErrorStringcuGetErrorStringhipModuleLoadDatacuModuleLoadDatahipModuleGetFunctioncuModuleGetFunctionhipModuleLaunchKernelcuLaunchKernelhipFuncSetAttributecuFuncSetAttribute)r   libs     K/home/james-whalen/.local/lib/python3.13/site-packages/torch/cuda/_utils.py_get_hip_runtime_libraryr#      s    	0kk#h55jA!DEF 00C00C!66C22C 44CJ $ 0<<7"++	%--*;*;A*>)?tDEC++./C	0s   5B AD)DDc                      [         R                  S:X  a  [        R                  " S5      $ [        R                  " S5      $ )Nr   z
nvcuda.dllzlibcuda.so.1)r   r   r   r        r"   _get_cuda_libraryr'   "   s,    
||w{{<(({{>**r&   c                  h    [         R                  R                  (       a
  [        5       $ [	        5       $ N)r   r   r   r#   r'   r%   r&   r"   _get_gpu_runtime_libraryr*   *   s!    }}')) ""r&   resultc                    U S:X  a  g [         R                  " 5       n[        5       nUR                  U [         R                  " U5      5        UR
                  b  UR
                  R                  5       OSn[        SU 35      e)Nr   Unknown CUDA errorCUDA error: )r   c_char_pr*   r   byrefvaluedecodeRuntimeError)r+   err_strlibcudaerror_messages       r"   _check_cudar7   2   sn    {ooG&(GVV\\'%:;")--";AU  m_5
66r&   c                      SS K n [        R                  " [        U R	                  S5      S   5      5      nUR                  Ul        UR                  Ul        UR"                  Ul        UR&                  Ul        UR*                  Ul        UR.                  Ul        UR2                  Ul        UR6                  Ul        UR:                  Ul        UR>                  Ul         U$ ! [
        [        4 a    [        R                  S:X  ah  SR                  S[        R                  R                  S   S[        R                  R                  S   /5      n[        R                  " SU S35      n GN8[        R                  " S5      n GNQf = f)	Nr   hiprtcr    0   r
   zlibhiprtc.so)!r   r   r   r   r   r   r   r   r   joinr   r   r   hiprtcGetErrorStringnvrtcGetErrorStringhiprtcCreateProgramnvrtcCreateProgramhiprtcDestroyProgramnvrtcDestroyProgramhiprtcCompileProgramnvrtcCompileProgramhiprtcGetCodeSizenvrtcGetPTXSizehiprtcGetCodenvrtcGetPTXhiprtcGetProgramLogSizenvrtcGetProgramLogSizehiprtcGetProgramLognvrtcGetProgramLoghiprtcAddNameExpressionnvrtcAddNameExpressionhiprtcGetLoweredNamenvrtcGetLoweredName)r   r!   version_strs      r"   _get_hiprtc_libraryrS   >   s@   .kk#h55h?BCD "66C 44C!66C!66C//C''CO!$!<!<C 44C!$!<!<C!66CJ) $ .<<7"''emm''*C1B1B11EFK ++{m489C++n-C.s   5C# #B	F	/F	F	c                  *   [        [        R                  R                  R	                  S5      S   5      n [
        R                  S:X  a  SU  S3/nOSU  3S/nU H  n [        R                  " U5      s  $    [        S5      e! [         a     M6  f = f)	N.r   r   nvrtc64_z0_0.dllzlibnvrtc.so.zlibnvrtc.soz Could not find any NVRTC library)
intr   r   cudasplitr   r   r   r   OSError)major_version
nvrtc_libslib_names      r"   _get_nvrtc_libraryr^   [   s    **005a89M
||w}oW-


 =/*

 	;;x(( 
 4
55  		s   B
BBc                  h    [         R                  R                  (       a
  [        5       $ [	        5       $ r)   )r   r   r   rS   r^   r%   r&   r"   _get_gpu_rtc_libraryr`   n   s#     }}"$$!##r&   c                      SSK Jn Jn  S1nU Vs/ s H  o3U;  d  M
  UPM     nn[        R                  R
                  (       a  UR                  U 5        U$ s  snf )z
Get HIPCC/NVCC flags that are compatible with NVRTC compilation.

Returns:
    List of HIPCC/NVCC flags that can be safely used with NVRTC.
r   )COMMON_HIPCC_FLAGSCOMMON_NVCC_FLAGSz--expt-relaxed-constexpr)torch.utils.cpp_extensionrb   rc   r   r   r   extend)rb   rc   nvrtc_unsupported_flagsflagcompatible_flagss        r"   _get_gpu_rtc_compatible_flagsri   w   sc     P 	# +*:Q.Q*   }} 23s
   	AAkernel_sourcekernel_namecompute_capabilitycuda_include_dirsnvcc_optionsauto_pchc           
      ~	  ^^ SSK n[        5       mSmS[        SS4UU4S jjnU R                  S5      nUcv  UR                  R                  UR                  R                  5       5      n	UR                  R                  (       a  U	R                   nOU	R                   U	R                   3n/ n
UR                  R                  (       a#  U
R                  SU 3R                  5       5        O"U
R                  SU 3R                  5       5        SS	KJn  U" S
5      nU H%  nU
R                  SU 3R                  5       5        M'     U(       a+  U H%  nU
R                  SU 3R                  5       5        M'     U(       a@  [        UR                  R                  5      S:  d   S5       eUc  / nUR                  S5        U(       a)  U H#  nU
R                  UR                  S5      5        M%     [!        5       nU
R#                  U Vs/ s H  nUR                  S5      PM     sn5        [%        U
5      n[&        R(                  U-  " U
6 n[&        R*                  " 5       nU" TR-                  [&        R.                  " U5      UU S3R                  5       SSS5      5        UR                  S5      nU" TR1                  UU5      5        TR3                  UUU5      nUT:w  a  [&        R4                  " 5       nTR7                  U[&        R.                  " U5      5        [&        R8                  " UR:                  5      nTR=                  UU5        [?        SUR:                  RA                  5        35      e[&        R4                  " 5       nU" TRC                  U[&        R.                  " U5      5      5        [&        R8                  " UR:                  5      nU" TRE                  UU5      5        [&        R(                  " 5       nU" TRG                  UU[&        R.                  " U5      5      5        UR:                  b  UR:                  RA                  5       nOSnTRI                  [&        R.                  " U5      5        UR                  R                  (       a  URJ                  OUR:                  nUU4$ s  snf )a  
Compiles a CUDA kernel using NVRTC and returns the PTX code.

Args:
    kernel_source (str): The CUDA kernel source code as a string
    kernel_name (str): The name of the kernel function to compile
    compute_capability (str, None): The compute capability to target (e.g., "86").
                                       If None, will detect from current device.
    cuda_include_dirs (list, None): List of directories containing CUDA headers
    nvcc_options (list, None): Additional options to pass to NVRTC
    auto_pch (bool): Enable automatic precompiled headers (CUDA 12.8+)

Returns:
    Tuple[bytes, str]: The compiled PTX code and mangled kernel name
r   Nr+   r   c                    > U T:w  ar  [         R                  " 5       nTR                  U [         R                  " U5      5        UR                  b  UR                  R                  5       OSn[        SU 35      eg )Nr-   r.   )r   r/   r?   r0   r1   r2   r3   )r+   r4   r6   NVRTC_SUCCESSlibnvrtcs      r"   check_nvrtc#_nvrtc_compile.<locals>.check_nvrtc   so    ]"oo'G((g1FG ==, $$&) 
 m_=>> #r&   utf-8z--offload-arch=z--gpu-architecture=sm_)include_pathsrX   z-Iz12.8zPCH requires CUDA 12.8+z--pchz.cuzKernel compilation failed:
r:   )&
torch.cudar`   rW   encoderX   get_device_propertiescurrent_devicer   r   gcnArchNamemajorminorappendrd   rw   r   ri   re   lenr   r/   c_void_prA   r0   rO   rE   c_size_trK   create_string_bufferr1   rM   r3   r2   rG   rI   rQ   rC   raw) rj   rk   rl   rm   rn   ro   r   rt   source_bytespropsoptionsrw   cuda_include_paths	cuda_path	directoryoptionnvrtc_compatible_flagsrg   num_optionsoptions_arrayprogc_kernel_namereslog_sizelogptx_sizeptxc_mangled_namemangled_name	ptx_bytesrr   rs   s                                  @@r"   _nvrtc_compiler      s   0  $%H M	?C 	?D 	? 	? !''0L !

001J1J1LM==$)$5$5#6$)KK=!> G}});(<=DDFG/0B/CDKKMN 8&v.'	I;'..01 ( *INNR	{+2245 + 5==%%&&0K2KK0LG$ "FNN6==12 # ;<NN5KL5KTDKK(5KLM g,K__{2W=M ??D##LLm3&&(	
	  &&w/M//mDE 
&
&t[-
HC m??$''fll8.DE))(..9##D#.9#)):J:J:L9MNOO  H((v||H/EFG

%
%hnn
5C$$T3/0 __&N$$T=&,,~:VW '%++224  d!34
 !==,,#))Il""s Ms   R:c                   L    \ rS rSrS\R
                  SS4S jrS\SS4S jrS	r	g)
_CudaModulei"  moduler   Nc                     Xl         0 U l        g r)   )_module_kernels)selfr   s     r"   __init___CudaModule.__init__#  s    02r&   name_CudaKernelc           	         XR                   ;   a  U R                   U   $ SSKJn  U" 5       n[        R                  " 5       n [        UR                  [        R                  " U5      U R                  UR                  S5      5      5        [        X@R                  5      nXPR                   U'   U$ ! [         a  n[        SU S35      UeS nAff = f)Nr   )r*   rv   zNo kernel named 'z' in this module)r   torch.cuda._utilsr*   r   r   r7   r   r0   r   ry   r   r3   AttributeError)r   r   r*   r5   funckernelerrs          r"   __getattr___CudaModule.__getattr__'  s    == ==&& 	?*, 	V++LL&dkk'6J
 !||4F"(MM$M 	V #4TF:J!KLRUU	Vs   A-B0 0
C:C

C)r   r   )
__name__
__module____qualname____firstlineno__r   r   r   r   r   __static_attributes__r%   r&   r"   r   r   "  s/    3v 34 3V V Vr&   r   c                       \ rS rSrSrS\R                  S\R                  SS4S jr     SS\\	\	\	4   S	\\	\	\	4   S
\
S-  S\	S\S-  SS4S jjrS\	SS4S jrSrg)r   i@  zL
Represents a compiled CUDA kernel that can be called with PyTorch tensors.
r   r   r   Nc                 *    Xl         X l        SU l        g )Nr   )r   r   _max_shared_mem_bytes)r   r   r   s      r"   r   _CudaKernel.__init__E  s    	%&"r&   gridblockargs
shared_memstreamc                    SSK nUR                  R                  R                  5       nU(       d  / n/ n/ n	U GHv  n
[	        XR
                  5      (       a  U
R                  (       d1  U
R                  (       a  U
R                  5       (       d  [        S5      e[        R                  " U
R                  5       5      nUR                  U5        U	R                  [        R                  " U5      5        M  [	        U
[        5      (       a>  [        R                   " U
5      nU	R                  [        R                  " U5      5        GM  [	        U
["        5      (       a>  [        R$                  " U
5      nU	R                  [        R                  " U5      5        GMb  ['        S[)        U
5       35      e   [        R                  [+        U	5      -  " 5       n[-        U	5       H,  u  p[        R.                  " U
[        R                  5      X'   M.     Uc  SSKnUR                  R3                  5       nUS:  aS  U R4                  S:X  d  X@R4                  :  a4  U R4                  S:X  a  SOSU R4                   S3n[7        S	U S
U S35      e[9        UR;                  U R<                  US   US   US   US   US   US   UUR>                  US5      5        g)a  
Call the compiled CUDA kernel

Args:
    grid (tuple): Grid dimensions (grid_x, grid_y, grid_z)
    block (tuple): Block dimensions (block_x, block_y, block_z)
    args (list): List of arguments to pass to the kernel.
                 PyTorch tensor arguments will be automatically converted to pointers.
    shared_mem (int): Shared memory size in bytes
    stream (torch.cuda.Stream): CUDA stream to use. If None, uses current stream.
r   Nz?All tensor arguments must be CUDA tensors or pinned CPU tensorszUnsupported argument type:    znot configuredzonly z bytes configuredzKernel requires z' bytes of shared memory (>= 48KB), but ze. Call kernel.set_shared_memory_config(shared_mem) after compilation and before launching the kernel.   r<   ) r   rX   _utilsr*   
isinstanceTensoris_cudais_cpu	is_pinned
ValueErrorr   r   data_ptrr   r0   rW   c_intfloatc_double	TypeErrortyper   	enumeratecastrx   current_streamr   r3   r7   r   r   _as_parameter_)r   r   r   r   r   r   r   r5   processed_argsc_argsargptrr   r   c_args_arrayiconfigured_msgs                    r"   __call___CudaKernel.__call__J  sT   & 	**##<<>D 13C#||,,{{CJJ3==??$Y  ooclln5%%c*fll3/0C%%S)fll512C''!??3/fll845"=d3i[ IJJ+ 0 #f+58'FA$kk#v?LO ( >ZZ..0F "&&!+z<V<V/V --2 !T7788IJ 
 ":, /%& '33  	""		QQQaaa%%	
r&   shared_mem_bytesc                 z   US:  a  Xl         g [        5       n[        R                  R	                  5       n[        R
                  R                  (       a  UR                  S:w  a  SOSnO[        USS5      nX:  a  [        SU SU S35      eS	n[        UR                  U R                  UU5      5        Xl         g )
Nr   gfx950i   i  shared_memory_per_block_optinzRequested shared memory (z bytes) exceeds device limit (z= bytes). Consider reducing block size or shared memory usage.   )r   r*   r   rX   rz   r   r   r|   getattrr3   r7   r    r   )r   r   r5   device_propsmax_shared_mem+cudaFuncAttributeMaxDynamicSharedMemorySizes         r"   set_shared_memory_config$_CudaKernel.set_shared_memory_config  s    i')9&*, zz779== &11X=:  %=uN ,+,<+= >!!/ 0 1GG  783&&		; 	
 &6"r&   )r   r   r   )r   r   r   r   Nr   N)r   r   r   r   __doc__r   r   r   tuplerW   listr   r   r   r   r%   r&   r"   r   r   @  s    'V__ 'foo '$ ' &/&/ !_
CcM"_
 S#s]#_
 Tk	_

 _
 d
_
 
_
B(6 (6 (6r&   r   r   kernel_namesc           
      \   SSK n[        5       n[        U [        5      (       a  U R	                  S5      n [
        R                  " 5       nUR                  R                  5       nU   [        UR                  [
        R                  " U5      U 5      5        SSS5        U(       d  [        U5      $ 0 nU Hc  n[
        R                  " 5       n[        UR                  [
        R                  " U5      XGR	                  S5      5      5        [        X5      Xg'   Me     U$ ! , (       d  f       N= f)a  
Loads a CUDA module from PTX code and returns a module object that can access kernels.

Args:
    ptx (bytes or str): The PTX code to load
    kernel_names (list, optional): List of kernel names to extract from the module.
                                  If None, will return a module object with __getattr__.

Returns:
    object: If kernel_names is None, returns a module object with __getattr__ to access kernels.
           If kernel_names is provided, returns a dict mapping kernel names to _CudaKernel objects.
r   Nrv   )rx   r*   r   r   ry   r   r   rX   r   r7   r   r0   r   r   r   )	r   r   r   r5   r   r   kernelsr   r   s	            r"   _cuda_load_moduler     s       '(G #sjj! __FZZ&&(F	G,,V\\&-A3GH 
 6"" G ''T"FKK,@	

 $D1  N! 
s   &0D
D+deviceoptional	allow_cpuc                    [        U [        5      (       a  U $ [        U [        5      (       a  [        R                  " U 5      n [        U [        R                  5      (       aD  U(       a  U R
                  S;  a  [        SU  35      eOU R
                  S:w  a  [        SU  35      e[        R                  R                  5       (       d5  [        U [        R                  R                  5      (       a  U R                  $ [        XU5      $ )a  Get the device index from :attr:`device`, which can be a torch.device object, a Python integer, or ``None``.

If :attr:`device` is a torch.device object, returns the device index if it
is a CUDA device. Note that for a CUDA device without a specified index,
i.e., ``torch.device('cuda')``, this will return the current default CUDA
device if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``,
CPU devices will be accepted and ``-1`` will be returned in this case.

If :attr:`device` is a Python integer, it is returned as is.

If :attr:`device` is ``None``, this will return the current default CUDA
device if :attr:`optional` is ``True``.
)rX   cpuz(Expected a cuda or cpu device, but got: rX   z!Expected a cuda device, but got: )r   rW   r   r   r   r   r   jitis_scriptingrX   idx_torch_get_device_index)r   r   r   s      r"   r   r     s      &#&#f%&%,,''{{/1 #KF8!TUU 2[[F"@IJJ99!!##fejj//00::"6Y??r&   )NNNFr)   )FF)r   r   typingr   r   torch._utilsr   r   r   r#   r'   r*   rW   r7   rS   r^   r`   r   r   ri   boolr   bytesr   r   r   dictr   r%   r&   r"   <module>r      s    
   F&++ .+6;; +#&++ #	7 	7 	7V[[ :6FKK 6&$fkk $tCy 6 &*%) $P#P#P# d
P# d{	P#
 +P# P# 5#:P#fV V<S6 S6n 8<-	u-$(I$4-4]*++-b <A@@@48@@r&   