
    -jiA                    X   S r SSKJr  SSKrSSKJr  SSKJr  SSKrSSK	J
r
  SSKJr  SSKJr  \(       a  SS	KJr  SSKrSSKrOSS
KJr  \" S5      r\" S5      r\" \5      rSS jr " S S\R0                  R2                  5      r " S S5      r  S                 SS jjrg)a  Notations in this Gaussian process implementation

X_train: Observed parameter values with the shape of (len(trials), len(params)).
y_train: Observed objective values with the shape of (len(trials), ).
x: (Possibly batched) parameter value(s) to evaluate with the shape of (..., len(params)).
cov_fX_fX: Kernel matrix X = V[f(X)] with the shape of (len(trials), len(trials)).
cov_fx_fX: Kernel matrix Cov[f(x), f(X)] with the shape of (..., len(trials)).
cov_fx_fx: Kernel scalar value x = V[f(x)]. This value is constant for the Matern 5/2 kernel.
cov_Y_Y_inv:
    The inverse of the covariance matrix (V[f(X) + noise_var])^-1 with the shape of
    (len(trials), len(trials)).
cov_Y_Y_inv_Y: `cov_Y_Y_inv @ y` with the shape of (len(trials), ).
max_Y: The maximum of Y (Note that we transform the objective values such that it is maximized.)
sqd: The squared differences of each dimension between two points.
is_categorical:
    A boolean array with the shape of (len(params), ). If is_categorical[i] is True, the i-th
    parameter is categorical.
    )annotationsN)Any)TYPE_CHECKING)*single_blas_thread_if_scipy_v1_15_or_newer)optuna_warn)
get_logger)Callable)_LazyImportscipytorchc                
   [         R                  " U 5      n[         R                  " U5      (       a  U $ [        S5        [         R                  " USS9n[         R
                  " U [         R                  " U[         R                  " [         R                  " X[         R                  5      SS9S5      [         R                  " U[         R                  " [         R                  " X[         R                  * 5      SS9S5      5      $ )NzDClip non-finite values to the min/max finite values for GP fittings.r   )axis        )
npisfiniteallr   anyclipwheremininfmax)valuesis_values_finiteis_any_finites      G/home/james-whalen/.local/lib/python3.13/site-packages/optuna/_gp/gp.pywarn_and_convert_infr   /   s    {{6*	vvVWFF+!4M 77
rxx0@"&&'QXY Z\_`
rxx0@266''RYZ []`a     c                  <    \ rS rSr\SS j5       r\SS j5       rSrg)Matern52Kernel?   c                    [         R                  " SU-  5      n[         R                  " U* 5      nUSU-  U-   S-   -  nSUS-   -  U-  nU R                  U5        U$ )a  
This method calculates `exp(-sqrt5d) * (1/3 * sqrt5d ** 2 + sqrt5d + 1)` where
`sqrt5d = sqrt(5 * squared_distance)`.

Please note that automatic differentiation by PyTorch does not work well at
`squared_distance = 0` due to zero division, so we manually save the derivative, i.e.,
`-5/6 * (1 + sqrt5d) * exp(-sqrt5d)`, for the exact derivative calculation.

Notice that the derivative of this function is taken w.r.t. d**2, but not w.r.t. d.
   g?   g)r   sqrtexpsave_for_backward)ctxsquared_distancesqrt5dexp_partvalderivs         r   forwardMatern52Kernel.forward@   sh     A 00199fW%5$44v=ABFQJ'(2e$
r   c                &    U R                   u  nX!-  $ )z
Let x be squared_distance, f(x) be forward(ctx, x), and g(f) be a provided function, then
deriv := df/dx, grad := dg/df, and deriv * grad = df/dx * dg/df = dg/dx.
)saved_tensors)r(   gradr-   s      r   backwardMatern52Kernel.backwardS   s     $$|r    N)r(   r   r)   torch.Tensorreturnr6   )r(   r   r2   r6   r7   r6   )__name__
__module____qualname____firstlineno__staticmethodr.   r3   __static_attributes__r5   r   r   r    r    ?   s(     $  r   r    c                      \ rS rSr              SS jr\SS j5       rSS jr S     SS jjrSSS jjr	SS jr
          SS	 jrS
rg)GPRegressor]   c                   Xl         X l        X0l        UR                  S5      UR                  S5      -
  R	                  5       U l        U R                   R                  5       (       aT  U R
                  SU R                   4   S:  R                  [        R                  5      U R
                  SU R                   4'   S U l
        S U l        X@l        XPl        X`l        g )N.r   )_is_categorical_X_train_y_train	unsqueezesquare__squared_X_diffr   typer   float64_cov_Y_Y_chol_cov_Y_Y_inv_Yinverse_squared_lengthscaleskernel_scale	noise_var)selfis_categoricalX_trainy_trainrN   rO   rP   s          r   __init__GPRegressor.__init__^   s      . ' 1 1" 58I8I"8M MVVX##%%$$S$*>*>%>?#Ed5==!   d&:&:!:; 3737,H)("r   c                    S[         R                  " U R                  R                  5       R	                  5       R                  5       5      -  $ )Ng      ?)r   r%   rN   detachcpunumpy)rQ   s    r   length_scalesGPRegressor.length_scalesv   s7    RWWT>>EEGKKMSSUVVVr   c           	     Z   U R                   c  U R                  b   S5       e[        R                  " 5          U R	                  5       R                  5       R                  5       R                  5       nS S S 5        W[        R                  " U R                  R                  S   5      ==   U R                  R                  5       -  ss'   [        R                  R                  U5      n[         R                  R#                  UR$                  [         R                  R#                  X R&                  R                  5       R                  5       SS9SS9n[        R(                  " U5      U l         [        R(                  " U5      U l        U R*                  R                  5       U l        S U R*                  l        U R.                  R                  5       U l        S U R.                  l        U R                  R                  5       U l        S U R                  l        g ! , (       d  f       GN= f)Nz(Cannot call cache_matrix more than once.r   T)lowerF)rL   rM   r   no_gradkernelrX   rY   rZ   r   diag_indicesrE   shaperP   itemlinalgcholeskyr   solve_triangularTrF   
from_numpyrN   r2   rO   )rQ   cov_Y_Ycov_Y_Y_cholcov_Y_Y_inv_Ys       r   _cache_matrixGPRegressor._cache_matrixz   s   !!)d.A.A.I 	
6	
I ]]_kkm**,00288:G  	 3 3A 678DNN<O<O<QQ8yy))'2 55NNLL)),8I8I8K8Q8Q8S[_)` 6 
 #--l;#..}=,0,M,M,T,T,V)15)). --446!%..0"+ _s   ;H
H*Nc                   Uc  Ub   eU R                   nOUc  U R                  nUR                  S:X  a  X-
  O"UR                  S5      UR                  S5      -
  R	                  5       nU R
                  R                  5       (       a@  USU R
                  4   S:  R                  [        R                  5      USU R
                  4'   UR                  U R                  5      n[        R                  U5      U R                  -  $ )a  
Return the kernel matrix with the shape of (..., n_A, n_B) given X1 and X2 each with the
shapes of (..., n_A, len(params)) and (..., n_B, len(params)).

If x1 and x2 have the shape of (len(params), ), kernel(x1, x2) is computed as:
    kernel_scale * Matern52Kernel.apply(
        sqd(x1, x2) @ inverse_squared_lengthscales
    )
where if x1[i] is continuous, sqd(x1, x2)[i] = (x1[i] - x2[i]) ** 2 and if x1[i] is
categorical, sqd(x1, x2)[i] = int(x1[i] != x2[i]).
Note that the distance for categorical parameters is the Hamming distance.
r$   rB   rC   .r   )rI   rE   ndimrG   rH   rD   r   rJ   r   rK   matmulrN   r    applyrO   )rQ   X1X2sqdsqdists        r   r`   GPRegressor.kernel   s     :::&&Cz]] ggl27R0@2<<PRCS0S\\^C##''))25c4;O;O6O2PSV2V1\1\MM2C---. D==>##F+d.?.???r   c           
     j   U R                   b  U R                  c   S5       eUR                  S:H  nU(       d  UOUR                  S5      n[        R
                  R                  U R                  U5      =oPR                  5      n[        R
                  R                  U R                   [        R
                  R                  U R                   R                  USSS9SSS9nU(       ab  U(       a   S5       eU R                  XD5      nXR                  UR                  SS	5      5      -
  n	U	R                  S	SS
9R                  S5        O?U R                  nU[        R
                  R                  XW5      -
  n	U	R                  S5        U(       a"  UR                  S5      U	R                  S5      4$ Xi4$ )a  
This method computes the posterior mean and variance given the points `x` where both mean
and variance tensors will have the shape of x.shape[:-1].
If ``joint=True``, the joint posterior will be computed.

The posterior mean and variance are computed as:
    mean = cov_fx_fX @ inv(cov_fX_fX + noise_var * I) @ y, and
    var = cov_fx_fx - cov_fx_fX @ inv(cov_fX_fX + noise_var * I) @ cov_fx_fX.T.

Please note that we clamp the variance to avoid negative values due to numerical errors.
z+Call cache_matrix before calling posterior.r$   r   TF)upperleftz3Call posterior with joint=False for a single point.rB   )dim1dim2r   )rL   rM   ro   rG   r   rd   vecdotr`   rf   rg   rp   	transposediagonal
clamp_min_rO   squeeze)
rQ   xjointis_single_pointx_	cov_fx_fXmeanV	cov_fx_fxvar_s
             r   	posteriorGPRegressor.posterior   sr    !!-$2E2E2Q 	
9	
Q &&A+%Q1;;q>||""B#?9ATATULL))LL))$*<*<*>*>	QU\a)b	 * 
 &](]]&B+Ixx	(;(;B(CDDDMMrM+66s;))Iu||229@@DOOC 5DQa1V4,Vr   c                0   U R                   R                  S   nSU-  [        R                  " S[        R                  -  5      -  nU R                  5       U R                  [        R                  " U[        R                  S9-  -   n[        R                  R                  U5      nUR                  5       R                  5       R                  5       * n[        R                  R                  X@R                  SS2S4   SS9SS2S4   nSXf-  -  nXR-   U-   $ )a%  
This method computes the marginal log-likelihood of the kernel hyperparameters given the
training dataset (X, y).
Assume that N = len(X) in this method.

Mathematically, the closed form is given as:
    -0.5 * log((2*pi)**N * det(C)) - 0.5 * y.T @ inv(C) @ y
    = -0.5 * log(det(C)) - 0.5 * y.T @ inv(C) @ y + const,
where C = cov_Y_Y = cov_fX_fX + noise_var * I and inv(...) is the inverse operator.

We exploit the full advantages of the Cholesky decomposition (C = L @ L.T) in this method:
    1. The determinant of a lower triangular matrix is the diagonal product, which can be
       computed with N flops where log(det(C)) = log(det(L.T @ L)) = 2 * log(det(L)).
    2. Solving linear system L @ u = y, which yields u = inv(L) @ y, costs N**2 flops.
Note that given `u = inv(L) @ y` and `inv(C) = inv(L @ L.T) = inv(L).T @ inv(L)`,
y.T @ inv(C) @ y is calculated as (inv(L) @ y) @ (inv(L) @ y).

In principle, we could invert the matrix C first, but in this case, it costs:
    1. 1/3*N**3 flops for the determinant of inv(C).
    2. 2*N**2-N flops to solve C @ alpha = y, which is alpha = inv(C) @ y.

Since the Cholesky decomposition costs 1/3*N**3 flops and the matrix inversion costs
2/3*N**3 flops, the overall cost for the former is 1/3*N**3+N**2+N flops and that for the
latter is N**3+2*N**2-N flops.
r   g         dtypeNF)rx   )rE   rb   mathlogpir`   rP   r   eyerK   rd   re   r   sumrf   rF   )rQ   n_pointsconstri   Llogdet_partinv_L_y	quad_parts           r   marginal_log_likelihood#GPRegressor.marginal_log_likelihood   s    4 ==&&q)x$((1tww;"77++-$..599XU]]3["[[LL!!'*zz|'')--//,,//==D3IQV/WXY[\X\]G-.	"Y..r   c           	     .  ^ ^^^^	 T R                   R                  S   m	[        R                  " [        R                  " T R
                  R                  5       R                  5       R                  5       5      [        R                  " T R                  R                  5       5      [        R                  " T R                  R                  5       ST-  -
  5      //5      nS
UUUU	U 4S jjn[        5          [        R                  R                  UUSSSU0S9nS S S 5        WR                   (       d  [#        SUR$                   35      e[&        R(                  " UR*                  5      n[&        R,                  " US T	 5      T l        [&        R,                  " UT	   5      T l	        T(       a#  [&        R.                  " T[&        R0                  S	9OT[&        R,                  " UT	S-      5      -   T l        T R3                  5         T $ ! , (       d  f       N= f)Nr$   gGz?c                  > [         R                  " U 5      R                  S5      n[         R                  " 5          [         R                  " US T 5      Tl        [         R                  " UT   5      Tl        T(       a#  [         R                  " T[         R                  S9O[         R                  " UTS-      5      T-   Tl	        TR                  5       * T" T5      -
  nUR                  5         UR                  TS-      nT(       a  US:X  d   eS S S 5        WR                  5       UR                  R                  5       R                  5       R!                  5       4$ ! , (       d  f       NT= f)NTr   r$   r   )r   rh   requires_grad_enable_gradr&   rN   rO   tensorrK   rP   r   r3   r2   rc   rX   rY   rZ   )	
raw_paramsraw_params_tensorlossraw_noise_var_graddeterministic_objective	log_priorminimum_noisen_paramsrQ   s	       r   	loss_func1GPRegressor._fit_kernel_params.<locals>.loss_func  s&    % 0 0 < K KD Q""$49II>OPYQY>Z4[1$)II.?.I$J! / LLemmD#4X\#BCmS 
 44664H%6%;%;HqL%I"26HA6MMM % 99; 1 6 6 = = ? C C E K K MMM %$s   CE  
E.Tzl-bfgs-bgtol)jacmethodoptionszOptimization failed: r   )r   
np.ndarrayr7   ztuple[float, np.ndarray])rE   rb   r   concatenater   rN   rX   rY   rZ   rO   rc   rP   r   r   optimizeminimizesuccessRuntimeErrormessager   rh   r   r&   r   rK   rl   )
rQ   r   r   r   r   initial_raw_paramsr   resraw_params_opt_tensorr   s
   ````     @r   _fit_kernel_paramsGPRegressor._fit_kernel_params   s    ==&&q)  ^^t88??AEEGMMOPFF4,,1134FF4>>..04-3GGH	
	N 	N" 89..))"! * C : {{!6s{{mDEE % 0 0 7,1II6KIX6V,W)!II&;H&EF ' LLemm<+@A+N!OO 	
 	- :9s   5$H
H)	rE   rL   rM   rD   rI   rF   rN   rO   rP   )rR   r6   rS   r6   rT   r6   rN   r6   rO   r6   rP   r6   r7   None)r7   r   )r7   r   )NN)rr   torch.Tensor | Noners   r   r7   r6   )F)r   r6   r   boolr7   z!tuple[torch.Tensor, torch.Tensor])r7   r6   )
r   %Callable[[GPRegressor], torch.Tensor]r   floatr   r   r   r   r7   r?   )r8   r9   r:   r;   rU   propertyr[   rl   r`   r   r   r   r=   r5   r   r   r?   r?   ]   s    #$# # 	#
 '3# ##  # 
#0 W W#8 IM@%@2E@	@<#WJ!/F@8@ @ "&	@
 @ 
@r   r?   c           
     J  ^ ^^^ [         R                  " T R                  S   S-   [         R                  S9mS	U UUU4S jjnU" 5       n	Uc  U" 5       nS n
Xi4 H|  n [	        [         R
                  " T5      [         R
                  " T 5      [         R
                  " T5      UR                  UR                  UR                  S9R                  UUUUS9s  $    [        R                  SU
 S35        U" 5       nUR                  5         U$ ! [         a  nUn
 S nAM  S nAff = f)
Nr$   r   r   c            	        > [        [        R                  " T5      [        R                  " T 5      [        R                  " T5      TS S R                  5       TS   R                  5       TS   R                  5       S9$ )NrB   rz   rR   rS   rT   rN   rO   rP   )r?   r   rh   clone)XYdefault_kernel_paramsrR   s   r   _default_gpr'fit_kernel_params.<locals>._default_gprK  so     ++N;$$Q'$$Q')>s)C)I)I)K.r288:+B/557
 	
r   r   )r   r   r   r   z/The optimization of kernel parameters failed: 
z<
The default initial kernel parameters will be used instead.)r7   r?   )r   onesrb   rK   r?   rh   rN   rO   rP   r   r   loggerwarningrl   )r   r   rR   r   r   r   	gpr_cacher   r   default_gpr_cacheerrorgpr_cache_to_useedefault_gprr   s   ```           @r   fit_kernel_paramsr   >  s%    "JJqwwqzA~U]]K
 
 % N	E ':	$//?((+((+-=-Z-Z-::*44 ! #+(?	 !  ;$ NN
:5' BF 	F .K  	E	s   A7D
D"DD")r   r   r7   r   )Ng{Gz?)r   r   r   r   rR   r   r   r   r   r   r   r   r   zGPRegressor | Noner   r   r7   r?   )__doc__
__future__r   r   typingr   r   rZ   r   "optuna._gp.scipy_blas_thread_patchr   optuna._warningsr   optuna.loggingr   collections.abcr	   r   r   optuna._importsr
   r8   r   r   autogradFunctionr    r?   r   r5   r   r   <module>r      s   & #      Y ( % (+ E E	H	 U^^,, <^ ^P %)777 7 5	7
 7 "7 "7 7 7r   