
    h                        S r SSKrSSKrSSKrSSKJrJr  SSKJr  SSK	J
r
JrJrJr  SSKrSSKrSSKJr  SSKJr  SSKJrJrJrJrJrJrJr  SS	KJrJrJ r J!r!  SS
K"J#r#J$r$J%r%J&r&J'r'J(r(  SSK)J*r*J+r+  SSK,J-r-J.r.J/r/  \" SSS9r0 " S S\Rb                  5      r2 " S S\2\5      r3 " S S\35      r4 " S S\45      r5 " S S\45      r6 " S S\25      r7g)z;Policies: abstract base class and concrete implementations.    N)ABCabstractmethod)partial)AnyOptionalTypeVarUnion)spaces)nn)BernoulliDistributionCategoricalDistributionDiagGaussianDistributionDistributionMultiCategoricalDistributionStateDependentNoiseDistributionmake_proba_distribution)get_action_dimis_image_spacemaybe_transposepreprocess_obs)BaseFeaturesExtractorCombinedExtractorFlattenExtractorMlpExtractor	NatureCNN
create_mlp)
PyTorchObsSchedule)
get_deviceis_vectorized_observationobs_as_tensorSelfBaseModel	BaseModel)boundc                   N  ^  \ rS rSr% Sr\R                  R                  \S'   \	SSS\R                  R                  S4S\R                  S\R                  S\\   S	\\\\4      S
\\   S\S\\R                  R                     S\\\\4      4U 4S jjjr S%S\\\4   S
\\   S\\\4   4S jjrS\4S jrS\S
\S\R0                  4S jrS\\\4   4S jr\S\R8                  4S j5       rS\SS4S jr\S&S\\   S\S\ \R8                  \4   S\4S jj5       r!S\"RF                  SS4S jr$S\"RF                  4S jr%S\SS4S  jr&S!\ \"RF                  \\\"RF                  4   4   S\4S" jr'S!\ \"RF                  \\\"RF                  4   4   S\(\\4   4S# jr)S$r*U =r+$ )'r#   '   as  
The base model object: makes predictions in response to observations.

In the case of policies, the prediction is an action. In the case of critics, it is the
estimated value of the observation.

:param observation_space: The observation space of the environment
:param action_space: The action space of the environment
:param features_extractor_class: Features extractor to use.
:param features_extractor_kwargs: Keyword arguments
    to pass to the features extractor.
:param features_extractor: Network to extract features
    (a CNN when using images, a nn.Flatten() layer otherwise)
:param normalize_images: Whether to normalize images or not,
     dividing by 255.0 (True by default)
:param optimizer_class: The optimizer to use,
    ``th.optim.Adam`` by default
:param optimizer_kwargs: Additional keyword arguments,
    excluding the learning rate, to pass to the optimizer
	optimizerNTobservation_spaceaction_spacefeatures_extractor_classfeatures_extractor_kwargsfeatures_extractornormalize_imagesoptimizer_classoptimizer_kwargsc	                 "  > [         T	U ]  5         Uc  0 nUc  0 nXl        X l        XPl        X`l        Xpl        Xl        X0l        X@l	        U(       d?  [        U[        [        45      (       a#  U R                  R                  [        SS95        g g g )NT)normalized_image)super__init__r(   r)   r,   r-   r.   r/   r*   r+   
issubclassr   r   updatedict)
selfr(   r)   r*   r+   r,   r-   r.   r/   	__class__s
            [/home/james-whalen/.local/lib/python3.13/site-packages/stable_baselines3/common/policies.pyr3   BaseModel.__init__?   s     	#!$,(*%!2("4 0. 0(@%)B&J/G)UfIg$h$h**11$2MN %i    
net_kwargsreturnc                     UR                  5       nUc  U R                  5       nUR                  [        X"R                  S95        U$ )a  
Update the network keyword arguments and create a new features extractor object if needed.
If a ``features_extractor`` object is passed, then it will be shared.

:param net_kwargs: the base network keyword arguments, without the ones
    related to features extractor
:param features_extractor: a features extractor object.
    If None, a new object will be created.
:return: The updated keyword arguments
)r,   features_dim)copymake_features_extractorr5   r6   r?   )r7   r<   r,   s      r9   _update_features_extractor$BaseModel._update_features_extractor`   sE      __&
%!%!=!=!?$2DSrSrstr;   c                 P    U R                   " U R                  40 U R                  D6$ )z-Helper method to create a features extractor.)r*   r(   r+   r7   s    r9   rA   !BaseModel.make_features_extractorv   s$    ,,T-C-CftGeGeffr;   obsc                 N    [        XR                  U R                  S9nU" U5      $ )z
Preprocess the observation if needed and extract features.

:param obs: Observation
:param features_extractor: The features extractor to use.
:return: The extracted features
)r-   )r   r(   r-   )r7   rG   r,   preprocessed_obss       r9   extract_featuresBaseModel.extract_featuresz   s*     *#/E/EX\XmXmn!"233r;   c                 T    [        U R                  U R                  U R                  S9$ )z
Get data that need to be saved in order to re-create the model when loading it from disk.

:return: The dictionary to pass to the as kwargs constructor when reconstruction this model.
)r(   r)   r-   )r6   r(   r)   r-   rE   s    r9   _get_constructor_parameters%BaseModel._get_constructor_parameters   s.     "44** "22
 	
r;   c                 ^    U R                  5        H  nUR                  s  $    [        S5      $ )zInfer which device this policy lives on by inspecting its parameters.
If it has no parameters, the 'cpu' device is used as a fallback.

:return:cpu)
parametersdevicer   )r7   params     r9   rR   BaseModel.device   s*     __&E<< '%  r;   pathc                 p    [         R                  " U R                  5       U R                  5       S.U5        g)z/
Save model to a given location.

:param path:
)
state_dictdataN)thsaverW   rM   )r7   rU   s     r9   rZ   BaseModel.save   s(     	t0$:Z:Z:\]_cdr;   clsrR   c                     [        U5      n[        R                  " XSS9nU " S0 US   D6nUR                  US   5        UR	                  U5        U$ )zj
Load model from path.

:param path:
:param device: Device on which the policy should be loaded.
:return:
F)map_locationweights_onlyrX   rW    )r   rY   loadload_state_dictto)r\   rU   rR   saved_variablesmodels        r9   ra   BaseModel.load   sX     F# ''$%P .of-.ol;<r;   vectorc                     [         R                  R                  R                  [         R                  " U[         R
                  U R                  S9U R                  5       5        g)z3
Load parameters from a 1D vector.

:param vector:
)dtyperR   N)rY   r   utilsvector_to_parameters	as_tensorfloatrR   rQ   )r7   rg   s     r9   load_from_vectorBaseModel.load_from_vector   s>     	((fBHHUYU`U`)acgcrcrctur;   c                     [         R                  R                  R                  U R	                  5       5      R                  5       R                  5       R                  5       $ )z2
Convert the parameters to a 1D vector.

:return:
)rY   r   rj   parameters_to_vectorrQ   detachrP   numpyrE   s    r9   rq   BaseModel.parameters_to_vector   s?     uu{{//0ABIIKOOQWWYYr;   modec                 &    U R                  U5        g)z
Put the policy in either training or evaluation mode.

This affects certain modules, such as batch normalisation and dropout.

:param mode: if true, set to training mode, else set to evaluation mode
N)train)r7   ru   s     r9   set_training_modeBaseModel.set_training_mode   s     	

4r;   observationc                    Sn[        U[        5      (       a  [        U R                  [        R                  5      (       d   SU R                   35       eUR                  5        H<  u  p4U R                  R                  U   nU=(       d    [        [        XE5      U5      nM>     U$ [        [        XR                  5      U R                  5      nU$ )a6  
Check whether or not the observation is vectorized,
apply transposition to image (so that they are channel-first) if needed.
This is used in DQN when sampling random action (epsilon-greedy policy)

:param observation: the input observation to check
:return: whether the given observation is vectorized or not
F8The observation provided is a dict but the obs space is )
isinstancer6   r(   r
   Dictitemsr    r   )r7   rz   vectorized_envkeyrG   	obs_spaces         r9   r    #BaseModel.is_vectorized_observation   s     k4((&&  cI$J`J`Iabc  (--/ 2299#>	!/!x3L_]`Mlnw3x 0  7-C-CDdF\F\N r;   c                    Sn[        U[        5      (       a  [        U R                  [        R                  5      (       d   SU R                   35       e[
        R                  " U5      nUR                  5        H  u  p4U R                  R                  U   n[        U5      (       a  [        XE5      nO[        R                  " U5      nU=(       d    [        Xe5      nUR                  S/U R                  U   R                  Q75      X'   M     OF[        U R                  5      (       a  [        XR                  5      nO[        R                  " U5      n[        U[        5      (       d>  [        XR                  5      nUR                  S/U R                  R                  Q75      n[        XR                   5      nXr4$ )a,  
Convert an input observation to a PyTorch tensor that can be fed to a model.
Includes sugar-coating to handle different observations (e.g. normalizing images).

:param observation: the input observation
:return: The observation as PyTorch tensor
    and whether the observation is vectorized or not
Fr|   )r}   r6   r(   r
   r~   r@   deepcopyr   r   r   nparrayr    reshapeshaper!   rR   )r7   rz   r   r   rG   r   obs_
obs_tensors           r9   obs_to_tensorBaseModel.obs_to_tensor   sx    k4((&&  cI$J`J`Iabc  --4K'--/ 2299#>	!),,*3:D88C=D!/!]3LT3]#'<<0Xd6L6LS6Q6W6W0X#Y  0 D2233 *+7M7MNK ((;/K+t,,6{DZDZ[N%--r.QD4J4J4P4P.QRK";<
))r;   )r)   r,   r*   r+   r-   r(   r.   r/   N)auto),__name__
__module____qualname____firstlineno____doc__rY   optim	Optimizer__annotations__r   Adamr
   Spacetyper   r   r6   strr   boolr3   rB   rA   r   TensorrJ   rM   propertyrR   rZ   classmethodr"   r	   ra   r   ndarrayrn   rq   rx   r    tupler   __static_attributes____classcell__r8   s   @r9   r#   r#   '   s   * xx!!! AQ>B>B!%46HHMM59O!<<O llO #''<"=	O
 $,DcN#;O %%:;O O bhh001O #4S>2O OH ?CcN %%:; 
c3h	,g)> g	4J 	4DY 	4^`^g^g 	4
T#s(^ 
 !		 ! !e e e $}% S %		3:O ]j  (vrzz vd vZbjj Zd t U2::tCQSQ[Q[OG\;\5] bf .)*rzz4RZZ;P/P)Q )*V[\fhl\lVm )* )*r;   c                   D  ^  \ rS rSr% Sr\\S'   SS.S\4U 4S jjjr\	S\
S	\
4S
 j5       r\S	\4S j5       r\	SS\R                  S\
S	S4S jj5       r\SS\S\S	\R(                  4S jj5       r   SS\\R0                  \\\R0                  4   4   S\\\R0                  S4      S\\R0                     S\S	\\R0                  \\\R0                  S4      4   4
S jjrS\R0                  S	\R0                  4S jrS\R0                  S	\R0                  4S jrSr U =r!$ )
BasePolicyi  a_  The base policy object.

Parameters are mostly the same as `BaseModel`; additions are documented below.

:param args: positional arguments passed through to `BaseModel`.
:param kwargs: keyword arguments passed through to `BaseModel`.
:param squash_output: For continuous actions, whether the output is squashed
    or not using a ``tanh()`` function.
r,   F)squash_outputr   c                2   > [         TU ]  " U0 UD6  Xl        g r   )r2   r3   _squash_output)r7   r   argskwargsr8   s       r9   r3   BasePolicy.__init__%  s    $)&)+r;   progress_remainingr=   c                     A g)z#(float) Useful for pickling policy.        r`   )r   s    r9   _dummy_scheduleBasePolicy._dummy_schedule)  s
     r;   c                     U R                   $ )z (bool) Getter for squash_output.r   rE   s    r9   r   BasePolicy.squash_output/  s     """r;   modulegainNc                    [        U [        R                  [        R                  45      (       a\  [        R                  R                  U R                  US9  U R                  b&  U R                  R                  R                  S5        ggg)z1
Orthogonal initialization (used in PPO and A2C)
r   Nr   )
r}   r   LinearConv2dinitorthogonal_weightbiasrX   fill_)r   r   s     r9   init_weightsBasePolicy.init_weights4  se    
 fryy"))455GGD9{{&  &&s+ ' 6r;   rz   deterministicc                     g)a\  
Get the action according to the policy for a given observation.

By default provides a dummy implementation -- not all BasePolicy classes
implement this, e.g. if they are a Critic in an Actor-Critic method.

:param observation:
:param deterministic: Whether to use stochastic or deterministic actions
:return: Taken action according to the policy
Nr`   r7   rz   r   s      r9   _predictBasePolicy._predict>  s    r;   state.episode_startc                 j   U R                  S5        [        U[        5      (       a2  [        U5      S:X  a#  [        US   [        5      (       a  [        S5      eU R                  U5      u  pV[        R                  " 5          U R                  XTS9nSSS5        WR                  5       R                  5       R                  S/U R                  R                  Q75      n[        U R                  [        R                   5      (       ab  U R"                  (       a  U R%                  U5      nO?[&        R(                  " XpR                  R*                  U R                  R,                  5      nU(       d0  [        U[&        R.                  5      (       d   eUR1                  SS	9nXr4$ ! , (       d  f       GN= f)
ae  
Get the policy action from an observation (and optional hidden state).
Includes sugar-coating to handle different observations (e.g. normalizing images).

:param observation: the input observation
:param state: The last hidden states (can be None, used in recurrent policies)
:param episode_start: The last masks (can be None, used in recurrent policies)
    this correspond to beginning of episodes,
    where the hidden states of the RNN must be reset.
:param deterministic: Whether or not to return deterministic actions.
:return: the model's action and the next hidden state
    (used in recurrent policies)
F      a  You have passed a tuple to the predict() function instead of a Numpy array or a Dict. You are probably mixing Gym API with SB3 VecEnv API: `obs, info = env.reset()` (Gym) vs `obs = vec_env.reset()` (SB3 VecEnv). See related issue https://github.com/DLR-RM/stable-baselines3/issues/1694 and documentation for more information: https://stable-baselines3.readthedocs.io/en/master/guide/vec_envs.html#vecenv-api-vs-gym-apir   Nr   r   )axis)rx   r}   r   lenr6   
ValueErrorr   rY   no_gradr   rP   rs   r   r)   r   r
   Boxr   unscale_actionr   cliplowhighr   squeeze)r7   rz   r   r   r   r   r   actionss           r9   predictBasePolicy.predictK  sR   * 	u% k5))c+.>!.C
S^_`SacgHhHhW  &*%7%7%D"
ZZ\mmJmLG  ++-%%'//0Nd6G6G6M6M0NOd''44!!--g6 '''+<+<+@+@$BSBSBXBXY grzz2222oo1o-G~' \s   F##
F2actionc                     [        U R                  [        R                  5      (       d   SU R                   35       eU R                  R                  U R                  R
                  p2SX-
  X2-
  -  -  S-
  $ )z
Rescale the action from [low, high] to [-1, 1]
(no need for symmetric action space)

:param action: Action to scale
:return: Scaled action
zETrying to scale an action using an action space that is not a Box(): g       @      ?r}   r)   r
   r   r   r   )r7   r   r   r   s       r9   scale_actionBasePolicy.scale_action  s}     vzz
 
 	gRSWSdSdRef	g 
 %%))4+<+<+A+ATv|
34s::r;   scaled_actionc                     [        U R                  [        R                  5      (       d   SU R                   35       eU R                  R                  U R                  R
                  p2USUS-   -  X2-
  -  -   $ )z
Rescale the action from [-1, 1] to [low, high]
(no need for symmetric action space)

:param scaled_action: Action to un-scale
zGTrying to unscale an action using an action space that is not a Box(): g      ?r   r   )r7   r   r   r   s       r9   r   BasePolicy.unscale_action  s     vzz
 
 	iTUYUfUfTgh	i 
 %%))4+<+<+A+ATc]S01TZ@AAr;   r   r   F)NNF)"r   r   r   r   r   r   r   r   r3   staticmethodrm   r   r   r   r   Moduler   r   r   rY   r   r   r	   r   r   r6   r   r   r   r   r   r   r   r   r   s   @r9   r   r     s    .-49 ,T , , E e  
 #t # # ,RYY ,e ,D , , 
J 
t 
PRPYPY 
 
 37.2#72::tCO'<<=7 bjj#o./7  

+	7
 7 
rzz8E"**c/$:;;	<7r;2:: ;"** ;BBJJ B2:: B Br;   r   c            &         ^  \ rS rSrSrS\R                  SSSSSS\SSS\R                  R                  S4S\R                  S\R                  S	\S
\\\\   \\\\   4   4      S\\R*                     S\S\S\S\S\S\S\\   S\\\\4      S\S\S\\R                  R4                     S\\\\4      4"U 4S jjjrS\\\4   4U 4S jjrS-S\SS4S jjrS.S jrS	\SS4S jrS/S\R@                  S \S\!\R@                  \R@                  \R@                  4   4S! jjr" S0S\#S"\\   S\\R@                  \!\R@                  \R@                  4   4   4U 4S# jjjr$S$\R@                  S\%4S% jr&S/S&\#S \S\R@                  4S' jjr'S\#S(\R@                  S\!\R@                  \R@                  \\R@                     4   4S) jr(S\#S\%4U 4S* jjr)S\#S\R@                  4U 4S+ jjr*S,r+U =r,$ )1ActorCriticPolicyi  aY  
Policy class for actor-critic algorithms (has both policy and value prediction).
Used by A2C, PPO and the likes.

:param observation_space: Observation space
:param action_space: Action space
:param lr_schedule: Learning rate schedule (could be constant)
:param net_arch: The specification of the policy and value networks.
:param activation_fn: Activation function
:param ortho_init: Whether to use or not orthogonal initialization
:param use_sde: Whether to use State Dependent Exploration or not
:param log_std_init: Initial value for the log standard deviation
:param full_std: Whether to use (n_features x n_actions) parameters
    for the std instead of only (n_features,) when using gSDE
:param use_expln: Use ``expln()`` function instead of ``exp()`` to ensure
    a positive standard deviation (cf paper). It allows to keep variance
    above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
:param squash_output: Whether to squash the output using a tanh function,
    this allows to ensure boundaries when using gSDE.
:param features_extractor_class: Features extractor to use.
:param features_extractor_kwargs: Keyword arguments
    to pass to the features extractor.
:param share_features_extractor: If True, the features extractor is shared between the policy and value networks.
:param normalize_images: Whether to normalize images or not,
     dividing by 255.0 (True by default)
:param optimizer_class: The optimizer to use,
    ``th.optim.Adam`` by default
:param optimizer_kwargs: Additional keyword arguments,
    excluding the learning rate, to pass to the optimizer
NTFr   r(   r)   lr_schedulenet_archactivation_fn
ortho_inituse_sdelog_std_initfull_std	use_explnr   r*   r+   share_features_extractorr-   r.   r/   c                 \  > Uc%  0 nU[         R                  R                  :X  a  SUS'   [        TU ]  UUUUUUUUS9  [        U[        5      (       aB  [        U5      S:  a3  [        US   [        5      (       a  [        R                  " S5        US   nUc  U[        :X  a  / nO[        SS/SS/S9nX@l        XPl        X`l        Xl        U R!                  5       U l        U R"                  R$                  U l        U R                  (       a#  U R"                  U l        U R"                  U l        O&U R"                  U l        U R!                  5       U l        Xl        S nU(       a  U(       d   S5       eU(       a  U	UU
S	S
.nXpl        UU l        [1        X'US9U l        U R5                  U5        g )Ngh㈵>eps)r.   r/   r   r-   r   zAs shared layers in the mlp_extractor are removed since SB3 v1.8.0, you should now pass directly a dictionary and not a list (net_arch=dict(pi=..., vf=...) instead of net_arch=[dict(pi=..., vf=...)])@   )pivfzCsquash_output=True is only available when using gSDE (use_sde=True)F)r   r   r   learn_features)r   dist_kwargs)rY   r   r   r2   r3   r}   listr   r6   warningswarnr   r   r   r   r   rA   r,   r?   pi_features_extractorvf_features_extractorr   r   r   r   action_dist_build)r7   r(   r)   r   r   r   r   r   r   r   r   r   r*   r+   r   r-   r.   r/   r   r8   s                      r9   r3   ActorCriticPolicy.__init__  s   ( #!"((--/*. '$%+-'- 	 		
 h%%#h-!*;
8TU;X\@]@]MMa  {H '94B8R9 *$(@%"&">">"@ 33@@(()-)@)@D&)-)@)@D&)-)@)@D&)-)E)E)GD&(!'y4yy2$!.&"'	K & 3<^ijK r;   r=   c                   > [         TU ]  5       nU R                  =(       d    [        R                  " S 5      nUR                  [        U R                  U R                  U R                  U R                  US   US   US   U R                  U R                  U R                  U R                  U R                  U R                   S95        U$ )Nc                      g r   r`   r`   r;   r9   <lambda>?ActorCriticPolicy._get_constructor_parameters.<locals>.<lambda>  s    RVr;   r   r   r   )r   r   r   r   r   r   r   r   r   r.   r/   r*   r+   )r2   rM   r   collectionsdefaultdictr5   r6   r   r   r   r   r   r   r.   r/   r*   r+   )r7   rX   default_none_kwargsr8   s      r9   rM   -ActorCriticPolicy._get_constructor_parameters  s    w24"..W+2I2I,2W"00!..1/B,Z8-k: 00?? $ 4 4!%!6!6)-)F)F*.*H*H	
" r;   n_envsc                     [        U R                  [        5      (       d   S5       eU R                  R                  U R                  US9  g)z@
Sample new weights for the exploration matrix.

:param n_envs:
z/reset_noise() is only available when using gSDE)
batch_sizeN)r}   r   r   sample_weightslog_std)r7   r
  s     r9   reset_noiseActorCriticPolicy.reset_noise1  sA     $**,KLLNL'''Hr;   c                 v    [        U R                  U R                  U R                  U R                  S9U l        g)zI
Create the policy and value networks.
Part of the layers can be shared.
)r   r   rR   N)r   r?   r   r   rR   mlp_extractorrE   s    r9   _build_mlp_extractor&ActorCriticPolicy._build_mlp_extractor:  s1     *]],,;;	
r;   c                 6   U R                  5         U R                  R                  n[        U R                  [
        5      (       a1  U R                  R                  X R                  S9u  U l        U l	        O[        U R                  [        5      (       a2  U R                  R                  X"U R                  S9u  U l        U l	        Ob[        U R                  [        [        [        45      (       a  U R                  R                  US9U l        O[        SU R                   S35      e[        R                   " U R                  R"                  S5      U l        U R&                  (       a  U R(                  [*        R,                  " S5      U R                  [*        R,                  " S5      U R                  SU R$                  S0nU R.                  (       dP  X0R(                  	 [*        R,                  " S5      X0R0                  '   [*        R,                  " S5      X0R2                  '   UR5                  5        H(  u  pEUR7                  [9        U R:                  US	95        M*     U R<                  " U R?                  5       4S
U" S5      0U R@                  D6U l!        g)z
Create the networks and the optimizer.

:param lr_schedule: Learning rate schedule
    lr_schedule(1) is the initial learning rate
)
latent_dimr   )r  latent_sde_dimr   )r  zUnsupported distribution 'z'.r   r   g{Gz?r   lrN)"r  r  latent_dim_pir}   r   r   proba_distribution_netr   
action_netr  r   r   r   r   NotImplementedErrorr   r   latent_dim_vf	value_netr   r,   r   sqrtr   r   r   r   applyr   r   r.   rQ   r/   r'   )r7   r   r  module_gainsr   r   s         r9   r   ActorCriticPolicy._buildI  s    	!!#**88d&&(@AA,0,<,<,S,S(7H7H -T -)DOT\ ((*IJJ,0,<,<,S,S(UYUfUf -T -)DOT\ ((+BD`bw*xyy"..EEQ^E_DO%(B4CSCSBTTV&WXX4#5#5#C#CQG ?? ''""BGGAJ	L 00 !!8!89;=771:778;=771:778 , 2 2 4WT%6%6TBC !5 --doo.?lKPQNlVZVkVklr;   rG   r   c                    U R                  U5      nU R                  (       a  U R                  U5      u  pEO:Uu  pgU R                  R                  U5      nU R                  R	                  U5      nU R                  U5      nU R                  U5      n	U	R                  US9n
U	R                  U
5      nU
R                  S/U R                  R                  Q75      n
XU4$ )z
Forward pass in all the networks (actor and critic)

:param obs: Observation
:param deterministic: Whether to sample or use deterministic actions
:return: action, value and log probability of the action
r   r   )rJ   r   r  forward_actorforward_criticr  _get_action_dist_from_latentget_actionslog_probr   r)   r   )r7   rG   r   features	latent_pi	latent_vfpi_featuresvf_featuresvaluesdistributionr   r(  s               r9   forwardActorCriticPolicy.forward|  s     ((-((#'#5#5h#? Iy'/$K**88EI**99+FI	*88C***G((1//2"@(9(9(?(?"@A((r;   r,   c                   > U R                   (       a"  [        TU ]	  Xc  U R                  5      $ U5      $ Ub  [        R
                  " S[        5        [        TU ]	  XR                  5      n[        TU ]	  XR                  5      nX44$ )a]  
Preprocess the observation if needed and extract features.

:param obs: Observation
:param features_extractor: The features extractor to use. If None, then ``self.features_extractor`` is used.
:return: The extracted features. If features extractor is not shared, returns a tuple with the
    features for the actor and the features for the critic.
zYProvided features_extractor will be ignored because the features extractor is not shared.)	r   r2   rJ   r,   r   r   UserWarningr   r   )r7   rG   r,   r,  r-  r8   s        r9   rJ   "ActorCriticPolicy.extract_features  s     ((7+CLf1H1Hl~!-o
  '238R8RSK'238R8RSK++r;   r*  c                    U R                  U5      n[        U R                  [        5      (       a%  U R                  R	                  X R
                  5      $ [        U R                  [        5      (       a  U R                  R	                  US9$ [        U R                  [        5      (       a  U R                  R	                  US9$ [        U R                  [        5      (       a  U R                  R	                  US9$ [        U R                  [        5      (       a&  U R                  R	                  X R
                  U5      $ [        S5      e)z
Retrieve action distribution given the latent codes.

:param latent_pi: Latent code for the actor
:return: Action distribution
)action_logitszInvalid action distribution)r  r}   r   r   proba_distributionr  r   r   r   r   r   )r7   r*  mean_actionss      r9   r&  .ActorCriticPolicy._get_action_dist_from_latent  s	    y1d&&(@AA##66|\\RR((*ABB##66\6RR((*FGG##66\6RR((*?@@##66\6RR((*IJJ##66|\\S\]]:;;r;   rz   c                 >    U R                  U5      R                  US9$ )z
Get the action according to the policy for a given observation.

:param observation:
:param deterministic: Whether to use stochastic or deterministic actions
:return: Taken action according to the policy
r   )get_distributionr'  r   s      r9   r   ActorCriticPolicy._predict  s#     $$[1==M=ZZr;   r   c                 p   U R                  U5      nU R                  (       a  U R                  U5      u  pEO:Uu  pgU R                  R                  U5      nU R                  R	                  U5      nU R                  U5      nUR                  U5      n	U R                  U5      n
UR                  5       nXU4$ )z
Evaluate actions according to the current policy,
given the observations.

:param obs: Observation
:param actions: Actions
:return: estimated value, log likelihood of taking those actions
    and entropy of the action distribution.
)	rJ   r   r  r$  r%  r&  r(  r  entropy)r7   rG   r   r)  r*  r+  r,  r-  r/  r(  r.  r>  s               r9   evaluate_actions"ActorCriticPolicy.evaluate_actions  s     ((-((#'#5#5h#? Iy'/$K**88EI**99+FI88C((1	*&&(((r;   c                    > [         TU ]  XR                  5      nU R                  R	                  U5      nU R                  U5      $ )zl
Get the current policy distribution given the observations.

:param obs:
:return: the action distribution.
)r2   rJ   r   r  r$  r&  )r7   rG   r)  r*  r8   s       r9   r;  "ActorCriticPolicy.get_distribution  sA     7+C1K1KL&&44X>	00;;r;   c                    > [         TU ]  XR                  5      nU R                  R	                  U5      nU R                  U5      $ )z
Get the estimated values according to the current policy given the observations.

:param obs: Observation
:return: the estimated values.
)r2   rJ   r   r  r%  r  )r7   rG   r)  r+  r8   s       r9   predict_values ActorCriticPolicy.predict_values  s?     7+C1K1KL&&55h?	~~i((r;   )r   r  r   r   r?   r,   r  r   r  r   r'   r   r   r   r   r  r   r   )r=   Nr   r   )-r   r   r   r   r   r   Tanhr   rY   r   r   r
   r   r   r   r	   r   intr6   r   r   r   r   rm   r   r   r   r3   rM   r  r  r   r   r   r0  r   rJ   r   r&  r   r?  r;  rD  r   r   r   s   @r9   r   r     s   H FJ)+!#@P>B)-!%46HHMM59%W!!<<W! llW! 	W!
 5cDd3i,@!@ABW! BIIW! W! W! W! W! W! W! #''<"=W! $,DcN#;W! #'W!  !W!" bhh001#W!$ #4S>2%W! W!rT#s(^ 0I# Id I
1m( 1mt 1mf)299 )T )eBIIWYW`W`bdbkbkLkFl )2 VZ,,3;<Q3R,	ryy%		299 455	6, ,0<bii <L <2[J [t [PRPYPY [)J ) )uRYYXZXaXacklnlulucvMvGw )0	<J 	<< 	<	)* 	) 	) 	)r;   r   c            &         ^  \ rS rSrSrS\R                  SSSSSS\SSS\R                  R                  S4S\R                  S\R                  S	\S
\\\\   \\\\   4   4      S\\R*                     S\S\S\S\S\S\S\\   S\\\\4      S\S\S\\R                  R4                     S\\\\4      4"U 4S jjjrSrU =r$ )ActorCriticCnnPolicyi  a]  
CNN policy class for actor-critic algorithms (has both policy and value prediction).
Used by A2C, PPO and the likes.

:param observation_space: Observation space
:param action_space: Action space
:param lr_schedule: Learning rate schedule (could be constant)
:param net_arch: The specification of the policy and value networks.
:param activation_fn: Activation function
:param ortho_init: Whether to use or not orthogonal initialization
:param use_sde: Whether to use State Dependent Exploration or not
:param log_std_init: Initial value for the log standard deviation
:param full_std: Whether to use (n_features x n_actions) parameters
    for the std instead of only (n_features,) when using gSDE
:param use_expln: Use ``expln()`` function instead of ``exp()`` to ensure
    a positive standard deviation (cf paper). It allows to keep variance
    above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
:param squash_output: Whether to squash the output using a tanh function,
    this allows to ensure boundaries when using gSDE.
:param features_extractor_class: Features extractor to use.
:param features_extractor_kwargs: Keyword arguments
    to pass to the features extractor.
:param share_features_extractor: If True, the features extractor is shared between the policy and value networks.
:param normalize_images: Whether to normalize images or not,
     dividing by 255.0 (True by default)
:param optimizer_class: The optimizer to use,
    ``th.optim.Adam`` by default
:param optimizer_kwargs: Additional keyword arguments,
    excluding the learning rate, to pass to the optimizer
NTFr   r(   r)   r   r   r   r   r   r   r   r   r   r*   r+   r   r-   r.   r/   c                 D   > [         TU ]  UUUUUUUUU	U
UUUUUUU5        g r   r2   r3   r7   r(   r)   r   r   r   r   r   r   r   r   r   r*   r+   r   r-   r.   r/   r8   s                     r9   r3   ActorCriticCnnPolicy.__init__  E    ( 	$%$#	
r;   r`   )r   r   r   r   r   r   rF  r   rY   r   r   r
   r   r   r   r	   r   rG  r6   r   r   r   r   rm   r   r   r   r3   r   r   r   s   @r9   rI  rI    sT   H FJ)+!#@I>B)-!%46HHMM59%&
!<<&
 ll&
 	&

 5cDd3i,@!@AB&
 BII&
 &
 &
 &
 &
 &
 &
 #''<"=&
 $,DcN#;&
 #'&
  !&
" bhh001#&
$ #4S>2%&
 &
r;   rI  c            &         ^  \ rS rSrSrS\R                  SSSSSS\SSS\R                  R                  S4S\R                  S\R                  S	\S
\\\\   \\\\   4   4      S\\R,                     S\S\S\S\S\S\S\\   S\\\\4      S\S\S\\R                  R6                     S\\\\4      4"U 4S jjjrSrU =r$ )MultiInputActorCriticPolicyiG  av  
MultiInputActorClass policy class for actor-critic algorithms (has both policy and value prediction).
Used by A2C, PPO and the likes.

:param observation_space: Observation space (Tuple)
:param action_space: Action space
:param lr_schedule: Learning rate schedule (could be constant)
:param net_arch: The specification of the policy and value networks.
:param activation_fn: Activation function
:param ortho_init: Whether to use or not orthogonal initialization
:param use_sde: Whether to use State Dependent Exploration or not
:param log_std_init: Initial value for the log standard deviation
:param full_std: Whether to use (n_features x n_actions) parameters
    for the std instead of only (n_features,) when using gSDE
:param use_expln: Use ``expln()`` function instead of ``exp()`` to ensure
    a positive standard deviation (cf paper). It allows to keep variance
    above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
:param squash_output: Whether to squash the output using a tanh function,
    this allows to ensure boundaries when using gSDE.
:param features_extractor_class: Uses the CombinedExtractor
:param features_extractor_kwargs: Keyword arguments
    to pass to the features extractor.
:param share_features_extractor: If True, the features extractor is shared between the policy and value networks.
:param normalize_images: Whether to normalize images or not,
     dividing by 255.0 (True by default)
:param optimizer_class: The optimizer to use,
    ``th.optim.Adam`` by default
:param optimizer_kwargs: Additional keyword arguments,
    excluding the learning rate, to pass to the optimizer
NTFr   r(   r)   r   r   r   r   r   r   r   r   r   r*   r+   r   r-   r.   r/   c                 D   > [         TU ]  UUUUUUUUU	U
UUUUUUU5        g r   rK  rL  s                     r9   r3   $MultiInputActorCriticPolicy.__init__g  rN  r;   r`   )r   r   r   r   r   r   rF  r   rY   r   r   r
   r~   r   r   r   r	   r   rG  r6   r   r   r   r   rm   r   r   r   r3   r   r   r   s   @r9   rP  rP  G  sT   H FJ)+!#@Q>B)-!%46HHMM59%&
!;;&
 ll&
 	&

 5cDd3i,@!@AB&
 BII&
 &
 &
 &
 &
 &
 &
 #''<"=&
 $,DcN#;&
 #'&
  !&
" bhh001#&
$ #4S>2%&
 &
r;   rP  c                   v  ^  \ rS rSr% Sr\\S'   \R                  SSS4S\	R                  S\	R                  S\\   S\S	\S
\\R                     S\S\S\4U 4S jjjrS\R&                  S\R&                  S\\R&                  S4   4S jrS\R&                  S\R&                  S\R&                  4S jrSrU =r$ )ContinuousCritici  a{  
Critic network(s) for DDPG/SAC/TD3.
It represents the action-state value function (Q-value function).
Compared to A2C/PPO critics, this one represents the Q-value
and takes the continuous action as input. It is concatenated with the state
and then fed to the network which outputs a single value: Q(s, a).
For more recent algorithms like SAC/TD3, multiple networks
are created to give different estimates.

By default, it creates two critic networks used to reduce overestimation
thanks to clipped Q-learning (cf TD3 paper).

:param observation_space: Observation space
:param action_space: Action space
:param net_arch: Network architecture
:param features_extractor: Network to extract features
    (a CNN when using images, a nn.Flatten() layer otherwise)
:param features_dim: Number of features
:param activation_fn: Activation function
:param normalize_images: Whether to normalize images or not,
     dividing by 255.0 (True by default)
:param n_critics: Number of critic networks to create.
:param share_features_extractor: Whether the features extractor is shared or not
    between the actor and the critic (this saves computation time)
r,   Tr   r(   r)   r   r?   r   r-   	n_criticsr   c
                 >  > [         TU ]  UUUUS9  [        U R                  5      n
Xl        Xl        / U l        [        U5       HU  n[        XZ-   SX65      n[        R                  " U6 nU R                  SU 3U5        U R                  R                  U5        MW     g )N)r,   r-   r   qf)r2   r3   r   r)   r   rU  
q_networksranger   r   
Sequential
add_moduleappend)r7   r(   r)   r   r,   r?   r   r-   rU  r   
action_dimidx
q_net_listq_netr8   s                 r9   r3   ContinuousCritic.__init__  s     	1-	 	 	
 $D$5$56
(@%"+-#C#L$=q(ZJMM:.EOObJ.OO""5)	 $r;   rG   r   r=   .c                 $  ^ [         R                  " U R                  (       + 5         U R                  XR                  5      nS S S 5        [         R
                  " WU/SS9m[        U4S jU R                   5       5      $ ! , (       d  f       ND= f)Nr   dimc              3   2   >#    U  H  o" T5      v   M     g 7fr   r`   ).0r`  qvalue_inputs     r9   	<genexpr>+ContinuousCritic.forward.<locals>.<genexpr>  s     FoUU<((os   )rY   set_grad_enabledr   rJ   r,   catr   rX  )r7   rG   r   r)  rg  s       @r9   r0  ContinuousCritic.forward  sl       T%B%B!BC,,S2I2IJH Dvvx1q9FdooFFF DCs   B
Bc                     [         R                  " 5          U R                  XR                  5      nSSS5        U R                  S   " [         R
                  " WU/SS95      $ ! , (       d  f       N8= f)z
Only predict the Q-value using the first network.
This allows to reduce computation when all the estimates are not needed
(e.g. when updating the policy in TD3).
Nr   r   rc  )rY   r   rJ   r,   rX  rk  )r7   rG   r   r)  s       r9   
q1_forwardContinuousCritic.q1_forward  sT     ZZ\,,S2I2IJH q!"&&(G)<!"DEE \s   A$$
A2)rU  rX  r   )r   r   r   r   r   r   r   r   ReLUr
   r   r   r   rG  r   r   r   r3   rY   r   r   r0  rn  r   r   r   s   @r9   rT  rT    s    4 .- *,!%)-*!<<* jj* s)	*
 2* * BII* * * #'* *<G299 Gryy GU299c>=R GFbii F")) F		 F Fr;   rT  )8r   r  r@   r   abcr   r   	functoolsr   typingr   r   r   r	   rs   r   torchrY   	gymnasiumr
   r   &stable_baselines3.common.distributionsr   r   r   r   r   r   r   &stable_baselines3.common.preprocessingr   r   r   r   %stable_baselines3.common.torch_layersr   r   r   r   r   r   %stable_baselines3.common.type_aliasesr   r   stable_baselines3.common.utilsr   r    r!   r"   r   r#   r   r   rI  rP  rT  r`   r;   r9   <module>r{     s    A    #  0 0       s r  G _ _{;n*		 n*bEBC EBP[)
 [)|
F
, F
RF
"3 F
RKFy KFr;   