
    hP                         S SK JrJrJr  S SKrS SKJr  S SKJr  S SK	J
r
Jr  S SKJrJr  S SKJr  S SKJrJrJrJrJrJr  S S	KJrJr  S
rSr " S S\5      r " S S\5      r\r " S S\5      r  " S S\5      r!g)    )AnyOptionalUnionN)spaces)nn) SquashedDiagGaussianDistributionStateDependentNoiseDistribution)
BasePolicyContinuousCritic)get_action_dim)BaseFeaturesExtractorCombinedExtractorFlattenExtractor	NatureCNN
create_mlpget_actor_critic_arch)
PyTorchObsSchedule   ic                   \  ^  \ rS rSr% Sr\R                  \S'   \R                  SSSSSS4S\R                  S\R                  S	\\   S
\R                  S\S\\R                     S\S\S\S\S\S\4U 4S jjjrS\\\4   4U 4S jjrS\R.                  4S jrS"S\SS4S jjrS\S\\R.                  \R.                  \\\R.                  4   4   4S jrS#S\S\S\R.                  4S jjrS\S\\R.                  \R.                  4   4S jrS#S\S\S\R.                  4S  jjrS!r U =r!$ )$Actor   a#  
Actor network (policy) for SAC.

:param observation_space: Observation space
:param action_space: Action space
:param net_arch: Network architecture
:param features_extractor: Network to extract features
    (a CNN when using images, a nn.Flatten() layer otherwise)
:param features_dim: Number of features
:param activation_fn: Activation function
:param use_sde: Whether to use State Dependent Exploration or not
:param log_std_init: Initial value for the log standard deviation
:param full_std: Whether to use (n_features x n_actions) parameters
    for the std instead of only (n_features,) when using gSDE.
:param use_expln: Use ``expln()`` function instead of ``exp()`` when using gSDE to ensure
    a positive standard deviation (cf paper). It allows to keep variance
    above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
:param clip_mean: Clip the mean output when using gSDE to avoid numerical instability.
:param normalize_images: Whether to normalize images or not,
     dividing by 255.0 (True by default)
action_spaceFT       @observation_spacenet_archfeatures_extractorfeatures_dimactivation_fnuse_sdelog_std_initfull_std	use_expln	clip_meannormalize_imagesc                   > [         TU ]  UUUUSS9  Xpl        S U l        X0l        XPl        X`l        Xl        Xl        Xl	        Xl
        [        U R                  5      n[        USX65      n[        R                  " U6 U l        [#        U5      S:  a  US   OUnU R                  (       az  [%        XU
SSS9U l        U R&                  R)                  XUS9u  U l        U l        US:  a;  [        R                  " U R*                  [        R.                  " U* US95      U l        g g [1        U5      U l        [        R2                  " X5      U l        [        R2                  " X5      U l        g )	NT)r   r&   squash_outputr   )r#   r$   learn_featuresr(   )
latent_dimlatent_sde_dimr"   g        )min_valmax_val)super__init__r!   sde_features_extractorr   r   r    r"   r$   r#   r%   r   r   r   r   
Sequential	latent_pilenr	   action_distproba_distribution_netmulog_stdHardtanhr   Linear)selfr   r   r   r   r   r    r!   r"   r#   r$   r%   r&   
action_dimlatent_pi_netlast_layer_dim	__class__s                   X/home/james-whalen/.local/lib/python3.13/site-packages/stable_baselines3/sac/policies.pyr0   Actor.__init__2   sM    	1- 	 	
 &*# (*(" "#D$5$56
"<XM6),X):"<<>SWgk D %)$4$4$K$K)Wc %L %!DGT\
 3--iZYb1cd   @
KDii;DG99^@DL    returnc                   > [         TU ]  5       nUR                  [        U R                  U R
                  U R                  U R                  U R                  U R                  U R                  U R                  U R                  S9	5        U$ )N)	r   r   r    r!   r"   r#   r$   r   r%   )r/   _get_constructor_parametersupdatedictr   r   r    r!   r"   r#   r$   r   r%   r;   datar?   s     r@   rE   !Actor._get_constructor_parametersi   sr    w24!.."00!....#'#:#:..
	
 rB   c                     Sn[        U R                  [        5      (       d   U5       eU R                  R                  U R                  5      $ )a   
Retrieve the standard deviation of the action distribution.
Only useful when using gSDE.
It corresponds to ``th.exp(log_std)`` in the normal case,
but is slightly different when using ``expln`` function
(cf StateDependentNoiseDistribution doc).

:return:
z+get_std() is only available when using gSDE)
isinstancer5   r	   get_stdr8   )r;   msgs     r@   rM   Actor.get_std{   sC     <$**,KLLQcQL''55rB   
batch_sizeNc                     Sn[        U R                  [        5      (       d   U5       eU R                  R                  U R                  US9  g)U
Sample new weights for the exploration matrix, when using gSDE.

:param batch_size:
z/reset_noise() is only available when using gSDErP   N)rL   r5   r	   sample_weightsr8   )r;   rP   rN   s      r@   reset_noiseActor.reset_noise   sF     @$**,KLLQcQL'''LrB   obsc                 2   U R                  XR                  5      nU R                  U5      nU R                  U5      nU R                  (       a  X@R
                  [        US94$ U R                  U5      n[        R                  " U[        [        5      nXE0 4$ )z
Get the parameters for the action distribution.

:param obs:
:return:
    Mean, standard deviation and optional keyword arguments.
)
latent_sde)extract_featuresr   r3   r7   r!   r8   rG   thclampLOG_STD_MINLOG_STD_MAX)r;   rW   featuresr3   mean_actionsr8   s         r@   get_action_dist_paramsActor.get_action_dist_params   s|     ((.E.EFNN8,	wwy)<<ty/III,,y)((7K=b((rB   deterministicc                 h    U R                  U5      u  p4nU R                  R                  " X44SU0UD6$ )Nrc   )ra   r5   actions_from_params)r;   rW   rc   r`   r8   kwargss         r@   forwardActor.forward   s:    (,(C(CC(H%v33LqYfqjpqqrB   c                 d    U R                  U5      u  p#nU R                  R                  " X#40 UD6$ N)ra   r5   log_prob_from_params)r;   rW   r`   r8   rf   s        r@   action_log_probActor.action_log_prob   s3    (,(C(CC(H%v44\UfUUrB   observationc                     U " X5      $ rj    r;   rn   rc   s      r@   _predictActor._predict   s    K//rB   )r5   r    r%   r   r#   r3   r8   r"   r7   r   r1   r$   r!      F)"__name__
__module____qualname____firstlineno____doc__r   Box__annotations__r   ReLUSpacelistintModuletypeboolfloatr0   rG   strr   rE   r[   TensorrM   rU   r   tuplera   rg   rl   rr   __static_attributes____classcell__r?   s   @r@   r   r      s   , ** *, !%5A!<<5A jj5A s)	5A
 II5A 5A BII5A 5A 5A 5A 5A 5A 5A 5AnT#s(^ $6 6Mc M$ M)* )ryy"))UYZ]_a_h_hZhUi?i9j )(r: rd rryy r
V: V%		2998L2M V
0J 0t 0PRPYPY 0 0rB   r   c            !         ^  \ rS rSr% Sr\\S'   \\S'   \\S'   S\R                  SSSS	\
SS
\R                  R                  SSS4S\R                  S\R                   S\S\\\\   \\\\   4   4      S\\R2                     S\S\S\S\S\\   S\\\\4      S\S\\R                  R<                     S\\\\4      S\S\4 U 4S jjjrS\SS4S jr S\\\4   4U 4S jjr!S-S \SS4S! jjr"S.S"\\   S\4S# jjr#S.S"\\   S\4S$ jjr$S/S%\%S&\S\RL                  4S' jjr'S/S(\%S&\S\RL                  4S) jjr(S*\SS4S+ jr)S,r*U =r+$ )0	SACPolicy   a  
Policy class (with both actor and critic) for SAC.

:param observation_space: Observation space
:param action_space: Action space
:param lr_schedule: Learning rate schedule (could be constant)
:param net_arch: The specification of the policy and value networks.
:param activation_fn: Activation function
:param use_sde: Whether to use State Dependent Exploration or not
:param log_std_init: Initial value for the log standard deviation
:param use_expln: Use ``expln()`` function instead of ``exp()`` when using gSDE to ensure
    a positive standard deviation (cf paper). It allows to keep variance
    above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
:param clip_mean: Clip the mean output when using gSDE to avoid numerical instability.
:param features_extractor_class: Features extractor to use.
:param features_extractor_kwargs: Keyword arguments
    to pass to the features extractor.
:param normalize_images: Whether to normalize images or not,
     dividing by 255.0 (True by default)
:param optimizer_class: The optimizer to use,
    ``th.optim.Adam`` by default
:param optimizer_kwargs: Additional keyword arguments,
    excluding the learning rate, to pass to the optimizer
:param n_critics: Number of critic networks to create.
:param share_features_extractor: Whether to share or not the features extractor
    between the actor and the critic (this saves computation time)
actorcriticcritic_targetNFr   r   Tr   r   r   lr_scheduler   r    r!   r"   r$   r%   features_extractor_classfeatures_extractor_kwargsr&   optimizer_classoptimizer_kwargs	n_criticsshare_features_extractorc                   > [         TU ]  UUU
UUUSUS9  Uc  SS/n[        U5      u  nnX@l        XPl        U R
                  U R                  UU R                  US.U l        U R                  R                  5       U l	        UUUU	S.nU R                  R                  U5        U R                  R                  5       U l        U R                  R                  UUUS.5        UU l        U R                  U5        g )NT)r   r   r(   r&      )r   r   r   r    r&   )r!   r"   r$   r%   )r   r   r   )r/   r0   r   r   r    r   r   net_argscopyactor_kwargsrF   critic_kwargsr   _build)r;   r   r   r   r   r    r!   r"   r$   r%   r   r   r&   r   r   r   r   
actor_archcritic_arch
sde_kwargsr?   s                       r@   r0   SACPolicy.__init__   s   & 	$%+-- 	 		
 SzH"7"A
K *!%!7!7 --"!// 0
 !MM..0 (""	

 	  ,!]]//1!!&',D	
 )A%K rB   rC   c                 L   U R                  5       U l        U R                  " U R                  R                  5       4SU" S5      0U R                  D6U R                  l        U R                  (       a_  U R                  U R                  R                  S9U l	        U R                  R                  5        VVs/ s H  u  p#SU;  d  M  UPM     nnnO7U R                  S S9U l	        [        U R                  R                  5       5      nU R                  S S9U l        U R                  R                  U R                  R                  5       5        U R                  " U4SU" S5      0U R                  D6U R                  l        U R                  R                  S5        g s  snnf )Nlrru   )r   r   F)
make_actorr   r   
parametersr   	optimizerr   make_criticr   r   named_parametersr   r   load_state_dict
state_dictset_training_mode)r;   r   nameparamcritic_parameterss        r@   r   SACPolicy._build  sk   __&
#33JJ!!# 
1~ 
 ## 


 ((**djj>[>[*\DK ;?++:V:V:X }:X;4\px|\|:X } **d*CDK $T[[%;%;%= > "---F**4;;+A+A+CD $ 4 4!
1~!
 ##!
 	,,U3% !~s   8F F c                 ~  > [         TU ]  5       nUR                  [        U R                  U R
                  S   U R                  S   U R                  S   U R                  S   U R                  S   U R                  S   U R                  U R                  U R                  U R                  U R                  S95        U$ )Nr    r!   r"   r$   r%   r   )r   r    r!   r"   r$   r%   r   r   r   r   r   r   )r/   rE   rF   rG   r   r   r   r   _dummy_scheduler   r   r   r   rH   s     r@   rE   %SACPolicy._get_constructor_parameters8  s    w24"mmO<)))4!..~>++K8++K8,,[9 00 $ 4 4!%!6!6)-)F)F*.*H*H	
  rB   rP   c                 6    U R                   R                  US9  g)rR   rS   N)r   rU   )r;   rP   s     r@   rU   SACPolicy.reset_noiseM  s     	

*5rB   r   c                     U R                  U R                  U5      n[        S0 UD6R                  U R                  5      $ Nrp   )_update_features_extractorr   r   todevice)r;   r   r   s      r@   r   SACPolicy.make_actorU  s8    66t7H7HJ\]$|$''44rB   c                     U R                  U R                  U5      n[        S0 UD6R                  U R                  5      $ r   )r   r   r   r   r   )r;   r   r   s      r@   r   SACPolicy.make_criticY  s8    778J8JL^_0-033DKK@@rB   rW   rc   c                      U R                  XS9$ )N)rc   )rr   )r;   rW   rc   s      r@   rg   SACPolicy.forward]  s    }}S}>>rB   rn   c                 $    U R                  X5      $ rj   )r   rq   s      r@   rr   SACPolicy._predict`  s    zz+55rB   modec                 |    U R                   R                  U5        U R                  R                  U5        Xl        g)z
Put the policy in either training or evaluation mode.

This affects certain modules, such as batch normalisation and dropout.

:param mode: if true, set to training mode, else set to evaluation mode
N)r   r   r   training)r;   r   s     r@   r   SACPolicy.set_training_modec  s-     	

$$T*%%d+rB   )
r    r   r   r   r   r   r   r   r   r   rt   rj   rv   ),rw   rx   ry   rz   r{   r   r}   r   r   r~   r   r[   optimAdamr   r   r|   r   r   r   r   r   rG   r   r   r   r   r   r   r   	Optimizerr0   r   rE   rU   r   r   r   r   rg   rr   r   r   r   r   s   @r@   r   r      s5   8 L## FJ)+ @P>B!%46HHMM59).#@!!<<@! jj@! 	@!
 5cDd3i,@!@AB@! BII@! @! @! @! @! #''<"=@! $,DcN#;@! @! bhh001@! #4S>2@!  !@!" #'#@! @!D4( 4t 4@T#s(^ *6c 6$ 65X6K-L 5X] 5Ah7L.M AYi A?: ?d ?ryy ?6J 6t 6PRPYPY 6
d 
t 
 
rB   r   c            !         ^  \ rS rSrSrS\R                  SSSS\SS\R                  R                  SSS4S	\R                  S
\R                  S\S\\\\   \\\\   4   4      S\\R,                     S\S\S\S\S\\   S\\\\4      S\S\\R                  R6                     S\\\\4      S\S\4 U 4S jjjrSrU =r$ )	CnnPolicyis  :  
Policy class (with both actor and critic) for SAC.

:param observation_space: Observation space
:param action_space: Action space
:param lr_schedule: Learning rate schedule (could be constant)
:param net_arch: The specification of the policy and value networks.
:param activation_fn: Activation function
:param use_sde: Whether to use State Dependent Exploration or not
:param log_std_init: Initial value for the log standard deviation
:param use_expln: Use ``expln()`` function instead of ``exp()`` when using gSDE to ensure
    a positive standard deviation (cf paper). It allows to keep variance
    above zero and prevent it from growing too fast. In practice, ``exp()`` is usually enough.
:param clip_mean: Clip the mean output when using gSDE to avoid numerical instability.
:param features_extractor_class: Features extractor to use.
:param normalize_images: Whether to normalize images or not,
     dividing by 255.0 (True by default)
:param optimizer_class: The optimizer to use,
    ``th.optim.Adam`` by default
:param optimizer_kwargs: Additional keyword arguments,
    excluding the learning rate, to pass to the optimizer
:param n_critics: Number of critic networks to create.
:param share_features_extractor: Whether to share or not the features extractor
    between the actor and the critic (this saves computation time)
NFr   r   Tr   r   r   r   r   r    r!   r"   r$   r%   r   r   r&   r   r   r   r   c                 B   > [         TU ]  UUUUUUUUU	U
UUUUUU5        g rj   r/   r0   r;   r   r   r   r   r    r!   r"   r$   r%   r   r   r&   r   r   r   r   r?   s                    r@   r0   CnnPolicy.__init__  B    & 	$%$!	
rB   rp   )rw   rx   ry   rz   r{   r   r~   r   r[   r   r   r   r   r|   r   r   r   r   r   rG   r   r   r   r   r   r   r   r   r0   r   r   r   s   @r@   r   r   s  sF   > FJ)+ @I>B!%46HHMM59).#$
!<<$
 jj$
 	$

 5cDd3i,@!@AB$
 BII$
 $
 $
 $
 $
 #''<"=$
 $,DcN#;$
 $
 bhh001$
 #4S>2$
  !$
" #'#$
 $
rB   r   c            !         ^  \ rS rSrSrS\R                  SSSS\SS\R                  R                  SSS4S	\R                  S
\R                  S\S\\\\   \\\\   4   4      S\\R,                     S\S\S\S\S\\   S\\\\4      S\S\\R                  R6                     S\\\\4      S\S\4 U 4S jjjrSrU =r$ )MultiInputPolicyi  r   NFr   r   Tr   r   r   r   r   r    r!   r"   r$   r%   r   r   r&   r   r   r   r   c                 B   > [         TU ]  UUUUUUUUU	U
UUUUUU5        g rj   r   r   s                    r@   r0   MultiInputPolicy.__init__  r   rB   rp   )rw   rx   ry   rz   r{   r   r~   r   r[   r   r   r   r   r|   r   r   r   r   r   rG   r   r   r   r   r   r   r   r   r0   r   r   r   s   @r@   r   r     sF   > FJ)+ @Q>B!%46HHMM59).#$
!<<$
 jj$
 	$

 5cDd3i,@!@AB$
 BII$
 $
 $
 $
 $
 #''<"=$
 $,DcN#;$
 $
 bhh001$
 #4S>2$
  !$
" #'#$
 $
rB   r   )"typingr   r   r   torchr[   	gymnasiumr   r   &stable_baselines3.common.distributionsr   r	   !stable_baselines3.common.policiesr
   r   &stable_baselines3.common.preprocessingr   %stable_baselines3.common.torch_layersr   r   r   r   r   r   %stable_baselines3.common.type_aliasesr   r   r^   r]   r   r   	MlpPolicyr   r   rp   rB   r@   <module>r      sw    ' '    t J A  G Y0J Y0xx
 xv 	?
	 ?
D?
y ?
rB   