
from typing import Any, Optional, Union

import torch as th
from gymnasium import spaces
from torch import nn

from stable_baselines3.common.policies import BasePolicy, ContinuousCritic
from stable_baselines3.common.preprocessing import get_action_dim
from stable_baselines3.common.torch_layers import (
    BaseFeaturesExtractor,
    CombinedExtractor,
    FlattenExtractor,
    NatureCNN,
    create_mlp,
    get_actor_critic_arch,
)
from stable_baselines3.common.type_aliases import PyTorchObs, Schedule


class Actor(BasePolicy):
    """
    Actor network (policy) for TD3.

    :param observation_space: Observation space
    :param action_space: Action space
    :param net_arch: Network architecture
    :param features_extractor: Network to extract features
        (a CNN when using images, a nn.Flatten() layer otherwise)
    :param features_dim: Number of features
    :param activation_fn: Activation function
    :param normalize_images: Whether to normalize images or not,
         dividing by 255.0 (True by default)
    """

    def __init__(
        self,
        observation_space: spaces.Space,
        action_space: spaces.Box,
        net_arch: list[int],
        features_extractor: nn.Module,
        features_dim: int,
        activation_fn: type[nn.Module] = nn.ReLU,
        normalize_images: bool = True,
    ):
        super().__init__(
            observation_space,
            action_space,
            features_extractor=features_extractor,
            normalize_images=normalize_images,
            squash_output=True,
        )

        self.net_arch = net_arch
        self.features_dim = features_dim
        self.activation_fn = activation_fn

        action_dim = get_action_dim(self.action_space)
        actor_net = create_mlp(features_dim, action_dim, net_arch, activation_fn, squash_output=True)
        # Deterministic action network: squash_output=True appends a Tanh,
        # so the output is always in [-1, 1]
        self.mu = nn.Sequential(*actor_net)

    def _get_constructor_parameters(self) -> dict[str, Any]:
        data = super()._get_constructor_parameters()

        data.update(
            dict(
                net_arch=self.net_arch,
                features_dim=self.features_dim,
                activation_fn=self.activation_fn,
                features_extractor=self.features_extractor,
            )
        )
        return data

    def forward(self, obs: th.Tensor) -> th.Tensor:
        features = self.extract_features(obs, self.features_extractor)
        return self.mu(features)

    def _predict(self, observation: PyTorchObs, deterministic: bool = False) -> th.Tensor:
        # Note: the deterministic parameter is ignored in the case of TD3.
        # Predictions are always deterministic.
        return self(observation)
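# Usage sketch (illustration only, not exercised by the library itself): how an ``Actor`` can be
# built and queried by hand. In normal use ``TD3Policy.make_actor()`` does this internally;
# the environment id ("Pendulum-v1") is just an example of a Box observation/action space.
#
#     >>> import gymnasium as gym
#     >>> env = gym.make("Pendulum-v1")
#     >>> extractor = FlattenExtractor(env.observation_space)
#     >>> actor = Actor(
#     ...     env.observation_space,
#     ...     env.action_space,
#     ...     net_arch=[400, 300],
#     ...     features_extractor=extractor,
#     ...     features_dim=extractor.features_dim,
#     ... )
#     >>> obs, _ = env.reset()
#     >>> action = actor(th.as_tensor(obs, dtype=th.float32).unsqueeze(0))  # shape (1, 1), values in [-1, 1]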
class TD3Policy(BasePolicy):
    """
    Policy class (with both actor and critic) for TD3.

    :param observation_space: Observation space
    :param action_space: Action space
    :param lr_schedule: Learning rate schedule (could be constant)
    :param net_arch: The specification of the policy and value networks.
    :param activation_fn: Activation function
    :param features_extractor_class: Features extractor to use.
    :param features_extractor_kwargs: Keyword arguments
        to pass to the features extractor.
    :param normalize_images: Whether to normalize images or not,
         dividing by 255.0 (True by default)
    :param optimizer_class: The optimizer to use,
        ``th.optim.Adam`` by default
    :param optimizer_kwargs: Additional keyword arguments,
        excluding the learning rate, to pass to the optimizer
    :param n_critics: Number of critic networks to create.
    :param share_features_extractor: Whether to share or not the features extractor
        between the actor and the critic (this saves computation time)
    """

    actor: Actor
    actor_target: Actor
    critic: ContinuousCritic
    critic_target: ContinuousCritic

    def __init__(
        self,
        observation_space: spaces.Space,
        action_space: spaces.Box,
        lr_schedule: Schedule,
        net_arch: Optional[Union[list[int], dict[str, list[int]]]] = None,
        activation_fn: type[nn.Module] = nn.ReLU,
        features_extractor_class: type[BaseFeaturesExtractor] = FlattenExtractor,
        features_extractor_kwargs: Optional[dict[str, Any]] = None,
        normalize_images: bool = True,
        optimizer_class: type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[dict[str, Any]] = None,
        n_critics: int = 2,
        share_features_extractor: bool = False,
    ):
        super().__init__(
            observation_space,
            action_space,
            features_extractor_class,
            features_extractor_kwargs,
            optimizer_class=optimizer_class,
            optimizer_kwargs=optimizer_kwargs,
            squash_output=True,
            normalize_images=normalize_images,
        )

        # Default network architecture, from the original paper
        if net_arch is None:
            if features_extractor_class == NatureCNN:
                net_arch = [256, 256]
            else:
                net_arch = [400, 300]

        actor_arch, critic_arch = get_actor_critic_arch(net_arch)

        self.net_arch = net_arch
        self.activation_fn = activation_fn
        self.net_args = {
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "net_arch": actor_arch,
            "activation_fn": self.activation_fn,
            "normalize_images": normalize_images,
        }
        self.actor_kwargs = self.net_args.copy()
        self.critic_kwargs = self.net_args.copy()
        self.critic_kwargs.update(
            {
                "n_critics": n_critics,
                "net_arch": critic_arch,
                "share_features_extractor": share_features_extractor,
            }
        )

        self.share_features_extractor = share_features_extractor

        self._build(lr_schedule)

    def _build(self, lr_schedule: Schedule) -> None:
        # Create actor and actor target (they never share a features extractor)
        self.actor = self.make_actor(features_extractor=None)
        self.actor_target = self.make_actor(features_extractor=None)
        # Initialize the target to have the same weights as the actor
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor.optimizer = self.optimizer_class(
            self.actor.parameters(),
            lr=lr_schedule(1),  # learning rate at the start of training (progress_remaining=1)
            **self.optimizer_kwargs,
        )

        if self.share_features_extractor:
            self.critic = self.make_critic(features_extractor=self.actor.features_extractor)
            # The critic target reuses the features extractor of the actor target,
            # not the one of the trained critic, so that the target networks stay decoupled
            self.critic_target = self.make_critic(features_extractor=self.actor_target.features_extractor)
        else:
            # Create a new features extractor for each network
            self.critic = self.make_critic(features_extractor=None)
            self.critic_target = self.make_critic(features_extractor=None)

        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic.optimizer = self.optimizer_class(
            self.critic.parameters(),
            lr=lr_schedule(1),  # learning rate at the start of training (progress_remaining=1)
            **self.optimizer_kwargs,
        )

        # Target networks should always be in eval mode
        self.actor_target.set_training_mode(False)
        self.critic_target.set_training_mode(False)

    def _get_constructor_parameters(self) -> dict[str, Any]:
        data = super()._get_constructor_parameters()

        data.update(
            dict(
                net_arch=self.net_arch,
                activation_fn=self.net_args["activation_fn"],
                n_critics=self.critic_kwargs["n_critics"],
                lr_schedule=self._dummy_schedule,  # dummy lr schedule, not needed for loading policy alone
                optimizer_class=self.optimizer_class,
                optimizer_kwargs=self.optimizer_kwargs,
                features_extractor_class=self.features_extractor_class,
                features_extractor_kwargs=self.features_extractor_kwargs,
                share_features_extractor=self.share_features_extractor,
            )
        )
        return data

    def make_actor(self, features_extractor: Optional[BaseFeaturesExtractor] = None) -> Actor:
        actor_kwargs = self._update_features_extractor(self.actor_kwargs, features_extractor)
        return Actor(**actor_kwargs).to(self.device)

    def make_critic(self, features_extractor: Optional[BaseFeaturesExtractor] = None) -> ContinuousCritic:
        critic_kwargs = self._update_features_extractor(self.critic_kwargs, features_extractor)
        return ContinuousCritic(**critic_kwargs).to(self.device)

    def forward(self, observation: PyTorchObs, deterministic: bool = False) -> th.Tensor:
        return self._predict(observation, deterministic=deterministic)

    def _predict(self, observation: PyTorchObs, deterministic: bool = False) -> th.Tensor:
        # Note: the deterministic parameter is ignored in the case of TD3.
        # Predictions are always deterministic.
        return self.actor(observation)

    def set_training_mode(self, mode: bool) -> None:
        """
        Put the policy in either training or evaluation mode.

        This affects certain modules, such as batch normalisation and dropout.

        :param mode: if true, set to training mode, else set to evaluation mode
        """
        self.actor.set_training_mode(mode)
        self.critic.set_training_mode(mode)
        self.training = mode


MlpPolicy = TD3Policy


class CnnPolicy(TD3Policy):
    """
    Policy class (with both actor and critic) for TD3.

    :param observation_space: Observation space
    :param action_space: Action space
    :param lr_schedule: Learning rate schedule (could be constant)
    :param net_arch: The specification of the policy and value networks.
    :param activation_fn: Activation function
    :param features_extractor_class: Features extractor to use.
    :param features_extractor_kwargs: Keyword arguments
        to pass to the features extractor.
    :param normalize_images: Whether to normalize images or not,
         dividing by 255.0 (True by default)
    :param optimizer_class: The optimizer to use,
        ``th.optim.Adam`` by default
    :param optimizer_kwargs: Additional keyword arguments,
        excluding the learning rate, to pass to the optimizer
    :param n_critics: Number of critic networks to create.
    :param share_features_extractor: Whether to share or not the features extractor
        between the actor and the critic (this saves computation time)
    """

    def __init__(
        self,
        observation_space: spaces.Space,
        action_space: spaces.Box,
        lr_schedule: Schedule,
        net_arch: Optional[Union[list[int], dict[str, list[int]]]] = None,
        activation_fn: type[nn.Module] = nn.ReLU,
        features_extractor_class: type[BaseFeaturesExtractor] = NatureCNN,
        features_extractor_kwargs: Optional[dict[str, Any]] = None,
        normalize_images: bool = True,
        optimizer_class: type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[dict[str, Any]] = None,
        n_critics: int = 2,
        share_features_extractor: bool = False,
    ):
        super().__init__(
            observation_space,
            action_space,
            lr_schedule,
            net_arch,
            activation_fn,
            features_extractor_class,
            features_extractor_kwargs,
            normalize_images,
            optimizer_class,
            optimizer_kwargs,
            n_critics,
            share_features_extractor,
        )
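# Usage sketch (illustration only): ``MlpPolicy`` and ``CnnPolicy`` are normally selected by name
# when constructing the TD3 algorithm; the environment ids below are just examples, and an
# image-based environment with a continuous action space is assumed for ``CnnPolicy``.
#
#     >>> from stable_baselines3 import TD3
#     >>> model = TD3("MlpPolicy", "Pendulum-v1", policy_kwargs=dict(net_arch=[400, 300], n_critics=2))
#     >>> model.learn(total_timesteps=10_000)
#     >>> # For image observations, e.g. CarRacing-v2:
#     >>> model = TD3("CnnPolicy", "CarRacing-v2", buffer_size=50_000)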
class MultiInputPolicy(TD3Policy):
    """
    Policy class (with both actor and critic) for TD3 to be used with Dict observation spaces.

    :param observation_space: Observation space
    :param action_space: Action space
    :param lr_schedule: Learning rate schedule (could be constant)
    :param net_arch: The specification of the policy and value networks.
    :param activation_fn: Activation function
    :param features_extractor_class: Features extractor to use.
    :param features_extractor_kwargs: Keyword arguments
        to pass to the features extractor.
    :param normalize_images: Whether to normalize images or not,
         dividing by 255.0 (True by default)
    :param optimizer_class: The optimizer to use,
        ``th.optim.Adam`` by default
    :param optimizer_kwargs: Additional keyword arguments,
        excluding the learning rate, to pass to the optimizer
    :param n_critics: Number of critic networks to create.
    :param share_features_extractor: Whether to share or not the features extractor
        between the actor and the critic (this saves computation time)
    """

    def __init__(
        self,
        observation_space: spaces.Dict,
        action_space: spaces.Box,
        lr_schedule: Schedule,
        net_arch: Optional[Union[list[int], dict[str, list[int]]]] = None,
        activation_fn: type[nn.Module] = nn.ReLU,
        features_extractor_class: type[BaseFeaturesExtractor] = CombinedExtractor,
        features_extractor_kwargs: Optional[dict[str, Any]] = None,
        normalize_images: bool = True,
        optimizer_class: type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[dict[str, Any]] = None,
        n_critics: int = 2,
        share_features_extractor: bool = False,
    ):
        super().__init__(
            observation_space,
            action_space,
            lr_schedule,
            net_arch,
            activation_fn,
            features_extractor_class,
            features_extractor_kwargs,
            normalize_images,
            optimizer_class,
            optimizer_kwargs,
            n_critics,
            share_features_extractor,
        )
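# Usage sketch (illustration only): ``MultiInputPolicy`` is the variant to select when the
# environment exposes a ``spaces.Dict`` observation space, for instance a goal-conditioned
# robotics task; ``dict_obs_env`` below stands in for any such environment instance.
#
#     >>> from stable_baselines3 import TD3
#     >>> model = TD3("MultiInputPolicy", dict_obs_env)
#     >>> model.learn(total_timesteps=10_000)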