
    h/-                         S SK JrJrJrJrJr  S SKrS SKr	S SK
Jr  S SKJr  S SKJr  S SKJr  S SKJr  S SKJrJr  S S	KJrJrJr  S S
KJrJr  S SKJ r J!r!J"r"J#r#J$r$  \" SSS9r% " S S\5      r&g)    )AnyClassVarOptionalTypeVarUnionN)spaces)
functional)ReplayBuffer)ActionNoise)OffPolicyAlgorithm)
BasePolicyContinuousCritic)GymEnvMaybeCallbackSchedule)get_parameters_by_namepolyak_update)Actor	CnnPolicy	MlpPolicyMultiInputPolicy	TD3PolicySelfTD3TD3)boundc            3       j  ^  \ rS rSr% Sr\\\S.r\	\
\\\   4      \S'   \\S'   \\S'   \\S'   \\S'   \\S	'                          S3S\\\\   4   S\\\4   S\\\4   S\S\S\S\S\S\\\\\4   4   S\S\\   S\\\      S\\
\\4      S\S\S\S\S\S\S\\   S\\
\\4      S\S \\   S!\\R<                  \4   S"\42U 4S# jjjrS4U 4S% jjr S4S& jr!S5S\S\S$S
4S' jjr"     S6S(\#S)\S*\$S+\S,\S-\S.\S$\#4U 4S/ jjjr%S$\&\   4U 4S0 jjr'S$\\&\   \&\   4   4S1 jr(S2r)U =r*$ )7r      a  
Twin Delayed DDPG (TD3)
Addressing Function Approximation Error in Actor-Critic Methods.

Original implementation: https://github.com/sfujim/TD3
Paper: https://arxiv.org/abs/1802.09477
Introduction to TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html

:param policy: The policy model to use (MlpPolicy, CnnPolicy, ...)
:param env: The environment to learn from (if registered in Gym, can be str)
:param learning_rate: learning rate for adam optimizer,
    the same learning rate will be used for all networks (Q-Values, Actor and Value function)
    it can be a function of the current progress remaining (from 1 to 0)
:param buffer_size: size of the replay buffer
:param learning_starts: how many steps of the model to collect transitions for before learning starts
:param batch_size: Minibatch size for each gradient update
:param tau: the soft update coefficient ("Polyak update", between 0 and 1)
:param gamma: the discount factor
:param train_freq: Update the model every ``train_freq`` steps. Alternatively pass a tuple of frequency and unit
    like ``(5, "step")`` or ``(2, "episode")``.
:param gradient_steps: How many gradient steps to do after each rollout (see ``train_freq``)
    Set to ``-1`` means to do as many gradient steps as steps done in the environment
    during the rollout.
:param action_noise: the action noise type (None by default), this can help
    for hard exploration problem. Cf common.noise for the different action noise type.
:param replay_buffer_class: Replay buffer class to use (for instance ``HerReplayBuffer``).
    If ``None``, it will be automatically selected.
:param replay_buffer_kwargs: Keyword arguments to pass to the replay buffer on creation.
:param optimize_memory_usage: Enable a memory efficient variant of the replay buffer
    at a cost of more complexity.
    See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
:param n_steps: When n_step > 1, uses n-step return (with the NStepReplayBuffer) when updating the Q-value network.
:param policy_delay: Policy and target networks will only be updated once every policy_delay steps
    per training steps. The Q values will be updated policy_delay more often (update every training step).
:param target_policy_noise: Standard deviation of Gaussian noise added to target policy
    (smoothing noise)
:param target_noise_clip: Limit for absolute value of target policy smoothing noise.
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
    the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param policy_kwargs: additional arguments to be passed to the policy on creation. See :ref:`td3_policies`
:param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
    debug messages
:param seed: Seed for the pseudo random generators
:param device: Device (cpu, cuda, ...) on which the code should be run.
    Setting it to auto, the code will be run on the GPU if possible.
:param _init_setup_model: Whether or not to build the network at the creation of the instance
)r   r   r   policy_aliasespolicyactoractor_targetcriticcritic_targetNenvlearning_ratebuffer_sizelearning_starts
batch_sizetaugamma
train_freqgradient_stepsaction_noisereplay_buffer_classreplay_buffer_kwargsoptimize_memory_usagen_stepspolicy_delaytarget_policy_noisetarget_noise_clipstats_window_sizetensorboard_logpolicy_kwargsverboseseeddevice_init_setup_modelc                    > [         TU ]  UUUUUUUUU	U
UUUUUUUUUUUS[        R                  4SS9  UU l        UU l        UU l        U(       a  U R                  5         g g )NFT)r-   r.   r/   r0   r1   r7   r5   r6   r8   r:   r9   sde_supportsupported_action_spacessupport_multi_env)super__init__r   Boxr2   r4   r3   _setup_model)selfr   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   	__class__s                             S/home/james-whalen/.local/lib/python3.13/site-packages/stable_baselines3/td3/td3.pyrA   TD3.__init__P   s    8 	% 3!5"7'/+%+ZZM"1 	 	
6 )!2#6      returnc                 "  > [         TU ]  5         U R                  5         [        U R                  S/5      U l        [        U R                  S/5      U l        [        U R                  S/5      U l	        [        U R                  S/5      U l        g )Nrunning_)r@   rC   _create_aliasesr   r    actor_batch_norm_statsr"   critic_batch_norm_statsr!   actor_batch_norm_stats_targetr#   critic_batch_norm_stats_targetrD   rE   s    rF   rC   TD3._setup_model   sv    &<TZZ*&V#'=dkkJ<'X$-CDDUDUXbWc-d*.DTEWEWZdYe.f+rH   c                     U R                   R                  U l        U R                   R                  U l        U R                   R                  U l        U R                   R                  U l        g N)r   r    r!   r"   r#   )rD   s    rF   rL   TD3._create_aliases   sH    [[&&
 KK44kk((![[66rH   c                 *
  ^ U R                   R                  S5        U R                  U R                  R                  U R
                  R                  /5        / / pC[        U5       GH  nU =R                  S-  sl        U R                  R                  X R                  S9nUR                  b  UR                  OU R                  n[        R                  " 5          UR                  R!                  5       R"                  R%                  SU R&                  5      nUR)                  U R*                  * U R*                  5      nU R-                  UR.                  5      U-   R)                  SS5      n	[        R0                  " U R3                  UR.                  U	5      SS9n
[        R4                  " U
SSS9u  pUR6                  SUR8                  -
  U-  U
-  -   mS S S 5        U R                  UR:                  UR                  5      n[=        U4S jU 5       5      n[?        U[        R@                  5      (       d   eURC                  URE                  5       5        U R
                  R                  RG                  5         URI                  5         U R
                  R                  RK                  5         U R                  U RL                  -  S:X  d  GMn  U R
                  RO                  UR:                  U R                  UR:                  5      5      RQ                  5       * nURC                  URE                  5       5        U R                  R                  RG                  5         URI                  5         U R                  R                  RK                  5         [S        U R
                  RU                  5       U R2                  RU                  5       U RV                  5        [S        U R                  RU                  5       U R,                  RU                  5       U RV                  5        [S        U RX                  U RZ                  S	5        [S        U R\                  U R^                  S	5        GM     U R`                  Rc                  S
U R                  SS9  [e        U5      S:  a0  U R`                  Rc                  S[f        RP                  " U5      5        U R`                  Rc                  S[f        RP                  " U5      5        g ! , (       d  f       GN1= f)NT   )r$   r   )dim)rY   keepdimc              3   R   >#    U  H  n[         R                  " UT5      v   M     g 7frT   )Fmse_loss).0	current_qtarget_q_valuess     rF   	<genexpr>TD3.train.<locals>.<genexpr>   s!     gVfajjODDVfs   $'g      ?ztrain/n_updatestensorboard)excludeztrain/actor_lossztrain/critic_loss)4r   set_training_mode_update_learning_rater    	optimizerr"   range
_n_updatesreplay_buffersample_vec_normalize_env	discountsr*   thno_gradactionsclonedatanormal_r3   clampr4   r!   next_observationscatr#   minrewardsdonesobservationssum
isinstanceTensorappenditem	zero_gradbackwardstepr2   
q1_forwardmeanr   
parametersr)   rN   rP   rM   rO   loggerrecordlennp)rD   r,   r(   actor_lossescritic_losses_replay_datarm   noisenext_actionsnext_q_valuescurrent_q_valuescritic_loss
actor_lossr`   s                 @rF   train	TD3.train   s   %%d+ 	""DJJ$8$8$++:O:O#PQ&("m~&AOOq O,,33JD[D[3\K1<1F1F1R--X\XbXbI#++11388@@DD\D\]T%;%;$;T=S=ST $ 1 1+2O2O PSX X__`bdef !#t'9'9+:W:WYe'flm n#%66-Q#M "-"5"5[=N=N9NR[8[^k8k"k   ${{;+C+C[EXEXY gVfggKk2995555  !1!1!34 KK!!++-  "KK!!&&( !2!22a7"kk44[5M5MtzzZeZrZrOstyy{{
##JOO$56 

$$..0##%

$$))+dkk4468J8J8U8U8WY]YaYabdjj335t7H7H7S7S7UW[W_W_`d::D<_<_aded994;];]_bc] '` 	,doo}U|q KK1277<3HI.0FGY s   C=T
T	rD   total_timestepscallbacklog_intervaltb_log_namereset_num_timestepsprogress_barc           	      (   > [         TU ]  UUUUUUS9$ )N)r   r   r   r   r   r   )r@   learn)rD   r   r   r   r   r   r   rE   s          rF   r   	TD3.learn   s-     w}+%# 3%  
 	
rH   c                 *   > [         TU ]  5       / SQ-   $ )N)r    r"   r!   r#   )r@   _excluded_save_paramsrQ   s    rF   r   TD3._excluded_save_params   s    w,.1eeerH   c                     / SQnU/ 4$ )N)r   zactor.optimizerzcritic.optimizer )rD   state_dictss     rF   _get_torch_save_paramsTD3._get_torch_save_params   s    GBrH   )r    rM   rO   r!   r"   rN   rP   r#   r2   r4   r3   )gMbP?i@B d      g{Gzt?gGz?rW   rW   NNNFrW      g?g      ?r   NNr   NautoT)rI   N)r   )N   r   TF)+__name__
__module____qualname____firstlineno____doc__r   r   r   r   r   dictstrtyper   __annotations__r   r   r   r   r   floatr   inttupler   r   r
   r   boolrn   r:   rA   rC   rL   r   r   r   r   listr   r   __static_attributes____classcell__)rE   s   @rF   r   r      s   /d ,=NHT#tJ'7"789 
 L## 15$"23.2<@9=&+%(#&!$)-26"(."&5< c4	?*+<  63;<  UH_-	< 
 <  <  <  <  <  #uS#X./<  <  {+<  &d<&89<  'tCH~6<   $<   !< " #< $ #%< & !'< ( )< * "#+< ,  S#X/-< . /< 0 sm1< 2 biin%3< 4  5<  < |g7;HC ;HS ;H4 ;H@ #' $("


  
 	

 
 "
 
 

 
$ftCy fd3ic.B(C  rH   )'typingr   r   r   r   r   numpyr   torchrn   	gymnasiumr   torch.nnr	   r\    stable_baselines3.common.buffersr
   stable_baselines3.common.noiser   -stable_baselines3.common.off_policy_algorithmr   !stable_baselines3.common.policiesr   r   %stable_baselines3.common.type_aliasesr   r   r   stable_baselines3.common.utilsr   r   stable_baselines3.td3.policiesr   r   r   r   r   r   r   r   rH   rF   <module>r      sM    : :    $ 9 6 L J Q Q P c c
)5
)^
 ^rH   