
    h@k                     0   S SK r S SKrS SKrS SKrS SKrS SKJr  S SKJrJ	r	J
r
Jr  S SKrS SKrS SKJr  S SKJr  S SKJrJrJr  S SKJr  S SKJrJr  S S	KJr  S S
KJ r J!r!  S SK"J#r#J$r$J%r%J&r&J'r'J(r(  S SK)J*r*J+r+  S SK,J-r-  S SK.J/r/  \
" SSS9r0 " S S\5      r1g)    N)deepcopy)AnyOptionalTypeVarUnion)spaces)BaseAlgorithm)DictReplayBufferNStepReplayBufferReplayBuffer)BaseCallback)ActionNoiseVectorizedActionNoise)
BasePolicy)load_from_pklsave_to_pkl)GymEnvMaybeCallbackRolloutReturnSchedule	TrainFreqTrainFrequencyUnit)	safe_meanshould_collect_more_steps)VecEnv)HerReplayBufferSelfOffPolicyAlgorithmOffPolicyAlgorithm)boundc            <       Z  ^  \ rS rSr% Sr\R                  R                  \S'                            SAS\	\
\\   4   S\	\\
4   S\	\\4   S\S	\S
\S\S\S\	\\\\
4   4   S\S\\   S\\\      S\\\
\4      S\S\S\\\
\4      S\S\\
   S\S\	\R0                  \
4   S\S\S\\   S\S\S\S\S \\\\R4                     S!4      48U 4S" jjjrSBS$ jrSBS% jrS&\	\
\R>                  \ RB                  4   S#S4S' jr" SCS&\	\
\R>                  \ RB                  4   S(\S#S4S) jjr#    SDS*\S+\$S,\S-\
S.\S#\\\%4   4U 4S/ jjjr&     SES0\'S*\S+\$S1\S-\
S,\S.\S#\'4S2 jjr(S\S
\S#S4S3 jr)  SFS	\S\\   S4\S#\\*RV                  \*RV                  4   4S5 jjr,SBS6 jr-SBS7 jr.S8\S9\*RV                  S:\	\*RV                  \\
\*RV                  4   4   S;\*RV                  S<\*RV                  S=\/\\
\4      S#S4S> jr0   SGS\1S+\%S\2S8\S\\   S	\S1\\   S#\34S? jjr4S@r5U =r6$ )Hr      a{  
The base for Off-Policy algorithms (ex: SAC/TD3)

:param policy: The policy model to use (MlpPolicy, CnnPolicy, ...)
:param env: The environment to learn from
            (if registered in Gym, can be str. Can be None for loading trained models)
:param learning_rate: learning rate for the optimizer,
    it can be a function of the current progress remaining (from 1 to 0)
:param buffer_size: size of the replay buffer
:param learning_starts: how many steps of the model to collect transitions for before learning starts
:param batch_size: Minibatch size for each gradient update
:param tau: the soft update coefficient ("Polyak update", between 0 and 1)
:param gamma: the discount factor
:param train_freq: Update the model every ``train_freq`` steps. Alternatively pass a tuple of frequency and unit
    like ``(5, "step")`` or ``(2, "episode")``.
:param gradient_steps: How many gradient steps to do after each rollout (see ``train_freq``)
    Set to ``-1`` means to do as many gradient steps as steps done in the environment
    during the rollout.
:param action_noise: the action noise type (None by default), this can help
    for hard exploration problem. Cf common.noise for the different action noise type.
:param replay_buffer_class: Replay buffer class to use (for instance ``HerReplayBuffer``).
    If ``None``, it will be automatically selected.
:param replay_buffer_kwargs: Keyword arguments to pass to the replay buffer on creation.
:param optimize_memory_usage: Enable a memory efficient variant of the replay buffer
    at a cost of more complexity.
    See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
:param n_steps: When n_step > 1, uses n-step return (with the NStepReplayBuffer) when updating the Q-value network.
:param policy_kwargs: Additional arguments to be passed to the policy on creation
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
    the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
    debug messages
:param device: Device on which the code should run.
    By default, it will try to use a Cuda compatible device and fallback to cpu
    if it is not possible.
:param support_multi_env: Whether the algorithm supports training
    with multiple environments (as in A2C)
:param monitor_wrapper: When creating an environment, whether to wrap it
    or not in a Monitor wrapper.
:param seed: Seed for the pseudo random generators
:param use_sde: Whether to use State Dependent Exploration (SDE)
    instead of action noise exploration (default: False)
:param sde_sample_freq: Sample a new noise matrix every n steps when using gSDE
    Default: -1 (only sample at the beginning of the rollout)
:param use_sde_at_warmup: Whether to use gSDE instead of uniform sampling
    during the warm up phase (before learning starts)
:param sde_support: Whether the model support gSDE or not
:param supported_action_spaces: The action spaces supported by the algorithm.
actorNpolicyenvlearning_ratebuffer_sizelearning_starts
batch_sizetaugamma
train_freqgradient_stepsaction_noisereplay_buffer_classreplay_buffer_kwargsoptimize_memory_usagen_stepspolicy_kwargsstats_window_sizetensorboard_logverbosedevicesupport_multi_envmonitor_wrapperseeduse_sdesde_sample_frequse_sde_at_warmupsde_supportsupported_action_spaces.c                 :  > [         TU ]  UUUUUUUUUUUUUUS9  X@l        X`l        XPl        Xpl        Xl        Xl        Xl        Xl	        S U l
        Xl        U=(       d    0 U l        Xl        Xl        U(       a  U R                  U R                   S'   UU l        g )N)r#   r$   r%   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r>   r:   )super__init__r&   r(   r'   r)   r*   r,   r-   r0   replay_bufferr.   r/   r1   r+   r:   r2   r<   )selfr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   	__class__s                                g/home/james-whalen/.local/lib/python3.13/site-packages/stable_baselines3/common/off_policy_algorithm.pyrA   OffPolicyAlgorithm.__init__Q   s    > 	''/+/++$; 	 	
  '$.
,(%:"59#6 $8$>B! % ,0LLDy)!2    returnc                 l   [        U R                  [        5      (       dp  U R                  n[        U[        5      (       d  US4n US   [	        US   5      4n[        US   [        5      (       d  [        SUS    35      e[        U6 U l        gg! [
         a  n[        SUS    S35      UeSnAff = f)zF
Convert `train_freq` parameter (int or tuple)
to a TrainFreq object.
stepr      zEThe unit of the `train_freq` must be either 'step' or 'episode' not 'z'!Nz9The frequency of `train_freq` must be an integer and not )
isinstancer+   r   tupler   
ValueErrorint)rC   r+   es      rE   _convert_train_freq&OffPolicyAlgorithm._convert_train_freq   s    
 $//955J j%00(&1
(m-?
1-NO
 jmS11 #\]ghi]j\k!lmm'4DO# 6   [\fgh\i[jjlms   B 
B3B..B3c                 <   U R                  5         U R                  U R                  5        U R                  c  [	        U R
                  [        R                  5      (       a#  [        U l        U R                  S:X  d   S5       eOYU R                  S:  a>  [        U l        U R                  R                  U R                  U R                  S.5        O[        U l        U R                  c  U R                  R!                  5       n[#        U R                  [$        5      (       a#  U R&                  c   S5       eU R&                  US'   U R                  " U R(                  U R
                  U R*                  4U R,                  U R.                  U R0                  S.UD6U l        U R2                  " U R
                  U R*                  U R4                  40 U R6                  D6U l        U R8                  R;                  U R,                  5      U l        U R=                  5         g )NrK   zAN-step returns are not supported for Dict observation spaces yet.)r1   r*   z9You must pass an environment when using `HerReplayBuffer`r$   )r6   n_envsr0   )_setup_lr_scheduleset_random_seedr9   r.   rL   observation_spacer   Dictr
   r1   r   r/   updater*   r   rB   copy
issubclassr   r$   r&   action_spacer6   rT   r0   policy_classlr_scheduler2   r#   torQ   )rC   r/   s     rE   _setup_modelOffPolicyAlgorithm._setup_model   s   !TYY'##+$00&++>>+;(||q(m*mm(!+<())00T\\TXT^T^1_`+7(% $(#<#<#A#A#C $22ODDxx+h-hh+.2hh$U+!%!9!9  &&!!" {{{{&*&@&@" '"D ''""
   	
 kknnT[[1 	  "rG   pathc                 l    U R                   c   S5       e[        XR                   U R                  5        g)z
Save the replay buffer as a pickle file.

:param path: Path to the file where the replay buffer should be saved.
    if path is a str or pathlib.Path, the path is automatically created if necessary.
Nz The replay buffer is not defined)rB   r   r5   )rC   rb   s     rE   save_replay_buffer%OffPolicyAlgorithm.save_replay_buffer   s0     !!-Q/QQ-D,,dll;rG   truncate_last_trajc                 v   [        XR                  5      U l        [        U R                  [        5      (       d   S5       e[        U R                  S5      (       dJ  SU R                  l        [        R                  " U R                  R                  5      U R                  l
        [        U R                  [        5      (       aZ  U R                  c   S5       eU R                  R                  U R                  5        U(       a  U R                  R                  5         U R                  U R                  l        g)aw  
Load a replay buffer from a pickle file.

:param path: Path to the pickled replay buffer.
:param truncate_last_traj: When using ``HerReplayBuffer`` with online sampling:
    If set to ``True``, we assume that the last trajectory in the replay buffer was finished
    (and truncate it).
    If set to ``False``, we assume that we continue the same trajectory (same episode).
z6The replay buffer must inherit from ReplayBuffer classhandle_timeout_terminationFNzFYou must pass an environment at load time when using `HerReplayBuffer`)r   r5   rB   rL   r   hasattrrh   np
zeros_likedonestimeoutsr   r$   set_envtruncate_last_trajectoryr6   )rC   rb   rf   s      rE   load_replay_buffer%OffPolicyAlgorithm.load_replay_buffer   s     +4>$,,l;;u=uu; t))+GHH<AD9*,--8J8J8P8P*QD'd((/::88'q)qq'&&txx0!"";;= %)KK!rG   total_timestepscallbackreset_num_timestepstb_log_nameprogress_barc                   > U R                   nU R                  =(       a6    U=(       a-    USL=(       a"    UR                  =(       d    UR                  S:  nU(       aF  [        R
                  " S5        Uc   eUR                  S-
  UR                  -  nSUR                  U'   U R                  c   S5       eU R                  bh  U R                  R                  S:  aN  [        U R                  [        5      (       d/  [        U R                  U R                  R                  5      U l	        [        T	U ]9  UUUUU5      $ )z
cf `BaseAlgorithm`.
Nr   zThe last trajectory in the replay buffer will be truncated, see https://github.com/DLR-RM/stable-baselines3/issues/46.You should use `reset_num_timesteps=False` or `optimize_memory_usage=False`to avoid that issue.rK   Tz:You must set the environment before calling _setup_learn())rB   r0   fullposwarningswarnr&   rl   r$   r-   num_envsrL   r   r@   _setup_learn)
rC   rr   rs   rt   ru   rv   rB   rf   ry   rD   s
            rE   r}   OffPolicyAlgorithm._setup_learn  s.    ** && >#>T)> ##<}'8'81'<	 	 MM' !,,, $$q(M,E,EEC'+M$xx#a%aa# )!!A%t002GHH 5d6G6GIZIZ [Dw#
 	
rG   rC   log_intervalc           
         U R                  UUUUU5      u  pUR                  [        5       [        5       5        U R                  c   S5       e[        U R                  [        5      (       d   eU R                  U:  a  U R                  U R                  U R                  U R                  UU R                  U R                  US9nUR                  (       d  OU R                  S:  ab  U R                  U R                  :  aH  U R                  S:  a  U R                  OUR                  nUS:  a  U R!                  U R"                  US9  U R                  U:  a  M  UR%                  5         U $ )Nz3You must set the environment before calling learn())r+   r-   rs   r'   rB   r   r   )r(   r,   )r}   on_training_startlocalsglobalsr$   rL   r+   r   num_timestepscollect_rolloutsr-   r'   rB   continue_trainingr,   episode_timestepstrainr(   on_training_end)	rC   rr   rs   r   ru   rt   rv   rolloutr,   s	            rE   learnOffPolicyAlgorithm.learn8  sL    %)$5$5%
! 	""68WY7xx#Z%ZZ#$//95555  ?2++??!..! $ 4 4"00) , G ,,!!A%$*<*<t?S?S*S 9=8K8Kq8P!4!4V]VoVo!A%JJ$//.JY)   ?2, 	  "rG   c                     [        5       e)z[
Sample the replay buffer and do the updates
(gradient descent and update target networks)
)NotImplementedError)rC   r,   r(   s      rE   r   OffPolicyAlgorithm.trainh  s    
 "##rG   rT   c                 z   U R                   U:  aj  U R                  (       a  U R                  (       dH  [        R                  " [        U5       Vs/ s H  o@R                  R                  5       PM     sn5      nO0U R                  c   S5       eU R                  U R                  SS9u  pT[        U R                  [        R                  5      (       a]  U R                  R                  U5      nUb  [        R                  " Xb" 5       -   SS5      nUnU R                  R!                  U5      nX4$ UnUnX4$ s  snf )a  
Sample an action according to the exploration policy.
This is either done by sampling the probability distribution of the policy,
or sampling a random action (from a uniform distribution over the action space)
or by adding noise to the deterministic output.

:param action_noise: Action noise that will be used for exploration
    Required for deterministic policy (e.g. TD3). This can also be used
    in addition to the stochastic policy for SAC.
:param learning_starts: Number of steps before learning for the warm-up phase.
:param n_envs:
:return: action to take in the environment
    and scaled action that will be stored in the replay buffer.
    The two differs when the action space is not normalized (bounds are not [-1, 1]).
zself._last_obs was not setF)deterministicrK   )r   r:   r<   rj   arrayranger\   sample	_last_obspredictrL   r   Boxr#   scale_actionclipunscale_action)	rC   r'   r-   rT   _unscaled_actionscaled_actionbuffer_actionactions	            rE   _sample_action!OffPolicyAlgorithm._sample_actiono  s   , /$J`J` hhERXM'ZMq(9(9(@(@(BM'Z[O
 >>-K/KK-!%dnnE!RO d''44 KK44_EM ' "(FA N *M[[//>F
 $$ ,M"F$$/ ([s   #D8c           
      J   U R                   c   eU R                  c   e[        [        R                  " 5       U R
                  -
  S-  [        R                  R                  5      n[        U R                  U R                  -
  U-  5      nU R                  R                  SU R                  SS9  [        U R                   5      S:  a  [        U R                   S   5      S:  a  U R                  R                  S[!        U R                    Vs/ s H  o3S   PM	     sn5      5        U R                  R                  S	[!        U R                    Vs/ s H  o3S
   PM	     sn5      5        U R                  R                  SU5        U R                  R                  S[        U5      SS9  U R                  R                  SU R                  SS9  U R"                  (       aP  U R                  R                  SU R$                  R'                  5       R)                  5       R+                  5       5        [        U R                  5      S:  a/  U R                  R                  S[!        U R                  5      5        U R                  R-                  U R                  S9  gs  snf s  snf )z
Write log data.
Ng    eAztime/episodestensorboard)excluder   zrollout/ep_rew_meanrzrollout/ep_len_meanlztime/fpsztime/time_elapsedztime/total_timestepsz	train/stdzrollout/success_rate)rJ   )ep_info_bufferep_success_buffermaxtimetime_ns
start_timesys
float_infoepsilonrO   r   _num_timesteps_at_startloggerrecord_episode_numlenr   r:   r"   get_stdmeanitemdump)rC   time_elapsedfpsep_infos       rE   	dump_logsOffPolicyAlgorithm.dump_logs  s
    ""...%%111DLLNT__<CS^^E[E[\4%%(D(DDTU?D,=,=}Ut""#a'C0C0CA0F,G!,KKK4i]a]p]p@q]pRY]p@q6rsKK4i]a]p]p@q]pRY]p@q6rs:s+.L0A=Y143E3E}]<<KK{TZZ-?-?-A,G,G,I,N,N,PQt%%&*KK5yAWAW7XYd001 Ar@qs   JJ c                     g)z
Method called after each step in the environment.
It is meant to trigger DQN target network update
but can be used for other purposes
N )rC   s    rE   _on_stepOffPolicyAlgorithm._on_step  s     	rG   rB   r   new_obsrewardrl   infosc                    U R                   b5  U R                   R                  5       nU R                   R                  5       nOU R                  X4sU l        px[        U5      n	[        U5       H  u  pU(       d  M  Xj   R                  S5      c  M$  [        U	[        5      (       aQ  Xj   S   nU R                   b  U R                   R                  U5      nU	R                  5        H  nX   X   U
'   M     M  Xj   S   X'   U R                   c  M  U R                   R                  XSS24   5      X'   M     UR                  U R                  U	UUUU5        X0l        U R                   b  Xpl        gg)a  
Store transition in the replay buffer.
We store the normalized action and the unnormalized observation.
It also handles terminal observations (because VecEnv resets automatically).

:param replay_buffer: Replay buffer object where to store the transition.
:param buffer_action: normalized action
:param new_obs: next observation in the current episode
    or first observation of the episode (when dones is True)
:param reward: reward for the current transition
:param dones: Termination signal
:param infos: List of additional information about the transition.
    It may contain the terminal observations and information about timeout.
Nterminal_observation)_vec_normalize_envget_original_obsget_original_rewardr   _last_original_obsr   	enumerategetrL   dictunnormalize_obskeysadd)rC   rB   r   r   r   rl   r   new_obs_reward_next_obsidone	next_obs_keys                 rE   _store_transition$OffPolicyAlgorithm._store_transition  s_   0 ""...??AH--AACG :>6D#X H% !'GAt%;<Hh-- %)? @I..:$($;$;$K$KI$V	'}}+4>a(  / #((+A"BHK..:&*&=&=&M&MhZ[W[n&] (  	##	
 !"".&.# /rG   c                    U R                   R                  S5        Su  p[        U[        5      (       d   S5       eUR                  S:  d   S5       eUR
                  S:  a%  UR                  [        R                  :X  d   S5       eU R                  (       a%  U R                  R                  UR
                  5        UR                  5         Sn
[        X8U	5      (       Ga  U R                  (       aG  U R                  S:  a7  XR                  -  S:X  a%  U R                  R                  UR
                  5        U R                  XeUR
                  5      u  pUR!                  U5      u  pnnU =R"                  UR
                  -  sl        US-  nUR%                  ['        5       5        UR)                  5       (       d  [+        XR
                  -  U	SS	9$ U R-                  UU5        U R/                  XLXUU5        U R1                  U R"                  U R2                  5        U R5                  5         [7        U5       H  u  nnU(       d  M  U	S-  n	U =R8                  S-  sl        Ub.  UR
                  S:  a
  [;        U/S
9O0 nUR<                  " S0 UD6  Uc  M_  U R8                  U-  S:X  d  Mt  U R?                  5         M     [        X8U	5      (       a  GM  URA                  5         [+        XR
                  -  X5      $ )a4  
Collect experiences and store them into a ``ReplayBuffer``.

:param env: The training environment
:param callback: Callback that will be called at each step
    (and at the beginning and end of the rollout)
:param train_freq: How much experience to collect
    by doing rollouts of current policy.
    Either ``TrainFreq(<n>, TrainFrequencyUnit.STEP)``
    or ``TrainFreq(<n>, TrainFrequencyUnit.EPISODE)``
    with ``<n>`` being an integer greater than 0.
:param action_noise: Action noise that will be used for exploration
    Required for deterministic policy (e.g. TD3). This can also be used
    in addition to the stochastic policy for SAC.
:param learning_starts: Number of steps before learning for the warm-up phase.
:param replay_buffer:
:param log_interval: Log data every ``log_interval`` episodes
:return:
F)r   r   zYou must pass a VecEnvr   z,Should at least collect one step or episode.rK   z7You must use only one env when doing episodic training.T)r   )indicesr   )!r#   set_training_moderL   r   	frequencyr|   unitr   STEPr:   r"   reset_noiseon_rollout_startr   r;   r   rJ   r   update_localsr   on_stepr   _update_info_bufferr   "_update_current_progress_remaining_total_timestepsr   r   r   r   resetr   on_rollout_end)rC   r$   rs   r+   rB   r-   r'   r   num_collected_stepsnum_collected_episodesr   actionsbuffer_actionsr   rewardsrl   r   idxr   kwargss                       rE   r   #OffPolicyAlgorithm.collect_rollouts  s   < 	%%e,6:3#v&&@(@@&##a'W)WW'<<!??&8&=&==x?xx=<<JJ""3<<0!!# '
I_``|| 4 4q 8=PSgSg=gkl=l

&&s||4 '+&9&9/Y\YeYe&f#G .1XXg->*GeU#,,.1$ ""68,##%%$%8<<%GI_sxyy $$UE2 ""='TY[`a33D4F4FH]H]^ MMO&u-	T4*a/*%%*%#/8;q8Hse!4b$**4V4 $/D4E4E4TXY4Y( .E (
I_``^ 	!0<<?AWkkrG   )r   r   r-   r(   r&   r*   r,   r'   r1   r0   r#   rB   r.   r/   r)   r+   r<   )i@B d      g{Gzt?gGz?)rK   rJ   rK   NNNFrK   Nr   Nr   autoFTNFr   FTN)rH   N)T)NTrunF)N   r   TF)NrK   )Nr   N)7__name__
__module____qualname____firstlineno____doc__thnnModule__annotations__r   strtyper   r   floatr   rO   rM   r   r   r   r   r   boolr6   r   SpacerA   rQ   r`   pathlibPathioBufferedIOBaserd   rp   r   r   r}   r   r   r   rj   ndarrayr   r   r   listr   r   r   r   r   __static_attributes____classcell__)rD   s   @rE   r   r      s   1f 55<< %"2=.2<@9=&+26!$)-(."' $"!"' LP;C3c4
++,C3 63;C3 UH_-	C3
 C3 C3 C3 C3 C3 #uS#X./C3 C3 {+C3 &d<&89C3 'tCH~6C3  $C3  !C3"  S#X/#C3$ %C3& "#'C3( )C3* biin%+C3,  -C3. /C30 sm1C32 3C34 5C36  7C38 9C3: "*%V\\0BC0G*H!I;C3 C3J50)#V<uS',,@Q@Q-Q'R <W[ < $(0Cr'8'8890 !0 
	0F #'$( "3
3
  3
 "	3

 3
 3
 
sL 	!3
 3
p #' $(".$..  . 	.
 . ". . 
 .`$C $S $T $ /3	/%/% {+/% 	/%
 
rzz2::%	&/%b20?/#?/ zz?/ rzz4RZZ#889	?/
 

?/ zz?/ DcN#?/ 
?/N /3 &*^l^l ^l 	^l
 $^l {+^l ^l sm^l 
^l ^lrG   )2r  r  r   r   rz   rZ   r   typingr   r   r   r   numpyrj   torchr   	gymnasiumr   #stable_baselines3.common.base_classr	    stable_baselines3.common.buffersr
   r   r   "stable_baselines3.common.callbacksr   stable_baselines3.common.noiser   r   !stable_baselines3.common.policiesr   "stable_baselines3.common.save_utilr   r   %stable_baselines3.common.type_aliasesr   r   r   r   r   r   stable_baselines3.common.utilsr   r    stable_baselines3.common.vec_envr   'stable_baselines3.her.her_replay_bufferr   r   r   r   rG   rE   <module>r     si    	  
    0 0    = ^ ^ ; M 8 I   O 3 C !9AUV D	l D	lrG   