
    h                     B   S SK r S SKJrJr  S SKJr  S SKJrJrJ	r	  S SK
rS SKrS SKJr  S SKJrJr  S SKJrJrJrJr  S SKJr  S S	KJr   S SKr " S
 S\5      r " S S\5      r " S S\5      r  " S S\5      r! " S S\ 5      r" " S S\5      r#g! \ a    Sr NMf = f)    N)ABCabstractmethod)	Generator)AnyOptionalUnion)spaces)get_action_dimget_obs_shape)DictReplayBufferSamplesDictRolloutBufferSamplesReplayBufferSamplesRolloutBufferSamples)
get_device)VecNormalizec                     ^  \ rS rSr% Sr\R                  \S'   \\	S4   \S'     SS\	S\R                  S\R                  S\
\R                  \4   S	\	4
U 4S
 jjjr\S\R"                  S\R"                  4S j5       rS\	4S jrS S jrS S jrS S jrS!S\	S\\   4S jjr\ S!S\R"                  S\\   S\
\\4   4S jj5       rS"S\R"                  S\S\R>                  4S jjr \ S!S\
\R"                  \!\\R"                  4   4   S\\   S\
\R"                  \!\\R"                  4   4   4S jj5       r"\S!S\R"                  S\\   S\R"                  4S jj5       r#Sr$U =r%$ )#
BaseBuffer   a6  
Base class that represent a buffer (rollout or replay)

:param buffer_size: Max number of element in the buffer
:param observation_space: Observation space
:param action_space: Action space
:param device: PyTorch device
    to which the values will be converted
:param n_envs: Number of parallel environments
observation_space.	obs_shapebuffer_sizeaction_spacedevicen_envsc                    > [         TU ]  5         Xl        X l        X0l        [        U5      U l        [        U5      U l        SU l	        SU l
        [        U5      U l        XPl        g )Nr   F)super__init__r   r   r   r   r   r
   
action_dimposfullr   r   r   )selfr   r   r   r   r   	__class__s         Z/home/james-whalen/.local/lib/python3.13/site-packages/stable_baselines3/common/buffers.pyr   BaseBuffer.__init__*   sZ     	&!2(&'89(6	 (    arrreturnc                     U R                   n[        U5      S:  a  / UQSP7nU R                  SS5      R                  " US   US   -  /USS Q76 $ )z
Swap and then flatten axes 0 (buffer_size) and 1 (n_envs)
to convert shape from [n_steps, n_envs, ...] (when ... is the shape of the features)
to [n_steps * n_envs, ...] (which maintain the order)

:param arr:
:return:
      r      N)shapelenswapaxesreshape)r&   r,   s     r#   swap_and_flattenBaseBuffer.swap_and_flatten>   sY     		u:>eKQKE||Aq!))%(U1X*=Jab	JJr%   c                 T    U R                   (       a  U R                  $ U R                  $ )z)
:return: The current size of the buffer
)r    r   r   r!   s    r#   sizeBaseBuffer.sizeM   s      99###xxr%   c                     [        5       e)z
Add elements to the buffer.
NotImplementedError)r!   argskwargss      r#   addBaseBuffer.addU   s     "##r%   c                 @    [        U6  H  nU R                  " U6   M     g)z.
Add a new batch of transitions to the buffer
N)zipr;   )r!   r9   r:   datas       r#   extendBaseBuffer.extend[   s    
 JDHHdO r%   c                      SU l         SU l        g)z
Reset the buffer.
r   FN)r   r    r3   s    r#   resetBaseBuffer.resetc   s     	r%   
batch_sizeenvc                     U R                   (       a  U R                  OU R                  n[        R                  R                  SX1S9nU R                  XBS9$ )z
:param batch_size: Number of element to sample
:param env: associated gym VecEnv
    to normalize the observations/rewards when sampling
:return:
r   r4   rF   )r    r   r   nprandomrandint_get_samples)r!   rE   rF   upper_bound
batch_indss        r#   sampleBaseBuffer.samplej   sH     +/))d&&YY&&q+&G
   55r%   rO   c                     [        5       e)z)
:param batch_inds:
:param env:
:return:
r7   )r!   rO   rF   s      r#   rM   BaseBuffer._get_samplesu   s     "##r%   arraycopyc                     U(       a  [         R                  " XR                  S9$ [         R                  " XR                  S9$ )a  
Convert a numpy array to a PyTorch tensor.
Note: it copies the data by default

:param array:
:param copy: Whether to copy or not the data (may be useful to avoid changing things
    by reference). This argument is inoperative if the device is not the CPU.
:return:
)r   )thtensorr   	as_tensor)r!   rT   rU   s      r#   to_torchBaseBuffer.to_torch   s-     99U;;77||E++66r%   obsc                 .    Ub  UR                  U 5      $ U $ N)normalize_obs)r\   rF   s     r#   _normalize_obsBaseBuffer._normalize_obs   s    
 ?$$S))
r%   rewardc                 h    Ub.  UR                  U 5      R                  [        R                  5      $ U $ r^   )normalize_rewardastyperJ   float32)rb   rF   s     r#   _normalize_rewardBaseBuffer._normalize_reward   s,    ?''/66rzzBBr%   )	r   r   r   r   r    r   r   r   r   )autor*   r'   Nr^   )T)&__name__
__module____qualname____firstlineno____doc__r	   Space__annotations__tupleintr   rW   r   strr   staticmethodrJ   ndarrayr0   r4   r;   r@   rC   r   r   rP   r   r   r   rM   boolTensorrZ   dictr`   rg   __static_attributes____classcell__r"   s   @r#   r   r      s   	 ||#S#X )/ "<< ll	
 biin%  ( Kbjj KRZZ K Kc $	6 	68L+A 	6 DH$**$+3L+A$	"$88	9$ $7bjj 7 7		 7  '+2::tCO445l# 
rzz4RZZ00	1  "** 8L3I UWU_U_  r%   r   c                     ^  \ rS rSr% Sr\R                  \S'   \R                  \S'   \R                  \S'   \R                  \S'   \R                  \S'   \R                  \S'       S"S	\S
\	R                  S\	R                  S\\R                  \4   S\S\S\4U 4S jjjrS\R                  S\R                  S\R                  S\R                  S\R                  S\\\\4      SS4S jrS#S\S\\   S\4U 4S jjjrS#S\R                  S\\   S\4S jjr\S\R6                  R8                  S\R6                  R8                  4S  j5       rS!rU =r$ )$ReplayBuffer   as  
Replay buffer used in off-policy algorithms like SAC/TD3.

:param buffer_size: Max number of element in the buffer
:param observation_space: Observation space
:param action_space: Action space
:param device: PyTorch device
:param n_envs: Number of parallel environments
:param optimize_memory_usage: Enable a memory efficient variant
    of the replay buffer which reduces by almost a factor two the memory used,
    at a cost of more complexity.
    See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
    and https://github.com/DLR-RM/stable-baselines3/pull/28#issuecomment-637559274
    Cannot be used in combination with handle_timeout_termination.
:param handle_timeout_termination: Handle timeout termination (due to timelimit)
    separately and treat the task as infinite horizon task.
    https://github.com/DLR-RM/stable-baselines3/issues/284
observationsnext_observationsactionsrewardsdonestimeoutsr   r   r   r   r   optimize_memory_usagehandle_timeout_terminationc                   > [         T
U ]  XX4US9  [        X-  S5      U l        [        b  [        R
                  " 5       R                  nU(       a  U(       a  [        S5      eX`l        [        R                  " U R                  U R                  /U R                  Q7UR                  S9U l        U(       dG  [        R                  " U R                  U R                  /U R                  Q7UR                  S9U l        [        R                  " U R                  U R                  U R                   4U R#                  UR                  5      S9U l        [        R                  " U R                  U R                  4[        R&                  S9U l        [        R                  " U R                  U R                  4[        R&                  S9U l        Xpl        [        R                  " U R                  U R                  4[        R&                  S9U l        [        b  U R                  R0                  U R$                  R0                  -   U R(                  R0                  -   U R*                  R0                  -   n	U(       d  XR                  R0                  -  n	U	W:  a*  U	S-  n	US-  n[2        R4                  " SU	S SUS S	35        g g g )
Nr   r*   zpReplayBuffer does not support optimize_memory_usage = True and handle_timeout_termination = True simultaneously.dtype    eAWThis system does not have apparently enough memory to store the complete replay buffer .2fGB > GB)r   r   maxr   psutilvirtual_memory	available
ValueErrorr   rJ   zerosr   r   r   r   r   r   _maybe_cast_dtyper   rf   r   r   r   r   nbyteswarningswarn)r!   r   r   r   r   r   r   r   mem_availabletotal_memory_usager"   s             r#   r   ReplayBuffer.__init__   sA    	V\] {4a8 "113==M !%?H  &;"HHd&6&6%Udnn%U]n]t]tu$%'XXt/?/?.^t~~.^fwf}f}%~D"xxt{{DOO<DDZDZ[g[m[mDn
 xx!1!14;; ?rzzRXXt//=RZZP
 +E'$"2"2DKK!@

S!!((4<<+>+>>ATATTW[WaWaWhWhh  )"&<&<&C&CC"!M1"c)"$%%7$<E-PSATTVX	 2 r%   r\   next_obsactionrb   doneinfosr'   Nc           	         [        U R                  [        R                  5      (       aR  UR	                  U R
                  /U R                  Q75      nUR	                  U R
                  /U R                  Q75      nUR	                  U R
                  U R                  45      n[        R                  " U5      U R                  U R                  '   U R                  (       a>  [        R                  " U5      U R                  U R                  S-   U R                  -  '   O-[        R                  " U5      U R                  U R                  '   [        R                  " U5      U R                  U R                  '   [        R                  " U5      U R                   U R                  '   [        R                  " U5      U R"                  U R                  '   U R$                  (       aM  [        R                  " U Vs/ s H  owR'                  SS5      PM     sn5      U R(                  U R                  '   U =R                  S-  sl        U R                  U R                  :X  a  SU l        SU l        g g s  snf )Nr*   TimeLimit.truncatedFTr   )
isinstancer   r	   Discreter/   r   r   r   rJ   rT   r   r   r   r   r   r   r   r   r   getr   r    )r!   r\   r   r   rb   r   r   infos           r#   r;   ReplayBuffer.add   s    d,,foo>>++t{{<T^^<=C''(Ft~~(FGH doo >? ')hhsm$((#%%CE88HCUDtxx!|t/?/??@/1xx/AD""488,!#&!1TXX!#&!1TXX!xx~

488**&(hhch/ich[_9NPU0Vch/i&jDMM$((#A88t'''DIDH ( 0js   8I0rE   rF   c                 Z  > U R                   (       d  [        TU ]	  XS9$ U R                  (       aD  [        R
                  R                  SU R                  US9U R                  -   U R                  -  nO)[        R
                  R                  SU R                  US9nU R                  X2S9$ )az  
Sample elements from the replay buffer.
Custom sampling when using memory efficient variant,
as we should not sample the element with index `self.pos`
See https://github.com/DLR-RM/stable-baselines3/pull/28#issuecomment-637559274

:param batch_size: Number of element to sample
:param env: associated gym VecEnv
    to normalize the observations/rewards when sampling
:return:
rE   rF   r*   rH   r   rI   )
r   r   rP   r    rJ   rK   rL   r   r   rM   )r!   rE   rF   rO   r"   s       r#   rP   ReplayBuffer.sample  s     ))7>Z>AA 99))++At/?/?j+QTXT\T\\`d`p`ppJ**1dhhZ*HJ   55r%   rO   c           
         [         R                  R                  SU R                  [	        U5      4S9nU R
                  (       a5  U R                  U R                  US-   U R                  -  US S 24   U5      nO#U R                  U R                  XS S 24   U5      nU R                  U R                  XS S 24   U5      U R                  XS S 24   UU R                  X4   SU R                  X4   -
  -  R                  SS5      U R                  U R                  X4   R                  SS5      U5      4n[!        [#        [%        U R&                  U5      5      6 $ )Nr   highr4   r*   )rJ   rK   rL   r   r-   r   r`   r   r   r   r   r   r   r/   rg   r   r   rr   maprZ   )r!   rO   rF   env_indicesr   r?   s         r#   rM   ReplayBuffer._get_samples3  sH   ii''3z?BT'U%%**4+<+<j1nPTP`P`=`bmop=p+qsvwH**4+A+A*[\B\+]_bcH  1 1*12L MsSLL!34 ZZ
/0AjF]8^4^_hhikmno""4<<
0G#H#P#PQSUV#WY\]
 #E#dmmT*B$CDDr%   r   c                 N    U [         R                  :X  a  [         R                  $ U $ )z
Cast `np.float64` action datatype to `np.float32`,
keep the others dtype unchanged.
See GH#1572 for more information.

:param dtype: The original action space dtype
:return: ``np.float32`` if the dtype was float64,
    the original dtype otherwise.
)rJ   float64rf   r   s    r#   r   ReplayBuffer._maybe_cast_dtypeG  s     BJJ::r%   r   r   r   r    r   r   r   r   r   r   r   ri   r*   FTr^   ) rk   rl   rm   rn   ro   rJ   rv   rq   rs   r	   rp   r   rW   r   rt   rw   r   listry   r   r;   r   r   r   rP   rM   ru   typing	DTypeLiker   rz   r{   r|   s   @r#   r~   r~      s   & **zz!ZZZZ::jj )/&++/<< "<<< ll	<
 biin%< <  $< %)< <|$ZZ$ **$ 

	$
 

$ jj$ DcN#$ 
$L6 68L+A 6M` 6 6,Erzz E8N EZm E( !4!4 9L9L  r%   r~   c                     ^  \ rS rSr% Sr\R                  \S'   \R                  \S'   \R                  \S'   \R                  \S'   \R                  \S'   \R                  \S'   \R                  \S	'   \R                  \S
'       S&S\S\	R                  S\	R                  S\\R                  \4   S\S\S\4U 4S jjjrS'U 4S jjrS\R$                  S\R                  SS4S jrS\R                  S\R                  S\R                  S\R                  S\R$                  S\R$                  SS4S jrS(S \\   S\\SS4   4S! jjr S(S"\R                  S#\\   S\4S$ jjrS%rU =r$ ))RolloutBufferiW  a  
Rollout buffer used in on-policy algorithms like A2C/PPO.
It corresponds to ``buffer_size`` transitions collected
using the current policy.
This experience will be discarded after the policy update.
In order to use PPO objective, we also store the current value of each state
and the log probability of each taken action.

The term rollout here refers to the model-free notion and should not
be used with the concept of rollout used in model-based RL or planning.
Hence, it is only involved in policy and value function training but not action selection.

:param buffer_size: Max number of element in the buffer
:param observation_space: Observation space
:param action_space: Action space
:param device: PyTorch device
:param gae_lambda: Factor for trade-off of bias vs variance for Generalized Advantage Estimator
    Equivalent to classic advantage when set to 1.
:param gamma: Discount factor
:param n_envs: Number of parallel environments
r   r   r   
advantagesreturnsepisode_starts	log_probsvaluesr   r   r   r   
gae_lambdagammar   c                 j   > [         TU ]  XX4US9  XPl        X`l        SU l        U R                  5         g )Nr   F)r   r   r   r   generator_readyrC   	r!   r   r   r   r   r   r   r   r"   s	           r#   r   RolloutBuffer.__init__w  s5     	V\]$
$

r%   r'   Nc                 @  > [         R                  " U R                  U R                  /U R                  Q7[         R
                  S9U l        [         R                  " U R                  U R                  U R                  4[         R
                  S9U l        [         R                  " U R                  U R                  4[         R
                  S9U l	        [         R                  " U R                  U R                  4[         R
                  S9U l
        [         R                  " U R                  U R                  4[         R
                  S9U l        [         R                  " U R                  U R                  4[         R
                  S9U l        [         R                  " U R                  U R                  4[         R
                  S9U l        [         R                  " U R                  U R                  4[         R
                  S9U l        SU l        [         TU ]E  5         g Nr   F)rJ   r   r   r   r   rf   r   r   r   r   r   r   r   r   r   r   r   rC   )r!   r"   s    r#   rC   RolloutBuffer.reset  sS   HHd&6&6%Udnn%U]_]g]ghxx!1!14;; PXZXbXbcxx!1!14;; ?rzzRxx!1!14;; ?rzzR hh(8(8$++'FbjjYhh 0 0$++>bjjQ4#3#3T[["AT((D$4$4dkk#B"**U$r%   last_valuesr   c                    UR                  5       R                  5       R                  5       R                  5       nSn[	        [        U R                  5      5       H  nX@R                  S-
  :X  a%  SUR                  [        R                  5      -
  nUnO'SU R                  US-      -
  nU R                  US-      nU R                  U   U R                  U-  U-  -   U R                  U   -
  nXpR                  U R                  -  U-  U-  -   nX0R                  U'   M     U R                  U R                  -   U l        g)aR  
Post-processing step: compute the lambda-return (TD(lambda) estimate)
and GAE(lambda) advantage.

Uses Generalized Advantage Estimation (https://arxiv.org/abs/1506.02438)
to compute the advantage. To obtain Monte-Carlo advantage estimate (A(s) = R - V(S))
where R is the sum of discounted reward with value bootstrap
(because we don't always have full episode), set ``gae_lambda=1.0`` during initialization.

The TD(lambda) estimator has also two special cases:
- TD(1) is Monte-Carlo estimate (sum of discounted rewards)
- TD(0) is one-step estimate with bootstrapping (r_t + gamma * v(s_{t+1}))

For more information, see discussion in https://github.com/DLR-RM/stable-baselines3/pull/375.

:param last_values: state value estimation for the last step (one for each env)
:param dones: if the last step was a terminal step (one bool for each env).
r   r*         ?N)clonecpunumpyflattenreversedranger   re   rJ   rf   r   r   r   r   r   r   r   )r!   r   r   last_gae_lamstepnext_non_terminalnext_valuesdeltas           r#   compute_returns_and_advantage+RolloutBuffer.compute_returns_and_advantage  s   ( "'')--/557??AU4#3#345D''!++$'%,,rzz*B$B!)$'$*=*=dQh*G$G!"kk$(3LL&k)ADU)UUX\XcXcdhXiiE ::#?BS#SVb#bbL$0OOD! 6 4r%   r\   r   rb   episode_startvaluelog_probc                 J   [        UR                  5      S:X  a  UR                  SS5      n[        U R                  [
        R                  5      (       a)  UR                  U R                  /U R                  Q75      nUR                  U R                  U R                  45      n[        R                  " U5      U R                  U R                  '   [        R                  " U5      U R                  U R                  '   [        R                  " U5      U R                  U R                  '   [        R                  " U5      U R                   U R                  '   UR#                  5       R%                  5       R'                  5       R)                  5       U R*                  U R                  '   UR#                  5       R%                  5       R'                  5       U R,                  U R                  '   U =R                  S-  sl        U R                  U R.                  :X  a  SU l        gga  
:param obs: Observation
:param action: Action
:param reward:
:param episode_start: Start of episode signal.
:param value: estimated value of the current state
    following the current policy.
:param log_prob: log probability of the action
    following the current policy.
r   r   r*   TN)r-   r,   r/   r   r   r	   r   r   r   r   rJ   rT   r   r   r   r   r   r   r   r   r   r   r   r   r    )r!   r\   r   rb   r   r   r   s          r#   r;   RolloutBuffer.add  ss   & x~~!#''A.H d,,foo>>++t{{<T^^<=C doo >?&(hhsm$((#!#&!1TXX!#&!1TXX(*(?DHH% % 1 1 3 9 9 ; C C EDHH#+>>#3#7#7#9#?#?#Atxx A88t'''DI (r%   rE   c              #   0  #    U R                   (       d   S5       e[        R                  R                  U R                  U R
                  -  5      nU R                  (       d?  / SQnU H.  nU R                  U R                  U   5      U R                  U'   M0     SU l        Uc  U R                  U R
                  -  nSnXPR                  U R
                  -  :  a:  U R                  X%XQ-    5      v   XQ-  nXPR                  U R
                  -  :  a  M9  g g 7f)N )r   r   r   r   r   r   Tr   )
r    rJ   rK   permutationr   r   r   r0   __dict__rM   )r!   rE   indices_tensor_namesrX   	start_idxs         r#   r   RolloutBuffer.get  s     yy"y))''(8(84;;(FG##M ((,(=(=dmmF>S(Tf% (#'D  ))DKK7J	**T[[88##G	8N$OPP#I **T[[88s   DDDrO   rF   c                 h   U R                   U   U R                  U   U R                  U   R                  5       U R                  U   R                  5       U R
                  U   R                  5       U R                  U   R                  5       4n[        [        [        U R                  U5      5      6 $ r^   )r   r   r   r   r   r   r   r   rr   r   rZ   )r!   rO   rF   r?   s       r#   rM   RolloutBuffer._get_samples  s     j)LL$KK
#++-NN:&..0OOJ'//1LL$,,.
 $U3t}}d+C%DEEr%   r   r   r   r    r   r   r   r   r   r   r   r   ri   r*   Gz?r*   rj   r^   )rk   rl   rm   rn   ro   rJ   rv   rq   rs   r	   rp   r   rW   r   rt   floatr   rC   rx   r   r;   r   r   r   r   r   rM   rz   r{   r|   s   @r#   r   r   W  s   , **ZZZZ

ZZJJzzJJ )/ "<< ll	
 biin%     
#5 #52:: #5Z^ #5J'ZZ' 

' 

	'
 zz' yy' ))' 
'R$hsm $yAUW[]aAa7b $< '+FJJF l#F 
	F Fr%   r   c                   8  ^  \ rS rSr% Sr\R                  \S'   \\	\
\S4   4   \S'   \\	\R                  4   \S'   \\	\R                  4   \S'       SS\S\R                  S	\R                  S
\\R"                  \	4   S\S\S\4U 4S jjjrS\\	\R                  4   S\\	\R                  4   S\R                  S\R                  S\R                  S\\\	\4      SS4S jr SS\S\\   S\4U 4S jjjr SS\R                  S\\   S\4S jjrSrU =r$ ) DictReplayBufferi  a  
Dict Replay buffer used in off-policy algorithms like SAC/TD3.
Extends the ReplayBuffer to use dictionary observations

:param buffer_size: Max number of element in the buffer
:param observation_space: Observation space
:param action_space: Action space
:param device: PyTorch device
:param n_envs: Number of parallel environments
:param optimize_memory_usage: Enable a memory efficient variant
    Disabled for now (see https://github.com/DLR-RM/stable-baselines3/pull/243#discussion_r531535702)
:param handle_timeout_termination: Handle timeout termination (due to timelimit)
    separately and treat the task as infinite horizon task.
    https://github.com/DLR-RM/stable-baselines3/issues/284
r   .r   r   r   r   r   r   r   r   r   c           
        > [         [        U ]  XX4US9  [        U R                  [
        5      (       d   S5       e[        X-  S5      U l        [        b  [        R                  " 5       R                  nU(       a   S5       eX`l        U R                  R                  5        V	V
s0 s H@  u  pU	[        R                  " U R                  U R                  /U
Q7X)   R                   S9_MB     sn
n	U l        U R                  R                  5        V	V
s0 s H@  u  pU	[        R                  " U R                  U R                  /U
Q7X)   R                   S9_MB     sn
n	U l        [        R                  " U R                  U R                  U R&                  4U R)                  UR                   5      S9U l        [        R                  " U R                  U R                  4[        R,                  S9U l        [        R                  " U R                  U R                  4[        R,                  S9U l        Xpl        [        R                  " U R                  U R                  4[        R,                  S9U l        [        b  SnU R"                  R                  5        H  u  pXR6                  -  nM     XR*                  R6                  -   U R.                  R6                  -   U R0                  R6                  -   nU(       d7  SnU R"                  R                  5        H  u  pXR6                  -  nM     X-  nUW:  a*  US-  nUS-  n[8        R:                  " SUS	 S
US	 S35        g g g s  sn
n	f s  sn
n	f )Nr   z6DictReplayBuffer must be used with Dict obs space onlyr*   z7DictReplayBuffer does not support optimize_memory_usager   r   r   r   r   r   r   )r   r~   r   r   r   ry   r   r   r   r   r   r   itemsrJ   r   r   r   r   r   r   r   r   rf   r   r   r   r   r   r   r   )r!   r   r   r   r   r   r   r   r   key
_obs_shape
obs_nbytes_r\   r   next_obs_nbytesr"   s                   r#   r   DictReplayBuffer.__init__"  s    	lD*;<hn*o$..$//i1ii/{4a8 "113==M(c*cc( &;" $(>>#7#7#9
#9 4++T[[F:FN_NdNjNjkk#9
 $(>>#7#7#9"
#9 4++T[[F:FN_NdNjNjkk#9"

 xxt{{DOO<DDZDZ[g[m[mDn
 xx!1!14;; ?rzzRXXt//=RZZP
 +E'$"2"2DKK!@

SJ++113jj(
 4 )3\\5H5H(H4<<K^K^(^aeakakarar(r("#"//557FA#zz1O 8"5"!M1"c)"$%%7$<E-PSATTVX	 2 )
"
s   %AM&AM,r\   r   r   rb   r   r   r'   Nc           	         U R                   R                  5        H  n[        U R                  R                  U   [        R
                  5      (       a0  X   R                  U R                  4U R                  U   -   5      X'   [        R                  " X   5      U R                   U   U R                  '   M     U R                  R                  5        H  n[        U R                  R                  U   [        R
                  5      (       a0  X'   R                  U R                  4U R                  U   -   5      X''   [        R                  " X'   5      U R                  U   U R                  '   M     UR                  U R                  U R                  45      n[        R                  " U5      U R                  U R                  '   [        R                  " U5      U R                  U R                  '   [        R                  " U5      U R                   U R                  '   U R"                  (       aM  [        R                  " U Vs/ s H  oR%                  SS5      PM     sn5      U R&                  U R                  '   U =R                  S-  sl        U R                  U R(                  :X  a  SU l        SU l        g g s  snf )Nr   Fr*   Tr   )r   keysr   r   r	   r   r/   r   r   rJ   rT   r   r   r   r   r   r   r   r   r   r   r    )	r!   r\   r   r   rb   r   r   r   r   s	            r#   r;   DictReplayBuffer.addc  s    $$))+C $0077<fooNN8++T[[NT^^C=P,PQ/1xx/ADc"488, , ))..0C$0077<fooNN ( 5 5t{{nt~~VYGZ6Z [46HHX]4KD""3'1 1 doo >?!#&!1TXX!#&!1TXX!xx~

488**&(hhch/ich[_9NPU0Vch/i&jDMM$((#A88t'''DIDH ( 0js   J>rE   rF   c                 &   > [         [        U ]  XS9$ )z
Sample elements from the replay buffer.

:param batch_size: Number of element to sample
:param env: associated gym VecEnv
    to normalize the observations/rewards when sampling
:return:
r   )r   r~   rP   )r!   rE   rF   r"   s      r#   rP   DictReplayBuffer.sample  s     \4/:/OOr%   rO   c                 (   [         R                  R                  SU R                  [	        U5      4S9nU R                  U R                  R                  5        VVs0 s H  u  pEXEXS S 24   _M     snnU5      nU R                  U R                  R                  5        VVs0 s H  u  pEXEXS S 24   _M     snnU5      n[        U[        5      (       d   e[        U[        5      (       d   eUR                  5        VVs0 s H  u  pEX@R                  U5      _M     nnnUR                  5        VVs0 s H  u  pEX@R                  U5      _M     n	nn[        UU R                  U R                  X4   5      U	U R                  U R                  X4   SU R                  X4   -
  -  5      R!                  SS5      U R                  U R#                  U R$                  X4   R!                  SS5      U5      5      S9$ s  snnf s  snnf s  snnf s  snnf )Nr   r   r*   r   )r   r   r   r   r   )rJ   rK   rL   r   r-   r`   r   r   r   r   ry   rZ   r   r   r   r   r/   rg   r   )
r!   rO   rF   r   r   r\   obs_	next_obs_r   r   s
             r#   rM   DictReplayBuffer._get_samples  s    ii''3z?BT'U ""Y]YjYjYpYpYr#sYrXSCZa-G)H$HYr#suxy''BFBXBXB^B^B`aB`hcSjq011B`acf
	 $%%%%)T****@D

MHC]]3//MEN__EVWEVS--"44EVW&%MM$,,z/F"GH/ --

:+B Cq4==YcYpKqGq rs{{A MM$"8"8jF]9^9f9fgikl9mor"st

 
	
 $ta NWs   G<
"H
H4Hr   r   r^   )rk   rl   rm   rn   ro   r	   Dictrq   ry   rt   rr   rs   rJ   rv   rp   r   rW   r   rw   r   r   r   r;   r   r   r   rP   rM   rz   r{   r|   s   @r#   r   r     s     {{"CsCx())sBJJ''CO,, )/&++/?? ";;? ll	?
 biin%? ?  $? %)? ?B##rzz/"# sBJJ'# 

	#
 

# jj# DcN## 
#P '+PP l#P 
!	P P$ '+
JJ
 l#
 
!	
 
r%   r   c                     ^  \ rS rSr% Sr\R                  \S'   \\	\
\S4   4   \S'   \\	\R                  4   \S'       SS\S\R                  S\R                  S	\\R"                  \	4   S
\S\S\4U 4S jjjrSU 4S jjrS\\	\R                  4   S\R                  S\R                  S\R                  S\R*                  S\R*                  SS4S jr S S\\   S\\SS4   4S jjr S S\R                  S\\   S\4S jjrSrU =r$ )!DictRolloutBufferi  a  
Dict Rollout buffer used in on-policy algorithms like A2C/PPO.
Extends the RolloutBuffer to use dictionary observations

It corresponds to ``buffer_size`` transitions collected
using the current policy.
This experience will be discarded after the policy update.
In order to use PPO objective, we also store the current value of each state
and the log probability of each taken action.

The term rollout here refers to the model-free notion and should not
be used with the concept of rollout used in model-based RL or planning.
Hence, it is only involved in policy and value function training but not action selection.

:param buffer_size: Max number of element in the buffer
:param observation_space: Observation space
:param action_space: Action space
:param device: PyTorch device
:param gae_lambda: Factor for trade-off of bias vs variance for Generalized Advantage Estimator
    Equivalent to Monte-Carlo advantage estimate when set to 1.
:param gamma: Discount factor
:param n_envs: Number of parallel environments
r   .r   r   r   r   r   r   r   r   c                    > [         [        U ]  XX4US9  [        U R                  [
        5      (       d   S5       eXPl        X`l        SU l        U R                  5         g )Nr   z7DictRolloutBuffer must be used with Dict obs space onlyF)
r   r   r   r   r   ry   r   r   r   rC   r   s	           r#   r   DictRolloutBuffer.__init__  sT     	mT+KLio+p$..$//j1jj/$
$

r%   r'   Nc                   > 0 U l         U R                  R                  5        HN  u  p[        R                  " U R
                  U R                  /UQ7[        R                  S9U R                   U'   MP     [        R                  " U R
                  U R                  U R                  4[        R                  S9U l	        [        R                  " U R
                  U R                  4[        R                  S9U l
        [        R                  " U R
                  U R                  4[        R                  S9U l        [        R                  " U R
                  U R                  4[        R                  S9U l        [        R                  " U R
                  U R                  4[        R                  S9U l        [        R                  " U R
                  U R                  4[        R                  S9U l        [        R                  " U R
                  U R                  4[        R                  S9U l        SU l        ["        [$        U ]O  5         g r   )r   r   r   rJ   r   r   r   rf   r   r   r   r   r   r   r   r   r   r   r   rC   )r!   r   obs_input_shaper"   s      r#   rC   DictRolloutBuffer.reset  s}   $(NN$8$8$: C%'XXt/?/?._._gigqgq%rDc" %;xx!1!14;; PXZXbXbcxx!1!14;; ?rzzRxx!1!14;; ?rzzR hh(8(8$++'FbjjYhh 0 0$++>bjjQ4#3#3T[["AT((D$4$4dkk#B"**U$mT(*r%   r\   r   rb   r   r   r   c                    [        UR                  5      S:X  a  UR                  SS5      nU R                  R	                  5        H  n[
        R                  " X   5      n[        U R                  R                  U   [        R                  5      (       a,  UR                  U R                  4U R                  U   -   5      nXR                  U   U R                  '   M     UR                  U R                  U R                  45      n[
        R                  " U5      U R                  U R                  '   [
        R                  " U5      U R                   U R                  '   [
        R                  " U5      U R"                  U R                  '   UR%                  5       R'                  5       R)                  5       R+                  5       U R,                  U R                  '   UR%                  5       R'                  5       R)                  5       U R.                  U R                  '   U =R                  S-  sl        U R                  U R0                  :X  a  SU l        ggr   )r-   r,   r/   r   r   rJ   rT   r   r   r	   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    )	r!   r\   r   rb   r   r   r   r   r   s	            r#   r;   DictRolloutBuffer.add  s   & x~~!#''A.H$$))+C88CH%D $0077<fooNN||T[[NT^^C5H$HI/3c"488, , doo >?!#&!1TXX!#&!1TXX(*(?DHH% % 1 1 3 9 9 ; C C EDHH#+>>#3#7#7#9#?#?#Atxx A88t'''DI (r%   rE   c              #     #    U R                   (       d   S5       e[        R                  R                  U R                  U R
                  -  5      nU R                  (       d  U R                  R                  5        H#  u  p4U R                  U5      U R                  U'   M%     / SQnU H.  nU R                  U R                  U   5      U R                  U'   M0     SU l        Uc  U R                  U R
                  -  nSnXpR                  U R
                  -  :  a:  U R                  X'Xq-    5      v   Xq-  nXpR                  U R
                  -  :  a  M9  g g 7f)Nr   )r   r   r   r   r   Tr   )r    rJ   rK   r   r   r   r   r   r   r0   r   rM   )r!   rE   r   r   r\   r   rX   r   s           r#   r   DictRolloutBuffer.get"  s$     yy"y))''(8(84;;(FG## --335)-)>)>s)C!!#& 6 XM'(,(=(=dmmF>S(Tf% (#'D  ))DKK7J	**T[[88##G	8N$OPP#I **T[[88s   EEErO   rF   c                 .   [        U R                  R                  5        VVs0 s H  u  p4X0R                  XA   5      _M     snnU R                  U R                  U   5      U R                  U R
                  U   R                  5       5      U R                  U R                  U   R                  5       5      U R                  U R                  U   R                  5       5      U R                  U R                  U   R                  5       5      S9$ s  snnf )N)r   r   
old_valuesold_log_probr   r   )
r   r   r   rZ   r   r   r   r   r   r   )r!   rO   rF   r   r\   s        r#   rM   DictRolloutBuffer._get_samples<  s    
 (PTPaPaPgPgPijPi*3#}}S_==PijMM$,,z":;}}T[[%<%D%D%FGt~~j'A'I'I'KL}}T__Z%@%H%H%JKMM$,,z":"B"B"DE
 	
js   D
r   r   rj   r^   )rk   rl   rm   rn   ro   r	   r  rq   ry   rt   rr   rs   rJ   rv   rp   r   rW   r   r   r   rC   rx   r;   r   r   r   r   r   rM   rz   r{   r|   s   @r#   r  r    s   0 {{"CsCx())sBJJ'' )/ ";; ll	
 biin%    (+)#rzz/") 

) 

	)
 zz) yy) ))) 
)Z %)$SM$ 
+T47	8$: '+
JJ
 l#
 
"	
 
r%   r  c                   v   ^  \ rS rSrSrSSS.S\S\4U 4S jjjrSS	\R                  S
\
\   S\4S jjrSrU =r$ )NStepReplayBufferiK  a  
Replay buffer used for computing n-step returns in off-policy algorithms like SAC/DQN.

The n-step return combines multiple steps of future rewards,
discounted by the discount factor gamma.
This can help improve sample efficiency and credit assignment.

This implementation uses the same storage space as a normal replay buffer,
and NumPy vectorized operations at sampling time to efficiently compute the
n-step return, without requiring extra memory.

This implementation is inspired by:
- https://github.com/younggyoseo/FastTD3
- https://github.com/DLR-RM/stable-baselines3/pull/81

It avoids potential issues such as:
- https://github.com/younggyoseo/FastTD3/issues/6

:param buffer_size: Max number of element in the buffer
:param observation_space: Observation space
:param action_space: Action space
:param device: PyTorch device
:param n_envs: Number of parallel environments
:param optimize_memory_usage: Not supported
:param handle_timeout_termination: Handle timeout termination (due to timelimit)
    separately and treat the task as infinite horizon task.
    https://github.com/DLR-RM/stable-baselines3/issues/284
:param n_steps: Number of steps to accumulate rewards for n-step returns
:param gamma: Discount factor for future rewards
r)   r   )n_stepsr   r  r   c                v   > [         TU ]  " U0 UD6  Xl        X l        U R                  (       a  [        S5      eg )Nz<NStepReplayBuffer doesn't support optimize_memory_usage=True)r   r   r  r   r   r8   )r!   r  r   r9   r:   r"   s        r#   r   NStepReplayBuffer.__init__k  s9    $)&)
%%%&dee &r%   rO   rF   r'   c           
      v   [         R                  R                  SU R                  UR                  S9nU R
                  S-
  nU R                  U   R                  5       n[         R                  " U[         R                  " U R                  U   5      5      U R                  U'   [         R                  " U R                  5      R                  SS5      nUSS2S4   U-   U R                  -  nU R                  U R                   XsSS2S4   4   U5      nU R                  XsSS2S4   4   n	U R                  XsSS2S4   4   n
[         R                  " X5      nUR#                  SS9nUR%                  SS9n[         R&                  " XU R                  S-
  5      n[         R                  " U R                  5      R                  SS5      USS2S4   :*  nU R(                  UR+                  SSS9R-                  [         R.                  5      -  nU R(                  [         R                  " U R                  [         R.                  S	9R                  SS5      -  nUU-  U-  nUR+                  SSS9nX-   U R                  -  nU R1                  U R2                  UU4   U5      nU R                  UU4   SS2S4   R-                  [         R.                  5      nU R                  UU4   SS2S4   R-                  [         R.                  5      nUS
U-
  -  nXPR                  U'   U R1                  U R4                  X4   U5      nU R6                  X4   n[9        U R;                  U5      U R;                  U5      U R;                  U5      U R;                  U5      U R;                  U5      U R;                  U5      S9$ )a   
Sample a batch of transitions and compute n-step returns.

For each sampled transition, the method computes the cumulative discounted reward over
the next `n_steps`, properly handling episode termination and timeouts.
The next observation and done flag correspond to the last transition in the computed n-step trajectory.

:param batch_inds: Indices of samples to retrieve
:param env: Optional VecNormalize environment for normalizing observations/rewards
:return: A batch of samples with n-step returns and corresponding observations/actions
r   rH   r*   r   N)axisT)r  keepdimsr   r   )r   r   r   r   r   	discounts)rJ   rK   rL   r   r,   r   r   rU   
logical_orlogical_notr   aranger  r/   r   rg   r   argmaxanywherer   sumre   rf   r`   r   r   r   r   rZ   )r!   rO   rF   r   last_valid_indexoriginal_timeout_valuesstepsr   rewards_seq	dones_seqtruncated_seqdone_or_truncateddone_idxhas_done_or_truncatedmasktarget_q_discountsr  discounted_rewardsn_step_returnslast_indicesr   
next_donesnext_timeoutsfinal_donesr\   r   s                             r#   rM   NStepReplayBuffer._get_samplesr  sc    ii''4;;Z=M=M'N  88a<"&--0@"A"F"F"H*,--8OQSQ_Q_`d`j`jk{`|Q}*~&' 		$,,'//26ag&.$2B2BB ,,T\\'qRVwCW:W-XZ]^JJwAtG(<<=	g1d7/C&CD MM)C$+++3 1 5 51 5 =881T\\A=MNyy&..q"5!T'9JJ "ZZ488T8+J+Q+QRTR\R\+]] JJ"))DLL

"K"S"STUWY"ZZ	(94t;+//Q/F #-1A1AA&&t'='=lK>W'XZ]^ZZk 9:1d7CJJ2::V
lK&?@DIPPQSQ[Q[\ C-$78 +B&' !!$"3"3J4K"LcR,,z67"s+MM'*"mmH5--,MM.1mm$67
 	
r%   )r   r  r^   )rk   rl   rm   rn   ro   rs   r   r   rJ   rv   r   r   r   rM   rz   r{   r|   s   @r#   r  r  K  sY    > ./t fs fu f fD
rzz D
8N D
Zm D
 D
r%   r  )$r   abcr   r   collections.abcr   r   r   r   r   r   rJ   torchrW   	gymnasiumr	   &stable_baselines3.common.preprocessingr
   r   %stable_baselines3.common.type_aliasesr   r   r   r   stable_baselines3.common.utilsr    stable_baselines3.common.vec_envr   r   ImportErrorr   r~   r   r   r  r   r%   r#   <module>r>     s     # % ' '    P  6 9
@ @Fv: vrrFJ rFji
| i
XP
 P
fk
 k
i  Fs   B BB