
    h(J                         S SK r S SKrS SKJrJrJr  S SKrS SKr	S SK
Jr  S SKJr  S SKJr  S SKJrJr  S SKJrJr   " S S	\5      rg)
    N)AnyOptionalUnion)spaces)DictReplayBuffer)DictReplayBufferSamples)VecEnvVecNormalize)KEY_TO_GOAL_STRATEGYGoalSelectionStrategyc                     ^  \ rS rSr% Sr\\   \S'          S'S\S\	R                  S\	R                  S\S\\R                  \4   S\S	\S
\S\S\\\4   S\4U 4S jjjrS\\\4   4S jrS\\\4   SS4S jrS\SS4S jrS\\\R2                  4   S\\\R2                  4   S\R2                  S\R2                  S\R2                  S\\\\4      SS4U 4S jjrS\SS4S jrS(S\S\\   S\4S jjr S(S \R2                  S!\R2                  S\\   S\4S" jjr  S(S \R2                  S!\R2                  S\\   S\4S# jjr!S \R2                  S!\R2                  S\R2                  4S$ jr"S)S% jr#S&r$U =r%$ )*HerReplayBuffer   a7  
Hindsight Experience Replay (HER) buffer.
Paper: https://arxiv.org/abs/1707.01495

Replay buffer for sampling HER (Hindsight Experience Replay) transitions.

.. note::

  Compared to other implementations, the ``future`` goal sampling strategy is inclusive:
  the current transition can be used when re-sampling.

:param buffer_size: Max number of element in the buffer
:param observation_space: Observation space
:param action_space: Action space
:param env: The training environment
:param device: PyTorch device
:param n_envs: Number of parallel environments
:param optimize_memory_usage: Enable a memory efficient variant
    Disabled for now (see https://github.com/DLR-RM/stable-baselines3/pull/243#discussion_r531535702)
:param handle_timeout_termination: Handle timeout termination (due to timelimit)
    separately and treat the task as infinite horizon task.
    https://github.com/DLR-RM/stable-baselines3/issues/284
:param n_sampled_goal: Number of virtual transitions to create per real transition,
    by sampling new goals.
:param goal_selection_strategy: Strategy for sampling goals for replay.
    One of ['episode', 'final', 'future']
:param copy_info_dict: Whether to copy the info dictionary and pass it to
    ``compute_reward()`` method.
    Please note that the copy may cause a slowdown.
    False by default.
envbuffer_sizeobservation_spaceaction_spacedevicen_envsoptimize_memory_usagehandle_timeout_terminationn_sampled_goalgoal_selection_strategycopy_info_dictc           
        > [         TU ]  UUUUUUUS9  X@l        Xl        [	        U
[
        5      (       a  [        U
R                  5          U l        OXl        [	        U R                  [        5      (       d   S[        [        5       35       eXl        SSU R                  S-   -  -
  U l        [        R                  " [        U R                   5       Vs/ s H)  n[        U R"                  5       Vs/ s H  n0 PM     snPM+     sn5      U l        [        R&                  " U R                   U R"                  4[        R(                  S9U l        [        R&                  " U R                   U R"                  4[        R(                  S9U l        [        R&                  " U R"                  [        R(                  S9U l        g s  snf s  snf )N)r   r   r   r   z3Invalid goal selection strategy, please use one of    g      ?)dtype)super__init__r   r   
isinstancestrr   lowerr   r   listr   	her_rationparrayranger   r   infoszerosint64ep_start	ep_length_current_ep_start)selfr   r   r   r   r   r   r   r   r   r   r   _	__class__s                a/home/james-whalen/.local/lib/python3.13/site-packages/stable_baselines3/her/her_replay_buffer.pyr   HerReplayBuffer.__init__2   s    	"7'A 	 	
 , -s33+?@W@]@]@_+`D(+B( ((*?
 
 	_@F[A\@]^	_ 
 - cT%8%81%<=>XXtO_O_I`aI`AE$++,>?,>q,>?I`ab

 $"2"2DKK!@Q4#3#3T[["AR!#$++RXX!F  @as   G3G?GGreturnc                 @    U R                   R                  5       nUS	 U$ )zY
Gets state for pickling.

Excludes self.env, as in general Env's may not be pickleable.
r   )__dict__copyr.   states     r1   __getstate__HerReplayBuffer.__getstate__e   s"     ""$%L    r8   Nc                 X    U R                   R                  U5        SU;  d   eSU l        g)ze
Restores pickled state.

User must call ``set_env()`` after unpickling before using.

:param state:
r   N)r5   updater   r7   s     r1   __setstate__HerReplayBuffer.__setstate__p   s+     	U#E!!!r;   c                 @    U R                   b  [        S5      eXl         g)z$
Sets the environment.

:param env:
Nz5Trying to set env of already initialized environment.)r   
ValueError)r.   r   s     r1   set_envHerReplayBuffer.set_env|   s     88TUUr;   obsnext_obsactionrewarddoner(   c                   > [        U R                  5       H  nU R                  U R                  U4   nU R                  U R                  U4   n	U	S:  d  MA  X-   n
[
        R                  " U R                  U
5      U R                  -  nSU R                  X4'   M     U R                  R                  5       U R                  U R                  '   U R                  (       a  X`R                  U R                  '   [        TU ]5  XX4XV5        [        U R                  5       H  nXW   (       d  M  U R                  U5        M!     g )Nr   )r'   r   r+   posr,   r%   aranger   r-   r6   r   r(   r   add_compute_episode_length)r.   rD   rE   rF   rG   rH   r(   env_idxepisode_startepisode_lengthepisode_endepisode_indicesr0   s               r1   rL   HerReplayBuffer.add   s     T[[)G MM$((G*;<M!^^DHHg,=>N!+<"$))DHHk"BTEUEU"U;<78 * #'"8"8"="="?dhh#(JJtxx C64? T[[)G}},,W5 *r;   rN   c                     U R                   U   nU R                  nX2:  a  X0R                  -  n[        R                  " X#5      U R                  -  nX2-
  U R
                  XA4'   U R                  U R                   U'   g)z
Compute and store the episode length for environment with index env_idx

:param env_idx: index of the environment for which the episode length should be computed
N)r-   rJ   r   r%   rK   r,   )r.   rN   rO   rQ   rR   s        r1   rM   'HerReplayBuffer._compute_episode_length   st     ..w7hh& +++K))M?$BRBRR3>3N/0*.((w'r;   
batch_sizec           	         U R                   S:  n[        R                  " U5      (       d  [        S5      e[        R                  " U5      n[        R
                  R                  XASS9n[        R                  " XSR                  5      u  pg[        U R                  U-  5      n[        R                  " Xh/5      u  p[        R                  " Xx/5      u  pU R                  XU5      nU R                  XU5      nUR                  R                  5        Vs0 s H6  nU[         R"                  " UR                  U   UR                  U   45      _M8     nn[         R"                  " UR$                  UR$                  45      nUR&                  R                  5        Vs0 s H6  nU[         R"                  " UR&                  U   UR&                  U   45      _M8     nn[         R"                  " UR(                  UR(                  45      n[         R"                  " UR*                  UR*                  45      n[-        UUUUUS9$ s  snf s  snf )z
Sample elements from the replay buffer.

:param batch_size: Number of element to sample
:param env: Associated VecEnv to normalize the observations/rewards when sampling
:return: Samples
r   zUnable to sample before the end of the first episode. We recommend choosing a value for learning_starts that is greater than the maximum number of timesteps in the environment.T)sizereplaceobservationsactionsnext_observationsdonesrewards)r,   r%   anyRuntimeErrorflatnonzerorandomchoiceunravel_indexshapeintr$   split_get_real_samples_get_virtual_samplesr[   keysthcatr\   r]   r^   r_   r   )r.   rV   r   is_validvalid_indicessampled_indicesbatch_indicesenv_indices
nb_virtualvirtual_batch_indicesreal_batch_indicesvirtual_env_indicesreal_env_indices	real_datavirtual_datakeyr[   r\   r]   r^   r_   s                        r1   sampleHerReplayBuffer.sample   s    >>A%vvho  x0))**=SW*X &(%5%5o~~%V" *45
46HH]L4Y102l0S- **+=QTU	001F]`a
 $00557
7 //4l6O6OPS6TUVV7 	 
 &&)++\-A-ABC $55::<
< 44S9<;Y;YZ];^_``< 	 
 	););<=&&)++\-A-ABC&%/
 	



s   =I=Irq   rr   c                    U R                  U R                  R                  5        VVs0 s H  u  pEXEXSS24   _M     snnU5      nU R                  U R                  R                  5        VVs0 s H  u  pEXEXSS24   _M     snnU5      n[	        U[
        5      (       d   e[	        U[
        5      (       d   eUR                  5        VVs0 s H  u  pEX@R                  U5      _M     nnnUR                  5        VVs0 s H  u  pEX@R                  U5      _M     n	nn[        UU R                  U R                  X4   5      U	U R                  U R                  X4   SU R                  X4   -
  -  5      R                  SS5      U R                  U R                  U R                  X4   R                  SS5      U5      5      S9$ s  snnf s  snnf s  snnf s  snnf )a$  
Get the samples corresponding to the batch and environment indices.

:param batch_indices: Indices of the transitions
:param env_indices: Indices of the environments
:param env: associated gym VecEnv to normalize the
    observations/rewards when sampling, defaults to None
:return: Samples
Nr   rZ   )_normalize_obsr[   itemsr]   r    dictto_torchr   r\   r^   timeoutsreshape_normalize_rewardr_   )
r.   rq   rr   r   rz   rD   obs_	next_obs_r[   r]   s
             r1   ri   !HerReplayBuffer._get_real_samples   s     ""\`\m\m\s\s\u#v\uPXPSC]-J)K$K\u#vx{|''EIE[E[EaEaEcdEcSm!344Ecdfi
	 $%%%%)T****@D

MHC]]3//MEN__EVWEVS--"44EVW&%MM$,,}/I"JK/ --

=56!dmmMLf>g:ghgb!nMM$"8"8mF`9a9i9ijlno9pru"vw

 
	
 $wd NWs   G	
/G
GGc                    U R                   R                  5        VVs0 s H  u  pEXEXSS24   _M     snnnU R                  R                  5        VVs0 s H  u  pEXEXSS24   _M     nnnU R                  (       a%  [        R
                  " U R                  X4   5      nO$[        [        U5      5       Vs/ s H  n0 PM     nnU R                  X5      n	U	WS'   XS'   U R                  c   S5       eU R                  R                  SUS   US   US/S9n
U
S   R                  [        R                  5      n
U R                  XS5      nU R                  Xc5      nUR                  5        VVs0 s H  u  pEX@R!                  U5      _M     nnnUR                  5        VVs0 s H  u  pEX@R!                  U5      _M     nnn[#        UU R!                  U R$                  X4   5      UU R!                  U R&                  X4   SU R(                  X4   -
  -  5      R+                  S	S5      U R!                  U R-                  U
R+                  S	S5      U5      5      S
9$ s  snnf s  snnf s  snf s  snnf s  snnf )aK  
Get the samples, sample new desired goals and compute new rewards.

:param batch_indices: Indices of the transitions
:param env_indices: Indices of the environments
:param env: associated gym VecEnv to normalize the
    observations/rewards when sampling, defaults to None
:return: Samples, with new desired goals and new rewards
Ndesired_goalzcYou must initialize HerReplayBuffer with a VecEnv so it can compute rewards for virtual transitionscompute_rewardachieved_goalr   )indicesr   r~   rZ   )r[   r   r]   r   r6   deepcopyr(   r'   len_sample_goalsr   
env_methodastyper%   float32r   r   r   r\   r^   r   r   r   )r.   rq   rr   r   rz   rD   rE   r(   r/   	new_goalsr_   r[   r]   s                r1   rj   $HerReplayBuffer._get_virtual_samples  sa     HLGXGXG^G^G`aG`83sA566G`aLPLbLbLhLhLjkLjC]:;;LjkMM$**]-G"HIE!&s='9!:;!:AR!:E;&&}B	'N#,  HH 	qp	q  ((%% _%C & 
 !*##BJJ/!!#+&&x5 AD		LHC]]3//LEM^^EUVEUS--"44EUV&%MM$,,}/I"JK/ --

=56!dmmMLf>g:ghgb!nMM$"8"8Q9OQT"UV

 
	
Q bk
 <> MVs   II"8I(/I-"I3c                     U R                   X4   nU R                  X4   nU R                  [        R                  :X  a  US-
  nOU R                  [        R
                  :X  a1  X-
  U R                  -  n[        R                  R                  Xd5      nOXU R                  [        R                  :X  a!  [        R                  R                  SU5      nO[        SU R                   S35      eXS-   U R                  -  nU R                  S   Xr4   $ )z
Sample goals based on goal_selection_strategy.

:param batch_indices: Indices of the transitions
:param env_indices: Indices of the environments
:return: Sampled goals
r   r   z	Strategy z" for sampling goals not supported!r   )r+   r,   r   r   FINALFUTUREr   r%   rc   randintEPISODErA   r]   )r.   rq   rr   batch_ep_startbatch_ep_lengthtransition_indices_in_episodecurrent_indices_in_episodetransition_indicess           r1   r   HerReplayBuffer._sample_goalsc  s     }'AB..)CD''+@+F+FF,;a,?)))-B-I-II +8*HDL\L\)\&,.II,=,=>X,j)))-B-J-JJ,.II,=,=a,Q) y)E)E(FFhijj;LPTP`P``%%o67I7VWWr;   c                    U R                   U R                  :g  R                  5       (       a  [        R                  " S5        [
        R                  " U R                   U R                  :g  5      S    Hl  nSU R                  U R                  S-
  U4'   U R                  [        U5      5        U R                  (       d  MN  SU R                  U R                  S-
  U4'   Mn     gg)z
If called, we assume that the last trajectory in the replay buffer was finished
(and truncate it).
If not called, we assume that we continue the same trajectory (same episode).
zThe last trajectory in the replay buffer will be truncated.
If you are in the same episode as when the replay buffer was saved,
you should use `truncate_last_trajectory=False` to avoid that issue.r   Tr   N)r-   rJ   r`   warningswarnr%   wherer^   rM   rg   r   r   )r.   rN   s     r1   truncate_last_trajectory(HerReplayBuffer.truncate_last_trajectory  s     ""dhh.3355MMW 88D$:$:dhh$FGJ48

488a<01 ,,S\:222;?DMM$((Q,"78 K 6r;   )	r-   r   r   r,   r+   r   r$   r(   r   )autor   FT   futureF)N)r3   N)&__name__
__module____qualname____firstlineno____doc__r   r	   __annotations__rg   r   DictSpacer   rl   r   r!   boolr   r   r   r   r9   r>   rB   r%   ndarrayr#   rL   rM   r
   r   r{   ri   rj   r   r   __static_attributes____classcell__)r0   s   @r1   r   r      s~   @ 
&	 )/&++/EM$1G1G ";;1G ll	1G
 1G biin%1G 1G  $1G %)1G 1G "''<c'A!B1G 1G 1Gf	d38n 	
$sCx. 
T 
	6 	d 	 6#rzz/" 6 sBJJ' 6 

	 6
 

 6 jj 6 DcN# 6 
 6D3s 3t 3"<
 <
8L+A <
Md <
D '+	%
zz%
 ZZ%
 l#	%

 
!%
V '+	B
zzB
 ZZB
 l#	B

 
!B
HX2:: XBJJ XSUS]S] X>@ @r;   r   )r6   r   typingr   r   r   numpyr%   torchrl   	gymnasiumr    stable_baselines3.common.buffersr   %stable_baselines3.common.type_aliasesr    stable_baselines3.common.vec_envr	   r
   -stable_baselines3.her.goal_selection_strategyr   r   r    r;   r1   <module>r      s7      ' '    = I A eI@& I@r;   