
    h              	          S r SSKJr  SSKJrJr  SSKrSSKr	SSK
JrJr  SSKJr  S/r " S S\	R                   \\\\4   \	R"                  R$                  5      rg)	zA collection of wrappers for modifying the reward with an internal state.

* ``NormalizeReward`` - Normalizes the rewards to a mean and standard deviation
    )annotations)AnySupportsFloatN)ActTypeObsType)RunningMeanStdNormalizeRewardc                     ^  \ rS rSrSr  S     S	S jjr\S
S j5       r\R                  SS j5       r    SU 4S jjr	Sr
U =r$ )r	      a  Normalizes immediate rewards such that their exponential moving average has an approximately fixed variance.

The property `_update_running_mean` allows to freeze/continue the running mean calculation of the reward
statistics. If `True` (default), the `RunningMeanStd` will get updated every time `self.normalize()` is called.
If False, the calculated statistics are used but not updated anymore; this may be used during evaluation.

A vector version of the wrapper exists :class:`gymnasium.wrappers.vector.NormalizeReward`.

Note:
    In v0.27, NormalizeReward was updated as the forward discounted reward estimate was incorrectly computed in Gym v0.25+.
    For more detail, read [#3154](https://github.com/openai/gym/pull/3152).

Note:
    The scaling depends on past trajectories and rewards will not be scaled correctly if the wrapper was newly
    instantiated or the policy was changed recently.

Example without the normalize reward wrapper:
    >>> import numpy as np
    >>> import gymnasium as gym
    >>> env = gym.make("MountainCarContinuous-v0")
    >>> _ = env.reset(seed=123)
    >>> _ = env.action_space.seed(123)
    >>> episode_rewards = []
    >>> terminated, truncated = False, False
    >>> while not (terminated or truncated):
    ...     observation, reward, terminated, truncated, info = env.step(env.action_space.sample())
    ...     episode_rewards.append(reward)
    ...
    >>> env.close()
    >>> np.var(episode_rewards)
    np.float64(0.0008876301247721108)

Example with the normalize reward wrapper:
    >>> import numpy as np
    >>> import gymnasium as gym
    >>> env = gym.make("MountainCarContinuous-v0")
    >>> env = NormalizeReward(env, gamma=0.99, epsilon=1e-8)
    >>> _ = env.reset(seed=123)
    >>> _ = env.action_space.seed(123)
    >>> episode_rewards = []
    >>> terminated, truncated = False, False
    >>> while not (terminated or truncated):
    ...     observation, reward, terminated, truncated, info = env.step(env.action_space.sample())
    ...     episode_rewards.append(reward)
    ...
    >>> env.close()
    >>> np.var(episode_rewards)
    np.float64(0.010162116476634746)

Change logs:
 * v0.21.0 - Initially added
 * v1.0.0 - Add `update_running_mean` attribute to allow disabling of updating the running mean / standard
c                   [         R                  R                  R                  XUS9  [         R                  R                  X5        [        SS9U l        [        R                  " S/5      U l	        X l
        X0l        SU l        g)a6  This wrapper will normalize immediate rewards s.t. their exponential moving average has an approximately fixed variance.

Args:
    env (env): The environment to apply the wrapper
    epsilon (float): A stability parameter
    gamma (float): The discount factor that is used in the exponential moving average.
)gammaepsilon )shapeg        TN)gymutilsRecordConstructorArgs__init__Wrapperr   
return_rmsnparraydiscounted_rewardr   r   _update_running_mean)selfenvr   r   s       \/home/james-whalen/.local/lib/python3.13/site-packages/gymnasium/wrappers/stateful_reward.pyr   NormalizeReward.__init__M   sc     			''00G0TT'(r2!#3%
$(!    c                    U R                   $ )zRProperty to freeze/continue the running mean calculation of the reward statistics.r   )r   s    r   update_running_mean#NormalizeReward.update_running_meanc   s     (((r   c                    Xl         g)z[Sets the property to freeze/continue the running mean calculation of the reward statistics.Nr!   )r   settings     r   r"   r#   h   s
     %,!r   c                t  > [         TU ]  U5      u  p#pEnU R                  U R                  -  SU-
  -  [	        U5      -   U l        U R
                  (       a%  U R                  R                  U R                  5        U[        R                  " U R                  R                  U R                  -   5      -  nX'XEU4$ )z?Steps through the environment, normalizing the reward returned.   )superstepr   r   floatr   r   updater   sqrtvarr   )	r   actionobsreward
terminated	truncatedinfonormalized_reward	__class__s	           r   r)   NormalizeReward.stepm   s     497<3G0ZD "&!7!7$**!D
N"
&M" $$OO""4#9#9: #RWWT__-@-@4<<-O%PPzdBBr   )r   r   r   r   r   )gGz?g:0yE>)r   zgym.Env[ObsType, ActType]r   r*   r   r*   )returnbool)r%   r8   )r.   r   r7   z9tuple[ObsType, SupportsFloat, bool, bool, dict[str, Any]])__name__
__module____qualname____firstlineno____doc__r   propertyr"   setterr)   __static_attributes____classcell__)r5   s   @r   r	   r	      s~    4r 	)&) ) 	), ) ) ,  ,CC	BC Cr   )r=   
__future__r   typingr   r   numpyr   	gymnasiumr   gymnasium.corer   r   gymnasium.wrappers.utilsr   __all__r   r   r   r	   r   r   r   <module>rI      sW   
 # %   + 3 
hCKK'723SYY5T5ThCr   