
    h;                         S SK r S SKJrJrJrJrJr  S SKrS SK	r
S SKJr  S SKJr  S SKJr  S SKJr  S SKJrJrJrJr  S SKJrJrJr  S S	KJrJr  \" S
SS9r  " S S\5      r!g)    N)AnyClassVarOptionalTypeVarUnion)spaces)
functional)RolloutBuffer)OnPolicyAlgorithm)ActorCriticCnnPolicyActorCriticPolicy
BasePolicyMultiInputActorCriticPolicy)GymEnvMaybeCallbackSchedule)FloatScheduleexplained_varianceSelfPPOPPO)boundc            5         ^  \ rS rSr% Sr\\\S.r\	\
\\\   4      \S'                           S-S\\\\   4   S\\\4   S\\\4   S	\S
\S\S\S\S\\\4   S\S\\4   S\S\S\S\S\S\S\\\      S\\
\\4      S\\   S\S\\   S\\
\\4      S\S\\   S\\R2                  \4   S\44U 4S  jjjrS.U 4S" jjrS.S# jr     S/S$\S%\S&\S'\S(\S)\S*\S!\4U 4S+ jjjrS,r U =r!$ )0r      a  
Proximal Policy Optimization algorithm (PPO) (clip version)

Paper: https://arxiv.org/abs/1707.06347
Code: This implementation borrows code from OpenAI Spinning Up (https://github.com/openai/spinningup/)
https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail and
Stable Baselines (PPO2 from https://github.com/hill-a/stable-baselines)

Introduction to PPO: https://spinningup.openai.com/en/latest/algorithms/ppo.html

:param policy: The policy model to use (MlpPolicy, CnnPolicy, ...)
:param env: The environment to learn from (if registered in Gym, can be str)
:param learning_rate: The learning rate, it can be a function
    of the current progress remaining (from 1 to 0)
:param n_steps: The number of steps to run for each environment per update
    (i.e. rollout buffer size is n_steps * n_envs where n_envs is number of environment copies running in parallel)
    NOTE: n_steps * n_envs must be greater than 1 (because of the advantage normalization)
    See https://github.com/pytorch/pytorch/issues/29372
:param batch_size: Minibatch size
:param n_epochs: Number of epoch when optimizing the surrogate loss
:param gamma: Discount factor
:param gae_lambda: Factor for trade-off of bias vs variance for Generalized Advantage Estimator
:param clip_range: Clipping parameter, it can be a function of the current progress
    remaining (from 1 to 0).
:param clip_range_vf: Clipping parameter for the value function,
    it can be a function of the current progress remaining (from 1 to 0).
    This is a parameter specific to the OpenAI implementation. If None is passed (default),
    no clipping will be done on the value function.
    IMPORTANT: this clipping depends on the reward scaling.
:param normalize_advantage: Whether to normalize or not the advantage
:param ent_coef: Entropy coefficient for the loss calculation
:param vf_coef: Value function coefficient for the loss calculation
:param max_grad_norm: The maximum value for the gradient clipping
:param use_sde: Whether to use generalized State Dependent Exploration (gSDE)
    instead of action noise exploration (default: False)
:param sde_sample_freq: Sample a new noise matrix every n steps when using gSDE
    Default: -1 (only sample at the beginning of the rollout)
:param rollout_buffer_class: Rollout buffer class to use. If ``None``, it will be automatically selected.
:param rollout_buffer_kwargs: Keyword arguments to pass to the rollout buffer on creation
:param target_kl: Limit the KL divergence between updates,
    because the clipping is not enough to prevent large update
    see issue #213 (cf https://github.com/hill-a/stable-baselines/issues/213)
    By default, there is no limit on the kl div.
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
    the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param policy_kwargs: additional arguments to be passed to the policy on creation. See :ref:`ppo_policies`
:param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
    debug messages
:param seed: Seed for the pseudo random generators
:param device: Device (cpu, cuda, ...) on which the code should be run.
    Setting it to auto, the code will be run on the GPU if possible.
:param _init_setup_model: Whether or not to build the network at the creation of the instance
)	MlpPolicy	CnnPolicyMultiInputPolicypolicy_aliasesNpolicyenvlearning_raten_steps
batch_sizen_epochsgamma
gae_lambda
clip_rangeclip_range_vfnormalize_advantageent_coefvf_coefmax_grad_normuse_sdesde_sample_freqrollout_buffer_classrollout_buffer_kwargs	target_klstats_window_sizetensorboard_logpolicy_kwargsverboseseeddevice_init_setup_modelc                   > [         TU ]  UUUUUUUUUUUUUUUUUUUS[        R                  [        R                  [        R
                  [        R                  4S9  U(       a  US:  d   S5       eU R                  b  U R                  R                  U R                  -  nUS:  d2  U(       a+   SU R                   SU R                  R                   35       eUU-  nUU-  S:  aJ  [        R                  " SU S	U S
U SUU-   SU R                   SU R                  R                   S35        XPl        X`l        Xl        Xl        Xl        UU l        U(       a  U R%                  5         g g )NF)r    r!   r$   r%   r)   r*   r+   r,   r-   r.   r/   r1   r2   r3   r4   r6   r5   r7   supported_action_spaces   z_`batch_size` must be greater than 1. See https://github.com/DLR-RM/stable-baselines3/issues/440z=`n_steps * n_envs` must be greater than 1. Currently n_steps=z and n_envs=r   z(You have specified a mini-batch size of zA, but because the `RolloutBuffer` is of size `n_steps * n_envs = z`, after every zH untruncated mini-batches, there will be a truncated mini-batch of size zZ
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=))super__init__r   BoxDiscreteMultiDiscreteMultiBinaryr   num_envsr!   warningswarnr"   r#   r&   r'   r(   r0   _setup_model)selfr   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   buffer_sizeuntruncated_batches	__class__s                                S/home/james-whalen/.local/lib/python3.13/site-packages/stable_baselines3/ppo/ppo.pyr=   PPO.__init__P   s   : 	'!'+!5"7/+'#

$$""	%+ 	 	
> Qqpq 88 ((++dll:K?'}Nt||n\himiqiqizizh{|}  #.";Z'!+>zl KWWbVc d$$7#8 9EEPS]E]D^ _&&*ll^<@Q@Q?RRSU % $*#6 "     returnc                   > [         TU ]  5         [        U R                  5      U l        U R                  bW  [        U R                  [        [        45      (       a  U R                  S:  d   S5       e[        U R                  5      U l        g g )Nr   zG`clip_range_vf` must be positive, pass `None` to deactivate vf clipping)r<   rE   r   r&   r'   
isinstancefloatint)rF   rI   s    rJ   rE   PPO._setup_model   su     (8)$,,ucl;;))A-{/{{-!.t/A/A!BD	 *rL   c           	      f   U R                   R                  S5        U R                  U R                   R                  5        U R	                  U R
                  5      nU R                  b  U R                  U R
                  5      n/ n/ / pT/ nSn[        U R                  5       GH  n/ n	U R                  R                  U R                  5       GH?  n
U
R                  n[        U R                  [        R                   5      (       a(  U
R                  R#                  5       R%                  5       nU R                   R'                  U
R(                  U5      u  pnUR%                  5       nU
R*                  nU R,                  (       a5  [/        U5      S:  a&  XR1                  5       -
  UR3                  5       S-   -  n[4        R6                  " XR8                  -
  5      nUU-  nU[4        R:                  " USU-
  SU-   5      -  n[4        R<                  " UU5      R1                  5       * nUR?                  URA                  5       5        [4        R0                  " [4        RB                  " US-
  5      U:  RE                  5       5      RA                  5       nUR?                  U5        U R                  c  UnO2U
RF                  [4        R:                  " XRF                  -
  W* U5      -   n[H        RJ                  " U
RL                  U5      nUR?                  URA                  5       5        Uc  [4        R0                  " U* 5      * nO[4        R0                  " U5      * nUR?                  URA                  5       5        UU RN                  U-  -   U RP                  U-  -   n[4        RR                  " 5          XR8                  -
  n[4        R0                  " [4        R6                  " U5      S-
  U-
  5      RU                  5       RW                  5       nU	R?                  U5        SSS5        U RX                  b9  WSU RX                  -  :  a&  SnU RZ                  S:  a  []        SU SUS	 35          OU R                   R                  R_                  5         URa                  5         [4        Rb                  Rd                  Rg                  U R                   Ri                  5       U Rj                  5        U R                   R                  Rm                  5         GMB     U =Rn                  S-  sl7        U(       a  GM    O   [q        U R                  Rr                  R%                  5       U R                  RL                  R%                  5       5      nU Rt                  Rw                  S
[x        R0                  " U5      5        U Rt                  Rw                  S[x        R0                  " U5      5        U Rt                  Rw                  S[x        R0                  " U5      5        U Rt                  Rw                  S[x        R0                  " W	5      5        U Rt                  Rw                  S[x        R0                  " U5      5        U Rt                  Rw                  SWRA                  5       5        U Rt                  Rw                  SU5        [{        U R                   S5      (       a`  U Rt                  Rw                  S[4        R6                  " U R                   R|                  5      R1                  5       RA                  5       5        U Rt                  Rw                  SU Rn                  SS9  U Rt                  Rw                  SU5        U R                  b  U Rt                  Rw                  SW5        gg! , (       d  f       GN= f)z<
Update policy using the currently gathered rollout buffer.
TNr:   g:0yE>g      ?FzEarly stopping at step z due to reaching max kl: z.2fztrain/entropy_lossztrain/policy_gradient_lossztrain/value_lossztrain/approx_klztrain/clip_fractionz
train/lossztrain/explained_variancelog_stdz	train/stdztrain/n_updatestensorboard)excludeztrain/clip_rangeztrain/clip_range_vf)?r   set_training_mode_update_learning_rate	optimizerr&   _current_progress_remainingr'   ranger#   rollout_buffergetr"   actionsrO   action_spacer   r?   longflattenevaluate_actionsobservations
advantagesr(   lenmeanstdthexpold_log_probclampminappenditemabsrP   
old_valuesFmse_lossreturnsr)   r*   no_gradcpunumpyr0   r4   print	zero_gradbackwardnnutilsclip_grad_norm_
parametersr+   step
_n_updatesr   valuesloggerrecordnphasattrrT   )rF   r&   r'   entropy_losses	pg_lossesvalue_lossesclip_fractionscontinue_trainingepochapprox_kl_divsrollout_datar^   r   log_probentropyrd   ratiopolicy_loss_1policy_loss_2policy_lossclip_fractionvalues_pred
value_lossentropy_lossloss	log_ratioapprox_kl_divexplained_vars                               rJ   train	PPO.train   s   
 	%%d+""4;;#8#89__T%E%EF
) ..t/O/OPM"$b< 4==)EN $ 3 3 7 7 H&..d//AA*22779AACG,0KK,H,HIbIbdk,l)'))44
++J!0C",/@"@Z^^EUX\E\!]J x*C*CCD !+U 2 *RXXeQ^QQ[^-\ \!vvm]CHHJJ   !1!1!34 "	):Z)G(N(N(P Q V V X%%m4%%-"(K #/"9"9BHH!8!88=.-= #K ZZ(<(<kJ
##JOO$56 ?$&GGXI$6#6L$&GGG$4#4L%%l&7&7&9:"T]]\%AADLLS]D]] ZZ\ (+D+D DI$&GGRVVI->-Bi,O$P$T$T$V$\$\$^M"))-8 "
 >>--#BV2V(-%||q( 7w>WXefiWjkl %%//1++DKK,B,B,DdFXFXY%%**,O !IR OOq O$$] *` +4+>+>+E+E+M+M+OQUQdQdQlQlQtQtQvw 	/1HI79KL-rww|/DE,bggn.EF0"''.2IJ<55}E4;;	**KK{BFF4;;3F3F,G,L,L,N,S,S,UV,doo}U-z:)KK4mD *K "\s   .A,^!!
^0rF   total_timestepscallbacklog_intervaltb_log_namereset_num_timestepsprogress_barc           	      (   > [         TU ]  UUUUUUS9$ )N)r   r   r   r   r   r   )r<   learn)rF   r   r   r   r   r   r   rI   s          rJ   r   	PPO.learn.  s-     w}+%# 3%  
 	
rL   )r"   r&   r'   r#   r(   r0   )ga2U0*3?i   @   
   gGz?gffffff?g?NTg              ?r   FNNNd   NNr   NautoT)rM   N)Nr:   r   TF)"__name__
__module____qualname____firstlineno____doc__r   r   r   r   r   dictstrtyper   __annotations__r   r   rP   r   rQ   boolr   r
   r   rh   r6   r=   rE   r   r   r   r   __static_attributes____classcell__)rI   s   @rJ   r   r      s   5p ')7=NHT#tJ'7"789  15 -06:$("!>B:>%)!$)-26"(."&7[ c4 1223[  63;[  UH_-	[ 
 [  [  [  [  [  %/*[  T5(23[  "[  [  [  [   ![ " #[ $ 'tM':;%[ &  (S#X7'[ ( E?)[ * +[ , "#-[ .  S#X//[ 0 1[ 2 sm3[ 4 biin%5[ 6  7[  [ z	CtEr #' $("


  
 	

 
 "
 
 

 
rL   )"rC   typingr   r   r   r   r   rv   r   torchrh   	gymnasiumr   torch.nnr	   rq    stable_baselines3.common.buffersr
   ,stable_baselines3.common.on_policy_algorithmr   !stable_baselines3.common.policiesr   r   r   r   %stable_baselines3.common.type_aliasesr   r   r   stable_baselines3.common.utilsr   r   r   r    rL   rJ   <module>r      sJ     : :    $ : J ~ ~ Q Q L
)5
)l

 l
rL   