
    h9                         S SK r S SKrS SKrS SKJrJrJrJr  S SKr	S SK
rS SKJr  S SKJr  S SKJrJr  S SKJr  S SKJr  S SKJrJrJr  S S	KJrJr  S S
KJr  \" SSS9r  " S S\5      r!g)    N)AnyOptionalTypeVarUnion)spaces)BaseAlgorithm)DictRolloutBufferRolloutBuffer)BaseCallbackActorCriticPolicy)GymEnvMaybeCallbackSchedule)obs_as_tensor	safe_mean)VecEnvSelfOnPolicyAlgorithmOnPolicyAlgorithm)boundc            0         ^  \ rS rSr% Sr\\S'   \\S'              S0S\\	\
\   4   S\\\	4   S\\\4   S\S	\S
\S\S\S\S\S\S\\
\      S\\\	\4      S\S\\	   S\S\\\	\4      S\S\\   S\\R(                  \	4   S\S\\\
\R.                     S4      4,U 4S jjjrS1S jrS2S\	SS4S  jjrS\S!\S\S"\S\4
S# jrS1S$ jrS3S%\SS4S& jjr     S4S'\ S(\S!\!S)\S*\	S+\S,\S\ 4S- jjr"S\\#\	   \#\	   4   4S. jr$S/r%U =r&$ )5r      a  
    The base for On-Policy algorithms (ex: A2C/PPO).

    :param policy: The policy model to use (MlpPolicy, CnnPolicy, ...)
    :param env: The environment to learn from (if registered in Gym, can be str)
    :param learning_rate: The learning rate, it can be a function
        of the current progress remaining (from 1 to 0)
    :param n_steps: The number of steps to run for each environment per update
        (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
    :param gamma: Discount factor
    :param gae_lambda: Factor for trade-off of bias vs variance for Generalized Advantage Estimator.
        Equivalent to classic advantage when set to 1.
    :param ent_coef: Entropy coefficient for the loss calculation
    :param vf_coef: Value function coefficient for the loss calculation
    :param max_grad_norm: The maximum value for the gradient clipping
    :param use_sde: Whether to use generalized State Dependent Exploration (gSDE)
        instead of action noise exploration (default: False)
    :param sde_sample_freq: Sample a new noise matrix every n steps when using gSDE
        Default: -1 (only sample at the beginning of the rollout)
    :param rollout_buffer_class: Rollout buffer class to use. If ``None``, it will be automatically selected.
    :param rollout_buffer_kwargs: Keyword arguments to pass to the rollout buffer on creation.
    :param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
        the reported success rate, mean episode length, and mean reward over
    :param tensorboard_log: the log location for tensorboard (if None, no logging)
    :param monitor_wrapper: When creating an environment, whether to wrap it
        or not in a Monitor wrapper.
    :param policy_kwargs: additional arguments to be passed to the policy on creation
    :param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
        debug messages
    :param seed: Seed for the pseudo random generators
    :param device: Device (cpu, cuda, ...) on which the code should be run.
        Setting it to auto, the code will be run on the GPU if possible.
    :param _init_setup_model: Whether or not to build the network at the creation of the instance
    :param supported_action_spaces: The action spaces supported by the algorithm.
    """

    rollout_buffer: RolloutBuffer
    policy: ActorCriticPolicy

    def __init__(
        self,
        policy: Union[str, type[ActorCriticPolicy]],
        env: Union[GymEnv, str],
        learning_rate: Union[float, Schedule],
        n_steps: int,
        gamma: float,
        gae_lambda: float,
        ent_coef: float,
        vf_coef: float,
        max_grad_norm: float,
        use_sde: bool,
        sde_sample_freq: int,
        rollout_buffer_class: Optional[type[RolloutBuffer]] = None,
        rollout_buffer_kwargs: Optional[dict[str, Any]] = None,
        stats_window_size: int = 100,
        tensorboard_log: Optional[str] = None,
        monitor_wrapper: bool = True,
        policy_kwargs: Optional[dict[str, Any]] = None,
        verbose: int = 0,
        seed: Optional[int] = None,
        device: Union[th.device, str] = "auto",
        _init_setup_model: bool = True,
        supported_action_spaces: Optional[tuple[type[spaces.Space], ...]] = None,
    ):
        super().__init__(
            policy=policy,
            env=env,
            learning_rate=learning_rate,
            policy_kwargs=policy_kwargs,
            verbose=verbose,
            device=device,
            use_sde=use_sde,
            sde_sample_freq=sde_sample_freq,
            support_multi_env=True,
            monitor_wrapper=monitor_wrapper,
            seed=seed,
            stats_window_size=stats_window_size,
            tensorboard_log=tensorboard_log,
            supported_action_spaces=supported_action_spaces,
        )

        self.n_steps = n_steps
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.rollout_buffer_class = rollout_buffer_class
        self.rollout_buffer_kwargs = rollout_buffer_kwargs or {}

        if _init_setup_model:
            self._setup_model()

    def _setup_model(self) -> None:
        self._setup_lr_schedule()
        self.set_random_seed(self.seed)

        # Automatically select a rollout buffer class if none was provided
        if self.rollout_buffer_class is None:
            if isinstance(self.observation_space, spaces.Dict):
                self.rollout_buffer_class = DictRolloutBuffer
            else:
                self.rollout_buffer_class = RolloutBuffer

        self.rollout_buffer = self.rollout_buffer_class(
            self.n_steps,
            self.observation_space,
            self.action_space,
            device=self.device,
            gamma=self.gamma,
            gae_lambda=self.gae_lambda,
            n_envs=self.n_envs,
            **self.rollout_buffer_kwargs,
        )
        self.policy = self.policy_class(
            self.observation_space, self.action_space, self.lr_schedule, use_sde=self.use_sde, **self.policy_kwargs
        )
        self.policy = self.policy.to(self.device)

        # Warn when not using CPU with MlpPolicy
        self._maybe_recommend_cpu()
    def _maybe_recommend_cpu(self, mlp_class_name: str = "ActorCriticPolicy") -> None:
        """
        Recommend to use CPU only when using A2C/PPO with MlpPolicy.

        :param mlp_class_name: The name of the class for the default MlpPolicy.
        """
        policy_class_name = self.policy_class.__name__
        if self.device != th.device("cpu") and policy_class_name == mlp_class_name:
            warnings.warn(
                f"You are trying to run {self.__class__.__name__} on the GPU, "
                "but it is primarily intended to run on the CPU when not using a CNN policy "
                f"(you are using {policy_class_name} which should be a MlpPolicy). "
                "See https://github.com/DLR-RM/stable-baselines3/issues/1245 "
                "for more info. "
                "You can pass `device='cpu'` or `export CUDA_VISIBLE_DEVICES=` to force using the CPU. "
                "Note: The model will train, but the GPU utilization will be poor and "
                "the training might take longer than on CPU.",
                UserWarning,
            )
    def collect_rollouts(
        self,
        env: VecEnv,
        callback: BaseCallback,
        rollout_buffer: RolloutBuffer,
        n_rollout_steps: int,
    ) -> bool:
        """
        Collect experiences using the current policy and fill a ``RolloutBuffer``.
        The term rollout here refers to the model-free notion and should not
        be used with the concept of rollout used in model-based RL or planning.

        :param env: The training environment
        :param callback: Callback that will be called at each step
            (and at the beginning and end of the rollout)
        :param rollout_buffer: Buffer to fill with rollouts
        :param n_rollout_steps: Number of experiences to collect per environment
        :return: True if function returned with at least `n_rollout_steps`
            collected, False if callback terminated rollout prematurely.
        """
        assert self._last_obs is not None, "No previous observation was provided"
        # Switch to eval mode (this affects batch norm / dropout)
        self.policy.set_training_mode(False)

        n_steps = 0
        rollout_buffer.reset()
        # Sample new weights for the state dependent exploration
        if self.use_sde:
            self.policy.reset_noise(env.num_envs)

        callback.on_rollout_start()

        while n_steps < n_rollout_steps:
            if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.policy.reset_noise(env.num_envs)

            with th.no_grad():
                # Convert to pytorch tensor or to TensorDict
                obs_tensor = obs_as_tensor(self._last_obs, self.device)
                actions, values, log_probs = self.policy(obs_tensor)
            actions = actions.cpu().numpy()

            # Rescale and perform action
            clipped_actions = actions

            if isinstance(self.action_space, spaces.Box):
                if self.policy.squash_output:
                    # Unscale the actions to match env bounds
                    # if they were previously squashed (scaled in [-1, 1])
                    clipped_actions = self.policy.unscale_action(clipped_actions)
                else:
                    # Otherwise, clip the actions to avoid out of bound error
                    # as we are sampling from an unbounded Gaussian distribution
                    clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

            new_obs, rewards, dones, infos = env.step(clipped_actions)

            self.num_timesteps += env.num_envs

            # Give access to local variables
            callback.update_locals(locals())
            if not callback.on_step():
                return False

            self._update_info_buffer(infos, dones)
            n_steps += 1

            if isinstance(self.action_space, spaces.Discrete):
                # Reshape in case of discrete action
                actions = actions.reshape(-1, 1)

            # Handle timeout by bootstrapping with value function
            # see GitHub issue #633
            for idx, done in enumerate(dones):
                if (
                    done
                    and infos[idx].get("terminal_observation") is not None
                    and infos[idx].get("TimeLimit.truncated", False)
                ):
                    terminal_obs = self.policy.obs_to_tensor(infos[idx]["terminal_observation"])[0]
                    with th.no_grad():
                        terminal_value = self.policy.predict_values(terminal_obs)[0]
                    rewards[idx] += self.gamma * terminal_value

            rollout_buffer.add(
                self._last_obs,
                actions,
                rewards,
                self._last_episode_starts,
                values,
                log_probs,
            )
            self._last_obs = new_obs
            self._last_episode_starts = dones

        with th.no_grad():
            # Compute value for the last timestep
            values = self.policy.predict_values(obs_as_tensor(new_obs, self.device))

        rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)

        callback.update_locals(locals())

        callback.on_rollout_end()

        return True
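
    # Illustrative sketch (not the buffer's actual code): after the loop above,
    # ``rollout_buffer.compute_returns_and_advantage`` sweeps backwards over the
    # collected steps and applies Generalized Advantage Estimation, roughly:
    #
    #     last_gae = 0.0
    #     for t in reversed(range(n_steps)):
    #         next_non_terminal = 1.0 - dones[t]  # 0 when s_{t+1} is terminal
    #         next_value = values[t + 1] if t + 1 < n_steps else last_values
    #         delta = rewards[t] + gamma * next_value * next_non_terminal - values[t]
    #         last_gae = delta + gamma * gae_lambda * next_non_terminal * last_gae
    #         advantages[t] = last_gae
    #     returns = advantages + values  # TD(lambda) targets for the value function
    #
    # With ``gae_lambda=1`` this reduces to the classic advantage, as noted in
    # the class docstring.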
    def train(self) -> None:
        """
        Consume current rollout data and update policy parameters.
        Implemented by individual algorithms.
        """
        raise NotImplementedError
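
    # Minimal sketch (an assumption for illustration, not SB3 source) of what a
    # concrete subclass plugs in here: one A2C-style gradient step over the
    # buffer filled by ``collect_rollouts``. ``F`` is ``torch.nn.functional``;
    # discrete-action flattening and the ``entropy is None`` gSDE case are omitted.
    #
    #     class MyOnPolicy(OnPolicyAlgorithm):
    #         def train(self) -> None:
    #             self.policy.set_training_mode(True)
    #             self._update_learning_rate(self.policy.optimizer)
    #             # batch_size=None yields the whole rollout as a single batch
    #             for rollout_data in self.rollout_buffer.get(batch_size=None):
    #                 values, log_prob, entropy = self.policy.evaluate_actions(
    #                     rollout_data.observations, rollout_data.actions
    #                 )
    #                 policy_loss = -(rollout_data.advantages * log_prob).mean()
    #                 value_loss = F.mse_loss(rollout_data.returns, values.flatten())
    #                 loss = policy_loss - self.ent_coef * entropy.mean() + self.vf_coef * value_loss
    #                 self.policy.optimizer.zero_grad()
    #                 loss.backward()
    #                 th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
    #                 self.policy.optimizer.step()
    #
    # See A2C in stable_baselines3 for the real implementation.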
    def dump_logs(self, iteration: int = 0) -> None:
        """
        Write log.

        :param iteration: Current logging iteration
        """
        assert self.ep_info_buffer is not None
        assert self.ep_success_buffer is not None

        time_elapsed = max((time.time_ns() - self.start_time) / 1e9, sys.float_info.epsilon)
        fps = int((self.num_timesteps - self._num_timesteps_at_start) / time_elapsed)
        if iteration > 0:
            self.logger.record("time/iterations", iteration, exclude="tensorboard")
        if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
            self.logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
            self.logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
        self.logger.record("time/fps", fps)
        self.logger.record("time/time_elapsed", int(time_elapsed), exclude="tensorboard")
        self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
        if len(self.ep_success_buffer) > 0:
            self.logger.record("rollout/success_rate", safe_mean(self.ep_success_buffer))
        self.logger.dump(step=self.num_timesteps)

    def learn(
        self: SelfOnPolicyAlgorithm,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        tb_log_name: str = "OnPolicyAlgorithm",
        reset_num_timesteps: bool = True,
        progress_bar: bool = False,
    ) -> SelfOnPolicyAlgorithm:
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps,
            callback,
            reset_num_timesteps,
            tb_log_name,
            progress_bar,
        )

        callback.on_training_start(locals(), globals())

        assert self.env is not None

        while self.num_timesteps < total_timesteps:
            continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)

            if not continue_training:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                assert self.ep_info_buffer is not None
                self.dump_logs(iteration)

            self.train()

        callback.on_training_end()

        return self

    def _get_torch_save_params(self) -> tuple[list[str], list[str]]:
        state_dicts = ["policy", "policy.optimizer"]

        return state_dicts, []