
    h?                         S SK JrJrJrJrJr  S SKrS SKr	S SK
Jr  S SKJr  S SKJr  S SKJr  S SKJr  S SKJrJr  S S	KJrJrJr  S S
KJrJr  S SKJ r J!r!J"r"J#r#J$r$  \" SSS9r% " S S\5      r&g)    )AnyClassVarOptionalTypeVarUnionN)spaces)
functional)ReplayBuffer)ActionNoise)OffPolicyAlgorithm)
BasePolicyContinuousCritic)GymEnvMaybeCallbackSchedule)get_parameters_by_namepolyak_update)Actor	CnnPolicy	MlpPolicyMultiInputPolicy	SACPolicySelfSACSAC)boundc            9         ^  \ rS rSr% Sr\\\S.r\	\
\\\   4      \S'   \\S'   \\S'   \\S'   \\S'                             S5S\\\\   4   S
\\\4   S\\\4   S\S\S\S\S\S\\\\\4   4   S\S\\   S\\\      S\\
\\4      S\S\S\\\4   S\S\\\4   S\S\S\S\S\\   S \\
\\4      S!\S"\\   S#\\R<                  \4   S$\48U 4S% jjjrS6U 4S' jjr S6S( jr!S7S\S\S&S	4S) jjr"     S8S*\#S+\S,\$S-\S.\S/\S0\S&\#4U 4S1 jjjr%S&\&\   4U 4S2 jjr'S&\\&\   \&\   4   4S3 jr(S4r)U =r*$ )9r      a  
Soft Actor-Critic (SAC)
Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor,
This implementation borrows code from original implementation (https://github.com/haarnoja/sac)
from OpenAI Spinning Up (https://github.com/openai/spinningup), from the softlearning repo
(https://github.com/rail-berkeley/softlearning/)
and from Stable Baselines (https://github.com/hill-a/stable-baselines)
Paper: https://arxiv.org/abs/1801.01290
Introduction to SAC: https://spinningup.openai.com/en/latest/algorithms/sac.html

Note: we use double q target and not value target as discussed
in https://github.com/hill-a/stable-baselines/issues/270

:param policy: The policy model to use (MlpPolicy, CnnPolicy, ...)
:param env: The environment to learn from (if registered in Gym, can be str)
:param learning_rate: learning rate for adam optimizer,
    the same learning rate will be used for all networks (Q-Values, Actor and Value function)
    it can be a function of the current progress remaining (from 1 to 0)
:param buffer_size: size of the replay buffer
:param learning_starts: how many steps of the model to collect transitions for before learning starts
:param batch_size: Minibatch size for each gradient update
:param tau: the soft update coefficient ("Polyak update", between 0 and 1)
:param gamma: the discount factor
:param train_freq: Update the model every ``train_freq`` steps. Alternatively pass a tuple of frequency and unit
    like ``(5, "step")`` or ``(2, "episode")``.
:param gradient_steps: How many gradient steps to do after each rollout (see ``train_freq``)
    Set to ``-1`` means to do as many gradient steps as steps done in the environment
    during the rollout.
:param action_noise: the action noise type (None by default), this can help
    for hard exploration problem. Cf common.noise for the different action noise type.
:param replay_buffer_class: Replay buffer class to use (for instance ``HerReplayBuffer``).
    If ``None``, it will be automatically selected.
:param replay_buffer_kwargs: Keyword arguments to pass to the replay buffer on creation.
:param optimize_memory_usage: Enable a memory efficient variant of the replay buffer
    at a cost of more complexity.
    See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
:param n_steps: When n_step > 1, uses n-step return (with the NStepReplayBuffer) when updating the Q-value network.
:param ent_coef: Entropy regularization coefficient. (Equivalent to
    inverse of reward scale in the original SAC paper.)  Controlling exploration/exploitation trade-off.
    Set it to 'auto' to learn it automatically (and 'auto_0.1' for using 0.1 as initial value)
:param target_update_interval: update the target network every ``target_network_update_freq``
    gradient steps.
:param target_entropy: target entropy when learning ``ent_coef`` (``ent_coef = 'auto'``)
:param use_sde: Whether to use generalized State Dependent Exploration (gSDE)
    instead of action noise exploration (default: False)
:param sde_sample_freq: Sample a new noise matrix every n steps when using gSDE
    Default: -1 (only sample at the beginning of the rollout)
:param use_sde_at_warmup: Whether to use gSDE instead of uniform sampling
    during the warm up phase (before learning starts)
:param stats_window_size: Window size for the rollout logging, specifying the number of episodes to average
    the reported success rate, mean episode length, and mean reward over
:param tensorboard_log: the log location for tensorboard (if None, no logging)
:param policy_kwargs: additional arguments to be passed to the policy on creation. See :ref:`sac_policies`
:param verbose: Verbosity level: 0 for no output, 1 for info messages (such as device or wrappers used), 2 for
    debug messages
:param seed: Seed for the pseudo random generators
:param device: Device (cpu, cuda, ...) on which the code should be run.
    Setting it to auto, the code will be run on the GPU if possible.
:param _init_setup_model: Whether or not to build the network at the creation of the instance
)r   r   r   policy_aliasespolicyactorcriticcritic_targetNenvlearning_ratebuffer_sizelearning_starts
batch_sizetaugamma
train_freqgradient_stepsaction_noisereplay_buffer_classreplay_buffer_kwargsoptimize_memory_usagen_stepsent_coeftarget_update_intervaltarget_entropyuse_sdesde_sample_frequse_sde_at_warmupstats_window_sizetensorboard_logpolicy_kwargsverboseseeddevice_init_setup_modelc                    > [         TU ]  UUUUUUUUU	U
UUUUUUUUUUUUUU[        R                  4SS9  UU l        S U l        UU l        UU l        S U l        U(       a  U R                  5         g g )NT)r-   r.   r/   r0   r9   r7   r8   r:   r<   r;   r4   r5   r6   supported_action_spacessupport_multi_env)
super__init__r   Boxr3   log_ent_coefr1   r2   ent_coef_optimizer_setup_model)selfr   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   	__class__s                                S/home/james-whalen/.local/lib/python3.13/site-packages/stable_baselines3/sac/sac.pyrB   SAC.__init__[   s    > 	 3!5"7'/++/%+ZZM"5 	 	
: -  !&<#;?     returnc                 P  > [         TU ]  5         U R                  5         [        U R                  S/5      U l        [        U R                  S/5      U l        U R                  S:X  aa  [        [        R                  " U R                  R                  R                  5      R                  [        R                   5      * 5      U l        O[        U R                  5      U l        [#        U R$                  [&        5      (       a  U R$                  R)                  S5      (       a  SnSU R$                  ;   a4  [        U R$                  R+                  S5      S   5      nUS:  d   S5       e[,        R.                  " [,        R0                  " SU R2                  S9U-  5      R5                  S	5      U l        [,        R8                  R;                  U R6                  /U R=                  S5      S
9U l        g [,        R@                  " [        U R$                  5      U R2                  S9U l!        g )Nrunning_auto      ?_   g        z4The initial value of ent_coef must be greater than 0)r<   T)lr)"rA   rF   _create_aliasesr   r!   batch_norm_statsr"   batch_norm_stats_targetr3   floatnpprodr#   action_spaceshapeastypefloat32
isinstancer1   str
startswithsplitthlogonesr<   requires_grad_rD   optimAdamlr_schedulerE   tensorent_coef_tensor)rG   
init_valuerH   s     rI   rF   SAC._setup_model   s    6t{{ZL Q'=d>P>PS]R^'_$&("'1F1F1L1L)M)T)TUWU_U_)`(`"aD #((;(;"<D
 dmmS))dmm.F.Fv.N.NJdmm#"4==#6#6s#;A#>?
!C'_)__' !#rwwq'E
'R S b bcg hD&(hhmmT5F5F4GDL\L\]^L_m&`D#
 $&99U4==-A$++#VD rK   c                     U R                   R                  U l        U R                   R                  U l        U R                   R                  U l        g N)r   r    r!   r"   )rG   s    rI   rT   SAC._create_aliases   s6    [[&&
kk((![[66rK   c                   ^ U R                   R                  S5        U R                  R                  U R                  R                  /nU R
                  b  X0R
                  /-  nU R                  U5        / / pT/ / pv[        U5       GH  nU R                  R                  X R                  S9n	U	R                  b  U	R                  OU R                  n
U R                  (       a  U R                  R                  5         U R                  R                  U	R                   5      u  pUR#                  SS5      nS nU R
                  b  U R$                  b  [&        R(                  " U R$                  R+                  5       5      n[-        U R.                  [0        5      (       d   eU R$                  XR.                  -   R+                  5       -  R3                  5       * nUR5                  UR7                  5       5        OU R8                  nUR5                  UR7                  5       5        UbQ  U R
                  bD  U R
                  R;                  5         UR=                  5         U R
                  R?                  5         [&        R@                  " 5          U R                  R                  U	RB                  5      u  nn[&        RD                  " U RG                  U	RB                  U5      SS9n[&        RH                  " USSS9u  nnUUUR#                  SS5      -  -
  nU	RJ                  SU	RL                  -
  U
-  U-  -   mS S S 5        U R	                  U	R                   U	RN                  5      nS[Q        U4S jU 5       5      -  n[-        U[&        RR                  5      (       d   eUR5                  UR7                  5       5        U R                  R                  R;                  5         UR=                  5         U R                  R                  R?                  5         [&        RD                  " U R	                  U	R                   U5      SS9n[&        RH                  " USSS9u  nnX-  U-
  R3                  5       nUR5                  UR7                  5       5        U R                  R                  R;                  5         UR=                  5         U R                  R                  R?                  5         XRT                  -  S	:X  d  GMy  [W        U R                  RY                  5       U RF                  RY                  5       U RZ                  5        [W        U R\                  U R^                  S
5        GM     U =R`                  U-  sl0        U Rb                  Re                  SU R`                  SS9  U Rb                  Re                  S[f        R2                  " U5      5        U Rb                  Re                  S[f        R2                  " U5      5        U Rb                  Re                  S[f        R2                  " U5      5        [i        U5      S	:  a1  U Rb                  Re                  S[f        R2                  " U5      5        g g ! , (       d  f       GNF= f)NT)r#   rR   )dim)rr   keepdimg      ?c              3   R   >#    U  H  n[         R                  " UT5      v   M     g 7frn   )Fmse_loss).0	current_qtarget_q_valuess     rI   	<genexpr>SAC.train.<locals>.<genexpr>  s!     #m\lyAJJy/$J$J\ls   $'r   rP   ztrain/n_updatestensorboard)excludeztrain/ent_coefztrain/actor_lossztrain/critic_lossztrain/ent_coef_loss)5r   set_training_moder    	optimizerr!   rE   _update_learning_raterangereplay_buffersample_vec_normalize_env	discountsr)   r4   reset_noiseaction_log_probobservationsreshaperD   rb   expdetachr^   r3   rW   meanappenditemrj   	zero_gradbackwardstepno_gradnext_observationscatr"   minrewardsdonesactionssumTensorr2   r   
parametersr(   rU   rV   
_n_updatesloggerrecordrX   len)rG   r+   r'   
optimizersent_coef_losses	ent_coefsactor_lossescritic_lossesgradient_stepreplay_datar   
actions_pilog_probent_coef_lossr1   next_actionsnext_log_probnext_q_valuesrQ   current_q_valuescritic_lossq_values_pi	min_qf_pi
actor_lossry   s                           @rI   train	SAC.train   s
   %%d+jj**DKK,A,AB
"".2233J 	"":.%'&("m">2M,,33JD[D[3\K1<1F1F1R--X\XbXbI ||

&&( $(::#=#=k>V>V#W J''A.H M&&2t7H7H7T 66$"3"3":":"<=!$"5"5u===="&"3"3xBUBU7U6]6]6_"_!e!e!g g&&}'9'9';<//X]]_- (T-D-D-P''113&&('',,..2jj.H.HIfIf.g+m "t'9'9+:W:WYe'flm n#%66-Q#M q -=;P;PQSUV;W0W W"-"5"5[=N=N9NR[8[^k8k"k   ${{;+C+C[EXEXY #m\l#m mmKk2995555  !1!1!34 KK!!++-  "KK!!&&(
 &&[-E-Ez!RXYZK66+1dCLIq"-	9??AJ
 12 JJ  **,!JJ  %%' :::a?dkk4468J8J8U8U8WY]YaYabd33T5Q5QSVW[ 3^ 	>),doo}U+RWWY-?@-rww|/DE.0FG!#KK4bggo6NO $e s   B+Y88
Z	rG   total_timestepscallbacklog_intervaltb_log_namereset_num_timestepsprogress_barc           	      (   > [         TU ]  UUUUUUS9$ )N)r   r   r   r   r   r   )rA   learn)rG   r   r   r   r   r   r   rH   s          rI   r   	SAC.learn0  s-     w}+%# 3%  
 	
rK   c                 *   > [         TU ]  5       / SQ-   $ )N)r    r!   r"   )rA   _excluded_save_params)rG   rH   s    rI   r   SAC._excluded_save_paramsB  s    w,.1UUUrK   c                 ^    / SQnU R                   b  S/nUR                  S5        X4$ S/nX4$ )N)r   zactor.optimizerzcritic.optimizerrD   rE   rj   )rE   r   )rG   state_dictssaved_pytorch_variabless      rI   _get_torch_save_paramsSAC._get_torch_save_paramsE  sJ    G"".'5&6#34 33 (9&9#33rK   )r    rU   rV   r!   r"   r1   rE   rj   rD   r3   r2   )ga2U0*3?i@B d      g{Gzt?gGz?rR   rR   NNNFrR   rO   rR   rO   Frq   Fr   NNr   NrO   T)rL   N)@   )N   r   TF)+__name__
__module____qualname____firstlineno____doc__r   r   r   r   r   dictr_   typer   __annotations__r   r   r   r   r   rW   r   inttupler   r   r
   r   boolrb   r<   rB   rF   rT   r   r   r   r   listr   r   __static_attributes____classcell__)rH   s   @rI   r   r      s+   ;| ,=NHT#tJ'7"789 
 L## 15$"23.2<@9=&+&,&',2!"'!$)-26"(."&;E c4	?*+E  63;E  UH_-	E 
 E  E  E  E  E  #uS#X./E  E  {+E  &d<&89E  'tCH~6E   $E   !E " U
##E $ !$%E & c5j)'E ( )E * +E ,  -E . /E 0 "#1E 2  S#X/3E 4 5E 6 sm7E 8 biin%9E :  ;E  E N!WF7
dPC dPS dP$ dPR #' $("


  
 	

 
 "
 
 

 
$VtCy V4d3ic.B(C 4 4rK   )'typingr   r   r   r   r   numpyrX   torchrb   	gymnasiumr   torch.nnr	   ru    stable_baselines3.common.buffersr
   stable_baselines3.common.noiser   -stable_baselines3.common.off_policy_algorithmr   !stable_baselines3.common.policiesr   r   %stable_baselines3.common.type_aliasesr   r   r   stable_baselines3.common.utilsr   r   stable_baselines3.sac.policiesr   r   r   r   r   r   r    rK   rI   <module>r      sM    : :    $ 9 6 L J Q Q P c c
)5
)y4
 y4rK   