
    ni`Y                         S r SSKJr  SSKJr  SSKJr  SSKr SSKJs  Jr	  SSKJr  SSKJr  SS	KJr   " S
 S\5      rg! \
 a    SSKr	 N*f = f)zProximal Policy Optimization algorithm.

Based on John Schulman's implementation in Python and Theano:
https://github.com/joschu/modular_rl/blob/master/modular_rl/ppo.py

class PPOAlgorithm(object):
  """A vectorized implementation of the PPO algorithm by John Schulman."""

  def __init__(self, batch_env, step, is_training, should_log, config):
    """Create an instance of the PPO algorithm.

    Args:
      batch_env: In-graph batch environment.
      step: Integer tensor holding the current training step.
      is_training: Boolean tensor for whether the algorithm should train.
      should_log: Boolean tensor for whether summaries should be returned.
      config: Object containing the agent configuration as attributes.
    """
    self._batch_env = batch_env
    self._step = step
    self._is_training = is_training
    self._should_log = should_log
    self._config = config
    # Streaming filters whiten observations and rescale rewards online.
    self._observ_filter = normalize.StreamingNormalize(
        self._batch_env.observ[0], center=True, scale=True, clip=5,
        name='normalize_observ')
    self._reward_filter = normalize.StreamingNormalize(
        self._batch_env.reward[0], center=False, scale=True, clip=10,
        name='normalize_reward')
    # Memory stores tuples of observ, action, mean, logstd, reward.
    template = (
        self._batch_env.observ[0], self._batch_env.action[0],
        self._batch_env.action[0], self._batch_env.action[0],
        self._batch_env.reward[0])
    self._memory = memory.EpisodeMemory(
        template, config.update_every, config.max_length, 'memory')
    self._memory_index = tf.Variable(0, False)
    use_gpu = self._config.use_gpu and utility.available_gpus()
    with tf.device('/gpu:0' if use_gpu else '/cpu:0'):
      # Create network variables for later calls to reuse.
      action_size = self._batch_env.action.shape[1].value
      self._network = tf.make_template(
          'network', functools.partial(config.network, config, action_size))
      output = self._network(
          tf.zeros_like(self._batch_env.observ)[:, None],
          tf.ones(len(self._batch_env)))
      with tf.variable_scope('ppo_temporary'):
        self._episodes = memory.EpisodeMemory(
            template, len(batch_env), config.max_length, 'episodes')
        if output.state is None:
          self._last_state = None
        else:
          # Ensure the batch dimension is set.
          tf.contrib.framework.nest.map_structure(
              lambda x: x.set_shape([len(batch_env)] + x.shape.as_list()[1:]),
              output.state)
          # pylint: disable=undefined-variable
          self._last_state = tf.contrib.framework.nest.map_structure(
              lambda x: tf.Variable(lambda: tf.zeros_like(x), False),
              output.state)
        self._last_action = tf.Variable(
            tf.zeros_like(self._batch_env.action), False, name='last_action')
        self._last_mean = tf.Variable(
            tf.zeros_like(self._batch_env.action), False, name='last_mean')
        self._last_logstd = tf.Variable(
            tf.zeros_like(self._batch_env.action), False, name='last_logstd')
    self._penalty = tf.Variable(
        self._config.kl_init_penalty, False, dtype=tf.float32)
    self._optimizer = self._config.optimizer(self._config.learning_rate)
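
  # A minimal sketch of the configuration attributes this class reads. The
  # attribute names come from the code in this module; the values below are
  # illustrative assumptions, not defaults shipped with this file:
  #
  #   config.update_every = 30        # episodes collected per training phase
  #   config.max_length = 1000        # maximum episode length in the memory
  #   config.use_gpu = False          # place network variables on /gpu:0
  #   config.network = <callable>     # builds the policy/value network
  #   config.optimizer = tf.train.AdamOptimizer
  #   config.learning_rate = 1e-4
  #   config.kl_init_penalty = 1.0
  #   # Also used below: discount, gae_lambda, update_epochs, kl_target,
  #   # kl_cutoff_factor, kl_cutoff_coef, weight_summaries,
  #   # train_on_agent_action.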
R?c                    [         R                  " S5         U R                  c  [         R                  " 5       nO![        R
                  " U R                  U5      nU R                  R                  U5      n[         R                  " X#/5         [         R                  " S5      sSSS5        sSSS5        $ ! , (       d  f       O= f SSS5        g! , (       d  f       g= f)zReset the recurrent states and stored episode.

Args:
  agent_indices: Tensor containing current batch indices.

Returns:
  Summary tensor.
zbegin_episode/N )
r'   
name_scoperL   no_opr	   reinit_nested_varsrJ   clearcontrol_dependenciesconstant)rZ   agent_indicesreset_statereset_buffers       r    begin_episodePPOAlgorithm.begin_episodee   s     
'	(				!hhj001A1A=Q^^))-8l""K#>?{{2 @? 
)	( @?? 
)	(	(s$   A7CB7$	C7
C	C
C c                 b  ^^^^ [         R                  " S5         U R                  R                  U5      nU R                  c  SnOB[         R
                  R                  R                  R                  U4S jU R                  5      nU R                  USS2S4   [         R                  " UR                  S   5      U5      m[         R                  " U R                  TR                  R                  U4S j5      mTR                  R!                  T5      SS2S4   m[         R                  " U R"                  UUU4S j[$        5      nU R                  c  [         R&                  " 5       nO,[(        R*                  " U R                  TR,                  T5      n[         R.                  " U[         R0                  " U R2                  TTSS2S4   5      [         R0                  " U R4                  TTR6                  SS2S4   5      [         R0                  " U R8                  TTR:                  SS2S4   5      /5         [         R<                  " TSS2S4   S5      [         R>                  " U5      4sSSS5        sSSS5        $ ! , (       d  f       O= f SSS5        g! , (       d  f       g= f)zCompute batch of actions and a summary for a batch of observation.

Args:
  agent_indices: Tensor containing current batch indices.
  observ: Tensor of a batch of observations for all agents.

Returns:
  Tuple of action batch tensor and summary tensor.
zperform/Nc                 2   > [         R                  " U T5      $ r&   )r'   gather)r   rl   s    r    r!   &PPOAlgorithm.perform.<locals>.<lambda>   s    "))A}B]r#   r   c                     > T R                   $ r&   )mean)ra   s   r    r!   rt      s	    r#   c                    > [         R                  R                  [         R                  R                  STR                  S S 2S4   5      [         R                  R                  S[         R
                  " TR                  S S 2S4   5      5      [         R                  R                  ST S S 2S4   5      [         R                  R                  ST5      /5      $ )Nrv   r   stdr:   logprob)r'   summarymerge	histogramrv   explogstd)r:   ry   ra   s   r    r!   rt      s    BJJ$4$4jj""66;;q!t+<=jj""5"&&q!t1D*EFjj""8VAqD\:jj""9g6	6 %r#   r:   ) r'   rf   r7   	transformrL   rM   rN   rO   rP   rG   rH   r   condr2   policysamplelog_probr3   strrg   r	   assign_nested_varsrK   rj   scatter_updaterQ   rR   rv   rS   r~   check_numericsidentity)	rZ   rl   r6   rK   rz   assign_stater:   ry   ra   s	    `    @@@r    performPPOAlgorithm.performw   s    
z	""",,V4f				!

$$))778]8<8H8HJ}}VAtG_bggfll1o.FNfwwt((&--*>*>@STf&&v.q!t4g


 
 g 
			!xxz11$2B2BFLLR_`""



D--}fQTl
K


DOO]FKK1<M
N


D--}fmmAqD>Q
R	$ 	   1x8"++g:NN	 	/ 
#	".	 	 	/ 
#	"	"s$   H#J >4J2	J 
J	J  
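
  # How these callbacks are assumed to be wired together (sketch): an
  # in-graph simulation loop outside this module drives one agent step
  # roughly like this. The driver itself is not defined in this file.
  #
  #   summary = algo.begin_episode(agent_indices)
  #   action, summary = algo.perform(agent_indices, observ)
  #   summary = algo.experience(agent_indices, observ, action, reward,
  #                             done, nextob)
  #   summary = algo.end_episode(agent_indices)  # for finished episodes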
J.c           	         ^ ^^^^ [         R                  " S5         [         R                  " T R                  UUUUU 4S j[        5      sSSS5        $ ! , (       d  f       g= f)a  Process the transition tuple of the current step.

When training, add the current transition tuple to the memory and update
the streaming statistics for observations and rewards. A summary string is
returned if requested at this step.

Args:
  agent_indices: Tensor containing current batch indices.
  observ: Batch tensor of observations.
  action: Batch tensor of actions.
  reward: Batch tensor of rewards.
  unused_done: Batch tensor of done flags.
  unused_nextob: Batch tensor of successor observations.

Returns:
  Summary tensor.
zexperience/c                  ,   > TR                  TTT T5      $ r&   )_define_experience)r:   rl   r6   r8   rZ   s   r    r!   )PPOAlgorithm.experience.<locals>.<lambda>   s    $))-Pr#   Nr'   rf   r   r2   r   )rZ   rl   r6   r:   r8   unused_doneunused_nextobs   `````  r    
experiencePPOAlgorithm.experience   s;    $ 
}	%WW



P
P
	 
&	%	%s   .A
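
  # Note on the pattern above: both branches of tf.cond must be callables
  # returning a string tensor, and the built-in `str` serves as the no-op
  # branch because calling str() yields the empty summary ''.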

  def _define_experience(self, agent_indices, observ, action, reward):
    """Implement the branch of experience() entered during training."""
    update_filters = tf.summary.merge([
        self._observ_filter.update(observ),
        self._reward_filter.update(reward)])
    with tf.control_dependencies([update_filters]):
      if self._config.train_on_agent_action:
        # NOTE: Doesn't seem to change much.
        action = self._last_action
      batch = (
          observ, action, tf.gather(self._last_mean, agent_indices),
          tf.gather(self._last_logstd, agent_indices), reward)
      append = self._episodes.append(batch, agent_indices)
    with tf.control_dependencies([append]):
      norm_observ = self._observ_filter.transform(observ)
      norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))
      # pylint: disable=g-long-lambda
      summary = tf.cond(self._should_log, lambda: tf.summary.merge([
          update_filters,
          self._observ_filter.summary(),
          self._reward_filter.summary(),
          tf.summary.scalar('memory_size', self._memory_index),
          tf.summary.histogram('normalized_observ', norm_observ),
          tf.summary.histogram('action', self._last_action),
          tf.summary.scalar('normalized_reward', norm_reward)]), str)
      return summary
F7c                    ^ ^ [         R                  " S5         [         R                  " T R                  UU 4S j[        5      sSSS5        $ ! , (       d  f       g= f)a  Add episodes to the memory and perform update steps if memory is full.

During training, add the collected episodes of the batch indices that
finished their episode to the memory. If the memory is full, train on it,
and then clear the memory. A summary string is returned if requested at
this step.

Args:
  agent_indices: Tensor containing current batch indices.

Returns:
   Summary tensor.
zend_episode/c                  &   > TR                  T 5      $ r&   )_define_end_episode)rl   rZ   s   r    r!   *PPOAlgorithm.end_episode.<locals>.<lambda>   s    0H0H0Wr#   Nr   )rZ   rl   s   ``r    end_episodePPOAlgorithm.end_episode   s5     
~	&WWT&&(WY\] 
'	&	&s   +A
Ac                    U R                   R                  U5      u  p#U R                  R                  U R                  -
  n[
        R                  " [
        R                  " [
        R                  " U5      S   U5      5      nU Vs/ s H  n[
        R                  " Xe5      PM     nnU R                  R                  U[
        R                  " X55      XPR                  -   5      n[
        R                  " U/5         U R                  R                  [
        R                  " U5      S   5      nSSS5        [
        R                  " W/5         U R                  U R                  R                  :  n	[
        R                  " XR                  [         5      sSSS5        $ s  snf ! , (       d  f       N|= f! , (       d  f       g= f)z>Implement the branch of end_episode() entered during training.r   N)rJ   datar4   r<   r?   r'   rangeminimumr   rs   r>   replacerj   
assign_addr   	_trainingr   )
rZ   rl   r   length
space_leftuse_episodeselemr   	inc_indexmemory_fulls
             r    r    PPOAlgorithm._define_end_episode   s8   ~~**=9H**T-?-??J88BJJrxx'>q'A:NOL:BC($		$-(HC\\!!(BIIf,K".1C1C"CEF		 	 &	*$$//0Fq0IJi 
+		 	 )	-&&$,,*C*CCkWW[..#6 
.	- D 
+	*	-	-s    F!=3F&AF7&
F47

  def _training(self):
    """Perform multiple training iterations of both policy and value baseline.

    Training on the episodes collected in the memory. Reset the memory
    afterwards. Always returns a summary string.

    Returns:
      Summary tensor.
    """
    with tf.name_scope('training'):
      assert_full = tf.assert_equal(
          self._memory_index, self._config.update_every)
      with tf.control_dependencies([assert_full]):
        data = self._memory.data()
      (observ, action, old_mean, old_logstd, reward), length = data
      with tf.control_dependencies([tf.assert_greater(length, 0)]):
        length = tf.identity(length)
      observ = self._observ_filter.transform(observ)
      reward = self._reward_filter.transform(reward)
      update_summary = self._perform_update_steps(
          observ, action, old_mean, old_logstd, reward, length)
      with tf.control_dependencies([update_summary]):
        penalty_summary = self._adjust_penalty(
            observ, old_mean, old_logstd, length)
      with tf.control_dependencies([penalty_summary]):
        clear_memory = tf.group(
            self._memory.clear(), self._memory_index.assign(0))
      with tf.control_dependencies([clear_memory]):
        weight_summary = utility.variable_summaries(
            tf.trainable_variables(), self._config.weight_summaries)
        return tf.summary.merge(
            [update_summary, penalty_summary, weight_summary])

  def _perform_update_steps(
      self, observ, action, old_mean, old_logstd, reward, length):
    """Perform multiple update steps of value function and policy.

    The advantage is computed once at the beginning and shared across
    iterations. We need to decide for the summary of one iteration, and thus
    choose the one after half of the iterations.

    Args:
      observ: Sequences of observations.
      action: Sequences of actions.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      reward: Sequences of rewards.
      length: Batch of sequence lengths.

    Returns:
      Summary tensor.
    """
    return_ = utility.discounted_return(
        reward, length, self._config.discount)
    value = self._network(observ, length).value
    if self._config.gae_lambda:
      advantage = utility.lambda_return(
          reward, value, length, self._config.discount,
          self._config.gae_lambda)
    else:
      advantage = return_ - value
    # Normalize advantages over the whole batch of sequences.
    mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
    advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
    advantage = tf.Print(
        advantage, [tf.reduce_mean(return_), tf.reduce_mean(value)],
        'return and value: ')
    advantage = tf.Print(
        advantage, [tf.reduce_mean(advantage)],
        'normalized advantage: ')
    # pylint: disable=g-long-lambda
    value_loss, policy_loss, summary = tf.scan(
        lambda _1, _2: self._update_step(
            observ, action, old_mean, old_logstd, reward, advantage, length),
        tf.range(self._config.update_epochs),
        [0., 0., ''], parallel_iterations=1)
    print_losses = tf.group(
        tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
        tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
    with tf.control_dependencies([value_loss, policy_loss, print_losses]):
      return summary[self._config.update_epochs // 2]

  def _update_step(
      self, observ, action, old_mean, old_logstd, reward, advantage, length):
    """Compute the current combined loss and perform a gradient update step.

    Args:
      observ: Sequences of observations.
      action: Sequences of actions.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      reward: Sequences of reward.
      advantage: Sequences of advantages.
      length: Batch of sequence lengths.

    Returns:
      Tuple of value loss, policy loss, and summary tensor.
    """
    value_loss, value_summary = self._value_loss(observ, reward, length)
    network = self._network(observ, length)
    policy_loss, policy_summary = self._policy_loss(
        network.mean, network.logstd, old_mean, old_logstd, action,
        advantage, length)
    value_gradients, value_variables = zip(
        *self._optimizer.compute_gradients(value_loss))
    policy_gradients, policy_variables = zip(
        *self._optimizer.compute_gradients(policy_loss))
    all_gradients = value_gradients + policy_gradients
    all_variables = value_variables + policy_variables
    optimize = self._optimizer.apply_gradients(
        zip(all_gradients, all_variables))
    summary = tf.summary.merge([
        value_summary, policy_summary,
        tf.summary.scalar(
            'value_gradient_norm', tf.global_norm(value_gradients)),
        tf.summary.scalar(
            'policy_gradient_norm', tf.global_norm(policy_gradients)),
        utility.gradient_summaries(
            zip(value_gradients, value_variables), dict(value=r'.*')),
        utility.gradient_summaries(
            zip(policy_gradients, policy_variables), dict(policy=r'.*'))])
    with tf.control_dependencies([optimize]):
      return [tf.identity(x) for x in (value_loss, policy_loss, summary)]

  def _value_loss(self, observ, reward, length):
    """Compute the loss function for the value baseline.

    The value loss is the difference between empirical and approximated returns
    over the collected episodes. Returns the loss tensor and a summary string.

    Args:
      observ: Sequences of observations.
      reward: Sequences of reward.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
    with tf.name_scope('value_loss'):
      value = self._network(observ, length).value
      return_ = utility.discounted_return(
          reward, length, self._config.discount)
      advantage = return_ - value
      value_loss = 0.5 * self._mask(advantage ** 2, length)
      summary = tf.summary.merge([
          tf.summary.histogram('value_loss', value_loss),
          tf.summary.scalar('avg_value_loss', tf.reduce_mean(value_loss))])
      value_loss = tf.reduce_mean(value_loss)
      return tf.check_numerics(value_loss, 'value_loss'), summary

  def _policy_loss(
      self, mean, logstd, old_mean, old_logstd, action, advantage, length):
    """Compute the policy loss composed of multiple components.

    1. The policy gradient loss is importance sampled from the data-collecting
       policy at the beginning of training.
    2. The second term is a KL penalty between the policy at the beginning of
       training and the current policy.
    3. Additionally, if this KL already changed more than twice the target
       amount, we activate a strong penalty discouraging further divergence.

    Args:
      mean: Sequences of action means of the current policy.
      logstd: Sequences of action log stddevs of the current policy.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      action: Sequences of actions.
      advantage: Sequences of advantages.
      length: Batch of sequence lengths.

    Returns:
      Tuple of loss tensor and summary tensor.
    """
    with tf.name_scope('policy_loss'):
      entropy = utility.diag_normal_entropy(mean, logstd)
      kl = tf.reduce_mean(
          self._mask(
              utility.diag_normal_kl(old_mean, old_logstd, mean, logstd),
              length), 1)
      # Importance ratio between the current and the behavioral policy.
      policy_gradient = tf.exp(
          utility.diag_normal_logpdf(mean, logstd, action) -
          utility.diag_normal_logpdf(old_mean, old_logstd, action))
      surrogate_loss = -tf.reduce_mean(
          self._mask(
              policy_gradient * tf.stop_gradient(advantage), length), 1)
      kl_penalty = self._penalty * kl
      cutoff_threshold = self._config.kl_target * self._config.kl_cutoff_factor
      cutoff_count = tf.reduce_sum(
          tf.cast(kl > cutoff_threshold, tf.int32))
      with tf.control_dependencies([tf.cond(
          cutoff_count > 0,
          lambda: tf.Print(0, [cutoff_count], 'kl cutoff! '), int)]):
        kl_cutoff = (
            self._config.kl_cutoff_coef *
            tf.cast(kl > cutoff_threshold, tf.float32) *
            (kl - cutoff_threshold) ** 2)
      policy_loss = surrogate_loss + kl_penalty + kl_cutoff
      summary = tf.summary.merge([
          tf.summary.histogram('entropy', entropy),
          tf.summary.histogram('kl', kl),
          tf.summary.histogram('surrogate_loss', surrogate_loss),
          tf.summary.histogram('kl_penalty', kl_penalty),
          tf.summary.histogram('kl_cutoff', kl_cutoff),
          tf.summary.histogram('kl_penalty_combined', kl_penalty + kl_cutoff),
          tf.summary.histogram('policy_loss', policy_loss),
          tf.summary.scalar('avg_surr_loss', tf.reduce_mean(surrogate_loss)),
          tf.summary.scalar('avg_kl_penalty', tf.reduce_mean(kl_penalty)),
          tf.summary.scalar('avg_policy_loss', tf.reduce_mean(policy_loss))])
      policy_loss = tf.reduce_mean(policy_loss, 0)
      return tf.check_numerics(policy_loss, 'policy_loss'), summary

  def _adjust_penalty(self, observ, old_mean, old_logstd, length):
    """Adjust the KL penalty between the behavioral and current policy.

    Compute how much the policy actually changed during the multiple
    update steps. Adjust the penalty strength for the next training phase if we
    overshot or undershot the target divergence too much.

    Args:
      observ: Sequences of observations.
      old_mean: Sequences of action means of the behavioral policy.
      old_logstd: Sequences of action log stddevs of the behavioral policy.
      length: Batch of sequence lengths.

    Returns:
      Summary tensor.
    """
    with tf.name_scope('adjust_penalty'):
      network = self._network(observ, length)
      assert_change = tf.assert_equal(
          tf.reduce_all(tf.equal(network.mean, old_mean)), False,
          message='policy should change')
      print_penalty = tf.Print(0, [self._penalty], 'current penalty: ')
      with tf.control_dependencies([assert_change, print_penalty]):
        kl_change = tf.reduce_mean(self._mask(
            utility.diag_normal_kl(
                old_mean, old_logstd, network.mean, network.logstd),
            length))
        kl_change = tf.Print(kl_change, [kl_change], 'kl change: ')
        maybe_increase = tf.cond(
            kl_change > 1.3 * self._config.kl_target,
            # pylint: disable=g-long-lambda
            lambda: tf.Print(self._penalty.assign(
                self._penalty * 1.5), [0], 'increase penalty '),
            float)
        maybe_decrease = tf.cond(
            kl_change < 0.7 * self._config.kl_target,
            # pylint: disable=g-long-lambda
            lambda: tf.Print(self._penalty.assign(
                self._penalty / 1.5), [0], 'decrease penalty '),
            float)
      with tf.control_dependencies([maybe_increase, maybe_decrease]):
        return tf.summary.merge([
            tf.summary.scalar('kl_change', kl_change),
            tf.summary.scalar('penalty', self._penalty)])

  def _mask(self, tensor, length):
    """Set padding elements of a batch of sequences to zero.

    Useful to then safely sum along the time dimension.

    Args:
      tensor: Tensor of sequences.
      length: Batch of sequence lengths.

    Returns:
      Masked sequences.
    """
    with tf.name_scope('mask'):
      range_ = tf.range(tensor.shape[1].value)
      mask = tf.cast(range_[None, :] < length[:, None], tf.float32)
      masked = tensor * mask
      return tf.check_numerics(masked, 'masked')