
    niJ`                         S r SSKJr  SSKJr  SSKJr  SSKrSSKJs  Jr	  SSK
Jr  SSK
Jr  SS	K
Jr  \R                  " S
S5      r " S S\5      rg)zProximal Policy Optimization algorithm.

Based on John Schulman's implementation in Python and Theano:
https://github.com/joschu/modular_rl/blob/master/modular_rl/ppo.py
    )absolute_import)division)print_functionN   )memory)	normalize)utilityNetworkOutputz"policy, mean, logstd, value, statec                       \ rS rSrSrS rS rS rS rS r	S r
S	 rS
 rS rS rS rS rS rS rS rS rSS jrSrg)PPOAlgorithm#   zBA vectorized implementation of the PPO algorithm by John Schulman.c           	         Xl         X l        X0l        X@l        XPl        [
        R                  " U R                   R                  S   SSSSS9U l        [
        R                  " U R                   R                  S   SSSSS9U l
        U R                   R                  S   U R                   R                  S   U R                   R                  S   U R                   R                  S   U R                   R                  S   4n[        R                  " XeR                  UR                  S	5      U l        ["        R$                  " SS5      U l        U R                  R(                  =(       a    [*        R,                  " 5       n["        R.                  " U(       a  S
OS5         U R1                  ["        R2                  " U R                   R                  5      SS2S4   ["        R4                  " [7        U R                   5      5      SS9  U R                  R9                  U R                   R                  R:                  S   R<                  5      n["        R>                  " S5         [        R                  " U[7        U5      UR                  S5      U l         [*        RB                  " URE                  [7        U5      ["        RF                  5      5      U l$        ["        R$                  " ["        R2                  " U R                   R                  5      SSS9U l%        ["        R$                  " ["        R2                  " U R                   R                  5      SSS9U l&        ["        R$                  " ["        R2                  " U R                   R                  5      SSS9U l'        SSS5        SSS5        ["        R$                  " U R                  RP                  S["        RF                  S9U l)        U R                  RU                  U R                  RV                  SS9U l,        U R                  R[                  U R                  R\                  SS9U l/        g! , (       d  f       N= f! , (       d  f       N= f)ac  Create an instance of the PPO algorithm.

Args:
  batch_env: In-graph batch environment.
  step: Integer tensor holding the current training step.
  is_training: Boolean tensor for whether the algorithm should train.
  should_log: Boolean tensor for whether summaries should be returned.
  config: Object containing the agent configuration as attributes.
r   T   normalize_observ)centerscaleclipnameF
   normalize_rewardr   /gpu:0/cpu:0Nreuser   ppo_temporaryepisodeslast_action)r   	last_meanlast_logstd)dtypepolicy_optimizervalue_optimizer)0
_batch_env_step_is_training_should_log_configr   StreamingNormalizeobserv_observ_filterreward_reward_filteractionr   EpisodeMemoryupdate_every
max_length_memorytfVariable_memory_indexuse_gpur	   available_gpusdevice_network
zeros_likeoneslennetworkshapevaluevariable_scope	_episodescreate_nested_vars
zero_statefloat32_last_state_last_action
_last_mean_last_logstdkl_init_penalty_penaltyr!   	policy_lr_policy_optimizerr"   value_lr_value_optimizer)	self	batch_envstepis_training
should_logconfigtemplater5   cells	            e/home/james-whalen/.local/lib/python3.13/site-packages/pybullet_envs/minitaur/agents/ppo/algorithm.py__init__PPOAlgorithm.__init__&   s;     OJ#!L#66t7M7Ma7P>B=A<=<N	PD
 $66t7M7Ma7P>C=A<><N	PD &&q)4??+A+A!+DdooF\F\]^F_&&q)4??+A+A!+DFH''2E2EvGXGXZbcDLQ.Dll""?w'='='?G	w8H	5
mmBMM$//"8"89!T'BGGC01    \\!!$//"8"8">">q"A"G"GHd_---hIHYHY.8:"55dooc)nVXV`V`6abKKdoo6L6L(M(--:< ++bmmDOO4J4J&K&++68 KKdoo6L6L(M(--:< . 
6& KK < <e2::VDM!\\::4<<;Q;Q@R ; TD LL889N9N>O 9 QD! .- 
6	5s&   CQ%#D=Q Q%
Q"	Q%%
Q3c                    [         R                  " S5         [        R                  " U R                  U5      nU R
                  R                  U5      n[         R                  " X#/5         [         R                  " S5      sSSS5        sSSS5        $ ! , (       d  f       O= f SSS5        g! , (       d  f       g= f)zReset the recurrent states and stored episode.

Args:
  agent_indices: 1D tensor of batch indices for agents starting an episode.

Returns:
  Summary tensor.
zbegin_episode/ N)	r2   
name_scoper	   reinit_nested_varsrD   r@   clearcontrol_dependenciesconstant)rN   agent_indicesreset_statereset_buffers       rV   begin_episodePPOAlgorithm.begin_episode^   s     
'	(..t/?/?Ok^^))-8l""K#>?{{2 @? 
)	( @?? 
)	(	(s$   AB/+B	B/
B"	B//
B=c                 Z  ^^^ [         R                  " S5         U R                  R                  U5      nU R	                  USS2S4   [         R
                  " UR                  S   5      U R                  5      m[         R                  " U R                  TR                  R                  U4S j5      mTR                  R                  T5      SS2S4   m[         R                  " U R                  UUU4S j[        5      n[         R                  " [         R"                  " U R                  TR$                  5      U R&                  R)                  TSS2S4   5      U R*                  R)                  TR,                  SS2S4   5      U R.                  R)                  TR0                  SS2S4   5      /5         [         R2                  " TSS2S4   S5      [         R4                  " U5      4sSSS5        sSSS5        $ ! , (       d  f       O= f SSS5        g! , (       d  f       g= f)zCompute batch of actions and a summary for a batch of observation.

Args:
  observ: Tensor of a batch of observations for all agents.

Returns:
  Tuple of action batch tensor and summary tensor.
zperform/Nr   c                     > T R                   $ N)mean)r<   s   rV   <lambda>&PPOAlgorithm.perform.<locals>.<lambda>y   s	        c                    > [         R                  R                  [         R                  R                  STR                  S S 2S4   5      [         R                  R                  S[         R
                  " TR                  S S 2S4   5      5      [         R                  R                  ST S S 2S4   5      [         R                  R                  ST5      /5      $ )Nrh   r   stdr-   logprob)r2   summarymerge	histogramrh   explogstd)r-   rn   r<   s   rV   ri   rj   }   s    BJJ$4$4jj""67<<1+=>jj""5"&&11E*FGjj""8VAqD\:jj""9g6	6 %rk   r-   )r2   r[   r*   	transformr8   r:   r=   rD   condr%   policysamplelog_probr&   strr^   r	   assign_nested_varsstaterE   assignrF   rh   rG   rs   check_numericsidentity)rN   r)   ro   r-   rn   r<   s      @@@rV   performPPOAlgorithm.performm   s    
z	""",,V4ffQWorwwv||A/GIYIYZgwwt(('..*?*?AUVf''/15g


 
 g ""

$
$T%5%5w}}
E



"
"6!Q$<
0
//
 
 ad!3
4



"
"7>>!Q$#7
8	$ 	   1x8"++g:NN	 	 
#	"	 	 	 
#	"	"s$   F H:4H.	H
H	H
H*c                    ^ ^^^ [         R                  " S5         [         R                  " T R                  UUUU 4S j[        5      sSSS5        $ ! , (       d  f       g= f)a  Process the transition tuple of the current step.

When training, add the current transition tuple to the memory and update
the streaming statistics for observations and rewards. A summary string is
returned if requested at this step.

Args:
  observ: Batch tensor of observations.
  action: Batch tensor of actions.
  reward: Batch tensor of rewards.
  unused_done: Batch tensor of done flags.
  unused_nextob: Batch tensor of successor observations.

Returns:
  Summary tensor.
zexperience/c                  *   > TR                  TT T5      $ rg   )_define_experience)r-   r)   r+   rN   s   rV   ri   )PPOAlgorithm.experience.<locals>.<lambda>   s    0G0GPVX^0_rk   Nr2   r[   ru   r%   ry   )rN   r)   r-   r+   unused_doneunused_nextobs   ````  rV   
experiencePPOAlgorithm.experience   s7    " 
}	%WWT&&(_ 
&	%	%s   -A
A c           	      p  ^ ^^^	 [         R                  R                  T R                  R	                  U5      T R
                  R	                  U5      /5      m	[         R                  " T	/5         T R                  R                  (       a  T R                  nXT R                  T R                  U4nT R                  R                  U[         R                  " [        T R                   5      5      5      nSSS5        [         R                  " W/5         T R                  R#                  U5      m[         R$                  " T R
                  R#                  U5      5      m[         R&                  " T R(                  UUU U	4S j[*        5      nUsSSS5        $ ! , (       d  f       N= f! , (       d  f       g= f)z=Implement the branch of experience() entered during training.Nc                    > [         R                  R                  TTR                  R                  5       TR                  R                  5       [         R                  R                  STR                  5      [         R                  R                  ST 5      [         R                  R                  STR                  5      [         R                  R                  ST5      /5      $ )Nmemory_sizenormalized_observr-   normalized_reward)	r2   ro   rp   r*   r,   scalarr4   rq   rE   )norm_observnorm_rewardrN   update_filterss   rV   ri   1PPOAlgorithm._define_experience.<locals>.<lambda>   s    BJJ$4$4!!))+!!))+jjt/A/ABjj""#6Djj""8T->->?jj 3[A6 %rk   )r2   ro   rp   r*   updater,   r^   r'   train_on_agent_actionrE   rF   rG   r@   appendranger;   r#   rt   reduce_meanru   r&   ry   )
rN   r)   r-   r+   batchr   ro   r   r   r   s
   `      @@@rV   r   PPOAlgorithm._define_experience   s;   ZZ%%				#	#F	+				#	#F	+	-.N 
	 	 .!1	2		+	+""doot/@/@&He~~$$UBHHS5I,JKf 
3 
	 	 &	*''11&9kNN4#6#6#@#@#HIk


  	g  
+	* 
3	2 
+	*s   /BFA9F'
F$'
F5c                    ^ ^ [         R                  " S5         [         R                  " T R                  UU 4S j[        5      sSSS5        $ ! , (       d  f       g= f)a  Add episodes to the memory and perform update steps if memory is full.

During training, add the collected episodes of the batch indices that
finished their episode to the memory. If the memory is full, train on it,
and then clear the memory. A summary string is returned if requested at
this step.

Args:
  agent_indices: 1D tensor of batch indices for agents starting an episode.

Returns:
   Summary tensor.
zend_episode/c                  &   > TR                  T 5      $ rg   )_define_end_episode)r`   rN   s   rV   ri   *PPOAlgorithm.end_episode.<locals>.<lambda>   s    0H0H0Wrk   Nr   )rN   r`   s   ``rV   end_episodePPOAlgorithm.end_episode   s5     
~	&WWT&&(WY\] 
'	&	&s   +A
Ac                    U R                   R                  U5      u  p#U R                  R                  U R                  -
  n[
        R                  " [
        R                  " [
        R                  " U5      S   U5      5      nU Vs/ s H  n[
        R                  " Xe5      PM     nnU R                  R                  U[
        R                  " X55      XPR                  -   5      n[
        R                  " U/5         U R                  R                  [
        R                  " U5      S   5      nSSS5        [
        R                  " W/5         U R                  U R                  R                  :  n	[
        R                  " XR                  [         5      sSSS5        $ s  snf ! , (       d  f       N|= f! , (       d  f       g= f)z>Implement the branch of end_episode() entered during training.r   N)r@   datar'   r/   r4   r2   r   minimumr=   gatherr1   replacer^   
assign_addru   	_trainingry   )
rN   r`   r   length
space_leftuse_episodeselemr   	inc_indexmemory_fulls
             rV   r    PPOAlgorithm._define_end_episode   s8   ~~**=9H**T-?-??J88BJJrxx'>q'A:NOL:BC($		$-(HC\\!!(BIIf,K".1C1C"CEF		 	 &	*$$//0Fq0IJi 
+		 	 )	-&&$,,*C*CCkWW[..#6 
.	- D 
+	*	-	-s    F!=3F&AF7&
F47
Gc           	         [         R                  " S5         [         R                  " U R                  U R                  R
                  5      n[         R                  " U/5         U R                  R                  5       nSSS5        Wu  u  p4pVpx[         R                  " [         R                  " US5      /5         [         R                  " U5      nSSS5        U R                  R                  U5      nU R                  R                  U5      nU R                  X4XVXx5      n	[         R                  " U	/5         U R                  X7U5      n
SSS5        [         R                  " W
/5         U R!                  X5Xh5      nSSS5        [         R                  " W/5         [         R"                  " U R                  R%                  5       U R                  R'                  S5      5      nSSS5        [         R                  " W/5         [(        R*                  " [         R,                  " 5       U R                  R.                  5      n[         R0                  R3                  XX/5      sSSS5        sSSS5        $ ! , (       d  f       GN = f! , (       d  f       GN= f! , (       d  f       GNV= f! , (       d  f       GN6= f! , (       d  f       N= f! , (       d  f       O= f SSS5        g! , (       d  f       g= f)zPerform multiple training iterations of both policy and value baseline.

Training on the episodes collected in the memory. Reset the memory
afterwards. Always returns a summary string.

Returns:
  Summary tensor.
trainingNr   )r2   r[   assert_equalr4   r'   r/   r^   r1   r   assert_greaterr~   r*   rt   r,   _update_policy_update_value_adjust_penaltygroupr]   r|   r	   variable_summariestrainable_variablesweight_summariesro   rp   )rN   assert_fullr   r)   r-   old_mean
old_logstdr+   r   policy_summaryvalue_summarypenalty_summaryclear_memoryweight_summarys                 rV   r   PPOAlgorithm._training   s    
z	"OOD$6$68Q8QRk""K=1||  " 2?C<4vxV""B$5$5fa$@#ABV$ C"",,V4f"",,V4f**68QW`n""N#34**66B 5""M?3..vT 4""O#45xx 2 2 4d6H6H6O6OPQ6RS 6""L>2 33B4J4J4L48LL4Q4QSzz `a 32! 
#	"11 CB
 543355222! 
#	"	"s   AK,$I8?<K,;J
A(K,:JK,,J.?K,A	K 'K,AK%	K,8
J	K,

J	K,
J+	&K,.
J=	8K, 
K	
K,
K	K,,
K:c                 
  ^ ^^^ [         R                  " S5         [         R                  " UUUU 4S j[         R                  " T R                  R
                  5      SS/SS9u  pE[         R                  " S[         R                  " U5      /S5      n[         R                  " XF/5         UT R                  R
                  S	-     sS
S
S
5        sS
S
S
5        $ ! , (       d  f       O= f S
S
S
5        g
! , (       d  f       g
= f)a,  Perform multiple update steps of the value baseline.

We need to decide for the summary of one iteration, and thus choose the one
after half of the iterations.

Args:
  observ: Sequences of observations.
  reward: Sequences of reward.
  length: Batch of sequence lengths.

Returns:
  Summary tensor.
update_valuec                 *   > TR                  TTT5      $ rg   )_update_value_step)_1_2r   r)   r+   rN   s     rV   ri   ,PPOAlgorithm._update_value.<locals>.<lambda>  s    T-D-DVVU[-\rk           rZ   r   parallel_iterationsr   zvalue loss:    N)	r2   r[   scanr   r'   update_epochs_valuePrintr   r^   )rN   r)   r+   r   lossro   
print_losss   ````   rV   r   PPOAlgorithm._update_value   s     
~	&gg\ hht||'G'GH2r(235md 88At 45~Fj""D#56t||771<= 76 
'	&
 766 
'	&	&s$   BC4*C	C4
C'	#C44
Dc                 f   U R                  XU5      u  pE[        U R                  R                  U5      6 u  pgU R                  R	                  [        Xg5      5      n[
        R                  R                  U[
        R                  R                  S[
        R                  " U5      5      [        R                  " [        Xg5      [        SS95      /5      n[
        R                  " U/5         [
        R                  " U5      [
        R                  " U5      /sSSS5        $ ! , (       d  f       g= f)zCompute the current value loss and perform a gradient update step.

Args:
  observ: Sequences of observations.
  reward: Sequences of reward.
  length: Batch of sequence lengths.

Returns:
  Tuple of loss tensor and summary tensor.
gradient_norm.*)r>   N)_value_lossziprM   compute_gradientsapply_gradientsr2   ro   rp   r   global_normr	   gradient_summariesdictr^   r~   )	rN   r)   r+   r   r   ro   	gradients	variablesoptimizes	            rV   r   PPOAlgorithm._update_value_step  s     $$VV<MD!6!6!H!H!NOI$$44S5NOHjj


/2>>)+DE""3y#<d>OP  G
 
	 	 (	,kk$W!56 
-	,	,s   ,,D""
D0c           
      h   [         R                  " S5         U R                  X5      R                  n[        R
                  " X#U R                  R                  5      nXT-
  nSU R                  US-  U5      -  n[         R                  R                  [         R                  R                  SU5      [         R                  R                  S[         R                  " U5      5      /5      n[         R                  " U5      n[         R                  " US5      U4sSSS5        $ ! , (       d  f       g= f)an  Compute the loss function for the value baseline.

The value loss is the difference between empirical and approximated returns
over the collected episodes. Returns the loss tensor and a summary strin.

Args:
  observ: Sequences of observations.
  reward: Sequences of reward.
  length: Batch of sequence lengths.

Returns:
  Tuple of loss tensor and summary tensor.

value_lossg      ?r   avg_value_lossN)r2   r[   r8   r>   r	   discounted_returnr'   discount_maskro   rp   rq   r   r   r}   )	rN   r)   r+   r   r>   return_	advantager   ro   s	            rV   r   PPOAlgorithm._value_loss%  s     
|	$mmF+11e))&$,,:O:OPg/iIqL&99j

  
**

|Z
8
**

,bnnZ.H
I" 	g >>*-jz<8'A 
%	$	$s   DD##
D1c           
        ^ ^^^^^^ [         R                  " S5         [        R                  " UTT R                  R
                  5      nT R                  TT5      R                  nT R                  R                  (       aB  [        R                  " XXTT R                  R
                  T R                  R                  5      mOXx-
  m[         R                  R                  TSS/SS9u  pTU	-
  [         R                  " U
5      S-   -  m[         R                  " T[         R                  " U5      [         R                  " U5      /S5      m[         R                  " T[         R                  " T5      /S5      m[         R                  " UUUUUUU 4S	 j[         R                   " T R                  R"                  5      S
S/SS9u  p[         R                  " S[         R                  " U5      /S5      n[         R$                  " X/5         UT R                  R"                  S-     sSSS5        sSSS5        $ ! , (       d  f       O= f SSS5        g! , (       d  f       g= f)a  Perform multiple update steps of the policy.

The advantage is computed once at the beginning and shared across
iterations. We need to decide for the summary of one iteration, and thus
choose the one after half of the iterations.

Args:
  observ: Sequences of observations.
  action: Sequences of actions.
  old_mean: Sequences of action means of the behavioral policy.
  old_logstd: Sequences of action log stddevs of the behavioral policy.
  reward: Sequences of rewards.
  length: Batch of sequence lengths.

Returns:
  Summary tensor.
update_policyr   r   T)axes	keep_dimsg:0yE>zreturn and value: znormalized advantage: c                 0   > TR                  TTTTTT5      $ rg   )_update_policy_step)	r   r   r-   r   r   r)   r   r   rN   s	     rV   ri   -PPOAlgorithm._update_policy.<locals>.<lambda>_  s    T-E-E
&(J	6.Crk   r   rZ   r   zpolicy loss: r   N)r2   r[   r	   r   r'   r   r8   r>   
gae_lambdalambda_returnnnmomentssqrtr   r   r   r   update_epochs_policyr^   )rN   r)   r-   r   r   r+   r   r   r>   rh   variancer   ro   r   r   s   ````` `       @rV   r   PPOAlgorithm._update_policy?  s   $ 
	'))&&$,,:O:OPgmmFF+11e		 	 ))&AVAV*.,,*A*AC	 O	uu}}YaVt}Lndt#(9D(@Ai((
bnnW-r~~e/DEG[]i((9r~~i'@&AC[\igg C C hht||'H'HIBPR8235md 88At 45Gj""D#56t||88A=> 76' 
(	'& 766' 
(	'	's$   G,I
H9&	I9
I	I
I"c                    U R                  X5      nU R                  UR                  UR                  X4UXV5      u  p[	        U R
                  R                  U5      6 u  pU R
                  R                  [	        X5      5      n[        R                  R                  U	[        R                  R                  S[        R                  " U
5      5      [        R                  " [	        X5      [        SS95      /5      n	[        R                   " U/5         [        R"                  " U5      [        R"                  " U	5      /sSSS5        $ ! , (       d  f       g= f)a  Compute the current policy loss and perform a gradient update step.

Args:
  observ: Sequences of observations.
  action: Sequences of actions.
  old_mean: Sequences of action means of the behavioral policy.
  old_logstd: Sequences of action log stddevs of the behavioral policy.
  advantage: Sequences of advantages.
  length: Batch of sequence lengths.

Returns:
  Tuple of loss tensor and summary tensor.
r   r   )rv   N)r8   _policy_lossrh   rs   r   rK   r   r   r2   ro   rp   r   r   r	   r   r   r^   r~   )rN   r)   r-   r   r   r   r   r<   r   ro   r   r   r   s                rV   r    PPOAlgorithm._update_policy_stepg  s     mmF+G%%gllGNNHZ`&/9MD!7!7!I!I$!OPI%%55c)6OPHjj


/2>>)+DE""3y#<d%>PQ  G
 
	 	 (	,kk$W!56 
-	,	,s   ,E


Ec                   ^ [         R                  " S5         [        R                  " X5      n[         R                  " U R                  [        R                  " X4X5      U5      S5      n	[         R                  " [        R                  " XU5      [        R                  " X4U5      -
  5      n
[         R                  " U R                  U
[         R                  " U5      -  U5      S5      * nU R                  U	-  nU R                  R                  U R                  R                  -  n[         R                  " [         R                  " X:  [         R                   5      5      m[         R"                  " [         R$                  " TS:  U4S j[&        5      /5         U R                  R(                  [         R                  " X:  [         R*                  5      -  X-
  S-  -  nSSS5        X-   W-   n[         R,                  R/                  [         R,                  R1                  SU5      [         R,                  R1                  SU	5      [         R,                  R1                  S	U5      [         R,                  R1                  S
U5      [         R,                  R1                  SU5      [         R,                  R1                  SX-   5      [         R,                  R1                  SU5      [         R,                  R3                  S[         R                  " U5      5      [         R,                  R3                  S[         R                  " U5      5      [         R,                  R3                  S[         R                  " U5      5      /
5      n[         R                  " US5      n[         R4                  " US5      U4sSSS5        $ ! , (       d  f       GN= f! , (       d  f       g= f)aT  Compute the policy loss composed of multiple components.

1. The policy gradient loss is importance sampled from the data-collecting
   policy at the beginning of training.
2. The second term is a KL penalty between the policy at the beginning of
   training and the current policy.
3. Additionally, if this KL already changed more than twice the target
   amount, we activate a strong penalty discouraging further divergence.

Args:
  mean: Sequences of action means of the current policy.
  logstd: Sequences of action log stddevs of the current policy.
  old_mean: Sequences of action means of the behavioral policy.
  old_logstd: Sequences of action log stddevs of the behavioral policy.
  action: Sequences of actions.
  advantage: Sequences of advantages.
  length: Batch of sequence lengths.

Returns:
  Tuple of loss tensor and summary tensor.
policy_lossr   r   c                  6   > [         R                  " ST /S5      $ )Nr   zkl cutoff! )r2   r   )cutoff_counts   rV   ri   +PPOAlgorithm._policy_loss.<locals>.<lambda>  s    RXXa,-Wrk   r   Nentropyklsurrogate_loss
kl_penalty	kl_cutoffkl_penalty_combinedavg_surr_lossavg_kl_penaltyavg_policy_loss)r2   r[   r	   diag_normal_entropyr   r   diag_normal_klrr   diag_normal_logpdfstop_gradientrI   r'   	kl_targetkl_cutoff_factor
reduce_sumcastint32r^   ru   intkl_cutoff_coefrC   ro   rp   rq   r   r}   )rN   rh   rs   r   r   r-   r   r   r  r  policy_gradientr  r  cutoff_thresholdr  r   ro   r   s                    @rV   r   PPOAlgorithm._policy_loss  s   , 
}	%++D9g>>
**W++H$OQW
XZ[]b

$
$T6
:

$
$X6
BCDo 
**_r'7'7	'BBF
KQP Pn==2%j//$,,2O2OO]]2772+@"((#KLl""77<!#%WY\]
^`\\002772;PRTR\R\3]]+a/0	` #/);k

  
**

y'
2
**

tR
(
**

/
@
**

|Z
8
**

{I
6
**

4j6L
M
**

}k
:
**

OR^^N-K
L
**

,bnnZ.H
I
**

-r~~k/J
K" 	g NN;2k{M:GC= 
&	%` ` 
&	%s&   FOAN>#GO>
O	O
Oc                   ^  [         R                  " S5         T R                  X5      n[         R                  " [         R                  " [         R
                  " UR                  U5      5      SSS9n[         R                  " ST R                  /S5      n[         R                  " Xg/5         [         R                  " T R                  [        R                  " X#UR                  UR                  5      U5      5      n[         R                  " X/S5      n[         R                  " UST R                   R"                  -  :  U 4S	 j[$        5      n	[         R                  " US
T R                   R"                  -  :  U 4S j[$        5      n
SSS5        [         R                  " W	W
/5         [         R&                  R)                  [         R&                  R+                  SW5      [         R&                  R+                  ST R                  5      /5      sSSS5        sSSS5        $ ! , (       d  f       N= f! , (       d  f       O= f SSS5        g! , (       d  f       g= f)a  Adjust the KL policy between the behavioral and current policy.

Compute how much the policy actually changed during the multiple
update steps. Adjust the penalty strength for the next training phase if we
overshot or undershot the target divergence too much.

Args:
  observ: Sequences of observations.
  old_mean: Sequences of action means of the behavioral policy.
  old_logstd: Sequences of action log stddevs of the behavioral policy.
  length: Batch of sequence lengths.

Returns:
  Summary tensor.
adjust_penaltyFzpolicy should change)messager   zcurrent penalty: zkl change: g?c                     > [         R                  " T R                  R                  T R                  S-  5      S/S5      $ )N      ?r   zincrease penalty r2   r   rI   r|   rN   s   rV   ri   .PPOAlgorithm._adjust_penalty.<locals>.<lambda>  -    BHHT]]11$--#2EFM`ark   gffffff?c                     > [         R                  " T R                  R                  T R                  S-  5      S/S5      $ )Nr  r   zdecrease penalty r  r  s   rV   ri   r     r!  rk   N	kl_changepenalty)r2   r[   r8   r   
reduce_allequalrh   r   rI   r^   r   r   r	   r  rs   ru   r'   r  floatro   rp   r   )rN   r)   r   r   r   r<   assert_changeprint_penaltyr#  maybe_increasemaybe_decreases   `          rV   r   PPOAlgorithm._adjust_penalty  s     
'	(f-goobmmBHHW\\84T&U&+.DFm hhq4==/3FGm""M#ABNNJJw--hGLLRYR`R`a 	 HHY]C	dll4444a	
 dll4444a	 C ""NN#CDzzJJk95JJi7!
  ED+ 
)	( CB EDD+ 
)	(	(s>   BI-CH$
 I*A'H5	I$
H2	.I5
I	?I
Ic                 `   [         R                  " S5         [         R                  " UR                  S   R                  5      n[         R
                  " USSS24   USS2S4   :  [         R                  5      nX-  n[         R                  " US5      sSSS5        $ ! , (       d  f       g= f)zSet padding elements of a batch of sequences to zero.

Useful to then safely sum along the time dimension.

Args:
  tensor: Tensor of sequences.
  length: Batch of sequence lengths.

Returns:
  Masked sequences.
maskr   Nmasked)r2   r[   r   r=   r>   r  rC   r}   )rN   tensorr   range_r.  r/  s         rV   r   PPOAlgorithm._mask  sx     
v	xxQ--.fWWVD!G_vag6

Cd}fvx0	 
		s   A>B
B-Nc                    [         R                  " SUS9   [         R                  " U5      nU R                  R                  =(       a    [
        R                  " 5       n[         R                  " U(       a  SOS5         [         R                  " US5      nU R                  R                  U R                  R                  R                  S   R                  5      n[         R                  R                  UUUU[         R                   SS9u  u  pxpS	S	S	5        [         R                  " WS
5      n[         R                  " WS5      n[         R                  " W	S5      n	[         R"                  R$                  R'                  U[         R(                  " U5      5      n
[+        XXU5      sS	S	S	5        $ ! , (       d  f       N= f! , (       d  f       g	= f)aH  Compute the network output for a batched sequence of observations.

Optionally, the initial state can be specified. The weights should be
reused for all calls, except for the first one. Output is a named tuple
containing the policy as a TensorFlow distribution, the policy mean and log
standard deviation, the approximated state value, and the new recurrent
state.

Args:
  observ: Sequences of observations.
  length: Batch of sequence lengths.
  state: Batch of initial recurrent states.
  reuse: Python boolean whether to reuse previous variables.

Returns:
  NetworkOutput tuple.
r<   r   r   r   r)   r   T)swap_memoryNrh   rs   r>   )r2   r?   convert_to_tensorr'   r5   r	   r6   r7   r}   r<   r#   r-   r=   r>   r   dynamic_rnnrC   contribdistributionsMultivariateNormalDiagrr   _NetworkOutput)rN   r)   r   r{   r   r5   rU   rh   rs   r>   rv   s              rV   r8   PPOAlgorithm._network  sP   $ 
		9E	2##F+f$$A)?)?)Ag99Xh7""684||##DOO$:$:$@$@$C$I$IJ')uu'8'89?9?9>9;EI (9 (K$u 8 tV,d  2fw/ezz''>>tRVVF^TfF&?! 
3	2 87 
3	2s&   A(G>BF2BG2
G 	<G
G)r#   r'   r@   r%   rE   rG   rF   rD   r1   r4   r*   rI   rK   r,   r&   r$   rM   )NNT)__name__
__module____qualname____firstlineno____doc__rW   rc   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r8   __static_attributes__ rk   rV   r   r   #   se    J6QpO>*6^"7b<>,7,B4&?P764Dl)V1$"@rk   r   )r@  
__future__r   r   r   collectionstf.compat.v1compatv1r2   rZ   r   r   r	   
namedtupler:  objectr   rB  rk   rV   <module>rJ     sJ    '  %      ''9]^t@6 t@rk   