reinforce_w_baseline.py
# Policy Gradient Agent
# - REINFORCE algorithm with baseline
# - Policy/value function approximation
#
# ---
# @author Yiren Lu
# @email luyiren [at] seas [dot] upenn [dot] edu
#
# MIT License
import gym
import numpy as np
import tensorflow as tf

import tf_utils


class PolicyGradientNNAgent():
def __init__(self,
lr=0.5,
gamma=0.99,
state_size=4,
action_size=2,
n_hidden_1=20,
n_hidden_2=20,
scope="pg"
):
"""
args
epsilon exploration rate
epsilon_anneal linear decay rate per call of learn() function (iteration)
end_epsilon lowest exploration rate
lr learning rate
gamma discount factor
state_size network input size
action_size network output size
"""
self.lr = lr
self.gamma = gamma
self.state_size = state_size
self.action_size = action_size
self.total_steps = 0
self.n_hidden_1 = n_hidden_1
self.n_hidden_2 = n_hidden_2
self.scope = scope
self._build_policy_net()

  def _build_policy_net(self):
    """Build the shared policy/value network."""
    with tf.variable_scope(self.scope):
      self.state_input = tf.placeholder(tf.float32, [None, self.state_size])
      self.action = tf.placeholder(tf.int32, [None])
      self.target = tf.placeholder(tf.float32, [None])

      # Two shared hidden layers feeding both the value and policy heads
      layer_1 = tf_utils.fc(self.state_input, self.n_hidden_1, tf.nn.relu)
      layer_2 = tf_utils.fc(layer_1, self.n_hidden_2, tf.nn.relu)

      # State-value baseline V(s), squeezed to shape [None] so it matches the
      # shape of the return target instead of broadcasting to [N, N]
      self.value = tf.squeeze(tf_utils.fc(layer_2, 1), axis=1)
      # Action logits and the resulting policy distribution
      self.action_values = tf_utils.fc(layer_2, self.action_size)
      self.action_probs = tf.nn.softmax(self.action_values)
      # Probability of the action actually taken, pi(a_t | s_t)
      action_mask = tf.one_hot(self.action, self.action_size, 1.0, 0.0)
      self.action_value_pred = tf.reduce_sum(self.action_probs * action_mask, 1)

      # Value loss: regress V(s_t) toward the Monte Carlo return target
      self.value_loss = tf.reduce_mean(tf.square(self.target - self.value))
      # Policy loss: -log pi(a_t|s_t) * advantage; stop_gradient keeps the
      # policy term from back-propagating into the baseline, and the small
      # epsilon guards against log(0)
      advantage = tf.stop_gradient(self.target - self.value)
      self.pg_loss = tf.reduce_mean(-tf.log(self.action_value_pred + 1e-8) * advantage)
      self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
      self.loss = self.pg_loss + 5 * self.value_loss + 0.002 * self.l2_loss

      self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
      self.train_op = self.optimizer.minimize(
          self.loss, global_step=tf.contrib.framework.get_global_step())
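
  # The combined loss above implements REINFORCE with a learned state-value
  # baseline: the policy head follows the gradient estimate
  #   grad J(theta) ~ E[ grad log pi(a_t|s_t) * (G_t - V(s_t)) ],
  # where G_t is the discounted Monte Carlo return supplied as `target` by
  # learn() below, while the value head regresses V(s_t) toward G_t.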

  def get_action(self, state, sess):
    """Samples an action from the current stochastic policy"""
    pi = self.get_policy(state, sess)
    return np.random.choice(range(self.action_size), p=pi)

  def get_policy(self, state, sess):
    """Returns the policy as a probability distribution over actions"""
    pi = sess.run(self.action_probs, feed_dict={self.state_input: [state]})
    return pi[0]

  def learn(self, episode, sess, train_epoch=1):
    """One policy/value update per step of a completed episode of (s, a, s', r, done) tuples."""
    for t in range(len(episode)):
      self.total_steps = self.total_steps + 1
      # Discounted Monte Carlo return from step t: G_t = sum_i gamma^i * r_{t+i}
      target = sum([self.gamma**i * r for i, (s, a, s1, r, d) in enumerate(episode[t:])])
      state, action, next_state, reward, done = episode[t]
      feed_dict = {self.state_input: [state], self.target: [target], self.action: [action]}
      _, loss, v, pg_loss, v_a = sess.run(
          [self.train_op, self.loss, self.value, self.pg_loss, self.action_value_pred],
          feed_dict)
# print target, v
# print pg_loss, v, v_a, target, -np.log(v_a) * target
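

# A minimal training sketch for this agent on gym's CartPole-v0 under a TF1
# session. The environment name, episode count, learning rate, and logging
# interval below are illustrative assumptions rather than tuned settings, and
# the loop assumes the classic gym API where env.step returns
# (observation, reward, done, info).
if __name__ == "__main__":
  env = gym.make("CartPole-v0")
  agent = PolicyGradientNNAgent(lr=0.01,
                                gamma=0.99,
                                state_size=env.observation_space.shape[0],
                                action_size=env.action_space.n)
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i_episode in range(200):
      state = env.reset()
      episode = []
      done = False
      # Roll out one full episode with the current stochastic policy
      while not done:
        action = agent.get_action(state, sess)
        next_state, reward, done, _ = env.step(action)
        episode.append((state, action, next_state, reward, done))
        state = next_state
      # REINFORCE-with-baseline updates from the completed episode
      agent.learn(episode, sess)
      if (i_episode + 1) % 20 == 0:
        print("episode {}: length {}".format(i_episode + 1, len(episode)))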