ac_net.py
'''Actor-critic network class for a3c'''
import numpy as np
import tensorflow as tf
import tf_utils

class AC_Net(object):
    '''Actor-critic network for A3C: builds the policy/value network and the
    ops a worker uses to apply its gradients to the global network.'''

    def __init__(self, state_size, action_size, lr,
                 name, n_h1=400, n_h2=300, global_name='global'):
        self.state_size = state_size
        self.action_size = action_size
        self.name = name
        self.n_h1 = n_h1
        self.n_h2 = n_h2
        self.optimizer = tf.train.AdamOptimizer(lr)
        (self.input_s, self.input_a, self.advantage, self.target_v,
         self.policy, self.value, self.action_est,
         self.model_variables) = self._build_network(name)

        # 0.5, 0.2, 1.0
        # Squared error between the bootstrapped return and the predicted value.
        self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value, [-1])))
        # Negative policy entropy, sum_a pi(a|s) * log pi(a|s).
        self.entropy_loss = 1.0 * tf.reduce_sum(self.policy * tf.log(self.policy))
        # Policy-gradient term: -log pi(a|s) weighted by the advantage of the taken action.
        self.policy_loss = 1.0 * tf.reduce_sum(-tf.log(self.action_est) * self.advantage)
        self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.model_variables])
        # self.loss = 0.5 * self.value_loss + self.policy_loss + 0.2 * self.entropy_loss
        self.loss = self.value_loss + self.policy_loss + self.entropy_loss
        self.gradients = tf.gradients(self.loss, self.model_variables)

        # Worker networks apply their local gradients to the global network's variables.
        if name != global_name:
            self.var_norms = tf.global_norm(self.model_variables)
            global_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, global_name)
            self.apply_gradients = self.optimizer.apply_gradients(zip(self.gradients, global_variables))
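
    # Note: summed over a rollout batch, the objective built above is
    #   L = 0.5 * (R - V(s))^2 - log(pi(a|s)) * A(s, a) + sum_a pi(a|s) * log(pi(a|s)),
    # i.e. a squared value error, a policy-gradient term weighted by the advantage,
    # and a negative-entropy term that pushes the policy toward exploration.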

    def _build_network(self, name):
        # Placeholders: states, taken actions, advantages and bootstrapped value targets.
        input_s = tf.placeholder(tf.float32, [None, self.state_size])
        input_a = tf.placeholder(tf.int32, [None])
        advantage = tf.placeholder(tf.float32, [None])
        target_v = tf.placeholder(tf.float32, [None])
        with tf.variable_scope(name):
            layer_1 = tf_utils.fc(
                input_s,
                self.n_h1,
                scope="fc1",
                activation_fn=tf.nn.relu,
                initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
            layer_2 = tf_utils.fc(
                layer_1,
                self.n_h2,
                scope="fc2",
                activation_fn=tf.nn.relu,
                initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN"))
            # Policy head: softmax over actions. Value head: single linear output.
            policy = tf_utils.fc(
                layer_2,
                self.action_size,
                activation_fn=tf.nn.softmax,
                scope="policy",
                initializer=tf_utils.normalized_columns_initializer(0.01))
            value = tf_utils.fc(
                layer_2, 1, activation_fn=None,
                scope="value",
                initializer=tf_utils.normalized_columns_initializer(1.0))
        # Probability assigned to the action actually taken, pi(a|s).
        action_mask = tf.one_hot(input_a, self.action_size, 1.0, 0.0)
        action_est = tf.reduce_sum(policy * action_mask, 1)
        model_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
        return input_s, input_a, advantage, target_v, policy, value, action_est, model_variables

    def get_action(self, state, sess):
        '''Sample an action from the current policy for a single state.'''
        state = np.reshape(state, [-1, self.state_size])
        policy = sess.run(self.policy, feed_dict={self.input_s: state})
        return np.random.choice(range(self.action_size), p=policy[0])

    def predict_policy(self, state, sess):
        '''Return the action-probability vector for a single state.'''
        state = np.reshape(state, [-1, self.state_size])
        policy = sess.run(self.policy, feed_dict={self.input_s: state})
        return policy[0]

    def predict_value(self, state, sess):
        '''Return the value estimate for a single state.'''
        state = np.reshape(state, [-1, self.state_size])
        return sess.run(self.value, feed_dict={self.input_s: state})
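

# --- Usage sketch (illustration only) ----------------------------------------
# A minimal, hedged example of how this class is typically wired up for A3C:
# one shared 'global' network plus a per-worker copy whose gradients are applied
# to the global variables. The toy state/action sizes, the sync op and the dummy
# feed values below are assumptions for illustration; the actual rollout and
# training loop live elsewhere in this repository.
if __name__ == '__main__':
    # Build the global network first so its trainable variables exist when the
    # worker looks them up via tf.get_collection in __init__.
    global_net = AC_Net(state_size=4, action_size=2, lr=1e-4, name='global')
    worker_net = AC_Net(state_size=4, action_size=2, lr=1e-4, name='worker_0')

    # Copy the global weights into the worker before collecting experience
    # (both variable lists are created in the same order, so zip lines them up).
    sync_op = [w.assign(g) for w, g in
               zip(worker_net.model_variables, global_net.model_variables)]

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(sync_op)
        state = np.zeros(4)
        action = worker_net.get_action(state, sess)
        value = worker_net.predict_value(state, sess)

        # One dummy gradient push from the worker to the global network.
        sess.run(worker_net.apply_gradients,
                 feed_dict={worker_net.input_s: np.reshape(state, [1, 4]),
                            worker_net.input_a: [action],
                            worker_net.advantage: [0.0],
                            worker_net.target_v: value.flatten()})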