Reinforcement Learning - Actor Critic
Implementing CartPole with TensorFlow using the Actor Critic algorithm
The Actor Critic algorithm builds on Policy Gradient by adding a scoring system (the critic), comparing the critic's score against the real value observed from the environment, and then optimizing on the difference.
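Roughly, in symbols (this summary is mine, but it matches what the code below computes, with $R_t$ the discounted return and $V(s_t)$ the critic's estimate):

$$L_{\text{actor}} = -\sum_t \log \pi(a_t \mid s_t)\,\big(R_t - V(s_t)\big), \qquad L_{\text{critic}} = \sum_t \mathrm{SmoothL1}\big(V(s_t) - R_t\big)$$

The two losses are simply summed and minimized together.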
The model itself is simple: a hidden layer or two is enough. The only difference between the two output heads is that one outputs a probability for every action (the actor), while the other outputs a single scalar, the value score for the current state (the critic).
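The snippets below read their hyperparameters from an args object that the original code doesn't show. Here is one minimal way to define it; every flag name and default value here is an assumption, not part of the original:

import argparse

# Hyperparameters used throughout; all defaults below are assumptions.
parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int, default=543)
parser.add_argument('--lr', type=float, default=3e-3)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--hidden_dim', type=int, default=128)
parser.add_argument('--render', action='store_true')
args = parser.parse_args()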
First, set up the gym CartPole environment
import gym

env = gym.make('CartPole-v0').unwrapped
env.seed(args.seed)
# state dimension
IN_DIM = env.observation_space.shape[0]
# action dimension
OUT_DIM = env.action_space.n
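For CartPole-v0 this gives a 4-dimensional observation (cart position/velocity, pole angle/angular velocity) and 2 discrete actions (push left / push right); a quick sanity check:

print(IN_DIM, OUT_DIM)  # expected: 4 2 for CartPole-v0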
Then define the model
import tensorflow as tf
import numpy as np
tf.set_random_seed(args.seed)
class ActorCritic(object):
    def __init__(self, in_dim, out_dim, h_dim):
        self.global_step = tf.train.get_or_create_global_step()
        with tf.variable_scope('init_variables'):
            self.state = tf.placeholder(
                tf.float32, [None, in_dim], name="state")
            self.rewards = tf.placeholder(
                tf.float32, [None], name="rewards")
            self.selected_actions = tf.placeholder(
                tf.float32, [None], name="actions")
            self.td_error = tf.placeholder(
                tf.float32, [None], name="td_error")
        # Network layers; ReLU activation for the shared hidden layer
        with tf.variable_scope('init_layers'):
            h_layer = tf.keras.layers.Dense(h_dim, activation=tf.nn.relu)
            # actor head: a probability for every action
            action_layer = tf.keras.layers.Dense(
                out_dim, activation=tf.nn.softmax)
            # critic head: one scalar value per state
            value_layer = tf.keras.layers.Dense(1)
        with tf.variable_scope('init_graph'):
            hidden = h_layer(self.state)
            props = action_layer(hidden)
            self.value = tf.reshape(value_layer(hidden), [-1])
            # action_layer already applies softmax, so pass probs=, not logits
            dist = tf.distributions.Categorical(probs=props)
            self.action = dist.sample()
            self.log_scores = dist.log_prob(self.selected_actions)
        with tf.variable_scope('loss'):
            # critic loss: smooth L1 between value estimates and returns
            value_loss = self._smooth_l1_loss(self.value, self.rewards)
            # actor loss: log-probabilities weighted by the TD error (advantage)
            action_loss = -tf.reduce_sum(self.log_scores * self.td_error)
            self.train_op = tf.train.AdamOptimizer(args.lr).minimize(
                value_loss + action_loss, global_step=self.global_step)

    # TF has no ready-made smooth_l1_loss, so write one
    def _smooth_l1_loss(self, value, rewards):
        thres = tf.constant(1, dtype=tf.float32)
        mae = tf.abs(value - rewards)
        # Huber-style: quadratic below the threshold, linear above it
        loss = tf.keras.backend.switch(
            tf.greater(mae, thres), mae - 0.5, 0.5 * tf.pow(mae, 2))
        return tf.reduce_sum(loss)
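As a quick check of the interface (a sketch of mine, not in the original post), the model can be built in a throwaway graph to confirm it yields one sampled action and one scalar value per state:

# Sketch only: a single forward pass in a separate graph to check shapes.
with tf.Graph().as_default():
    tmp = ActorCritic(IN_DIM, OUT_DIM, args.hidden_dim)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        a, v = sess.run([tmp.action, tmp.value],
                        feed_dict={tmp.state: np.zeros((1, IN_DIM))})
        print(a.shape, v.shape)  # -> (1,) (1,)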
Finally, train
from itertools import count

ac = ActorCritic(IN_DIM, OUT_DIM, args.hidden_dim)

def train():
    init = tf.global_variables_initializer()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # length of the longest episode seen so far
    more = 0
    saver = tf.train.Saver()
    saver_hook = tf.train.CheckpointSaverHook(
        './train/', save_steps=2, saver=saver)
    summary_op = tf.summary.scalar('value', 1)
    summary_hook = tf.train.SummarySaverHook(
        save_steps=2, output_dir='./train/', summary_op=summary_op)
    with tf.train.MonitoredTrainingSession(
            config=config, hooks=[saver_hook, summary_hook]) as sess:
        for epoch in count(1):
            state = env.reset()
            if args.render:
                env.render()
            states, policy_rewards, actions, values = [state], [], [], []
            for step in range(10000):
                # sample an action and read the value estimate for the current state
                action, value = sess.run(
                    [ac.action, ac.value], feed_dict={ac.state: [state]})
                action, value = action[0], value[0]
                state, reward, done, _ = env.step(action)
                policy_rewards.append(reward)
                actions.append(action)
                values.append(value)
                if done:
                    break
                # keep the next state only while the episode continues,
                # so states stays aligned with actions
                states.append(state)
            # discounted returns, computed backwards, then normalized
            R, rewards = 0, []
            for r in policy_rewards[::-1]:
                R = r + args.gamma * R
                rewards.insert(0, R)
            rewards = np.asarray(rewards)
            rewards = (rewards - rewards.mean()) / \
                (rewards.std() + np.finfo(np.float32).eps)
            values = np.asarray(values)
            feed_dict = {
                ac.state: np.asarray(states),
                ac.rewards: rewards,
                ac.selected_actions: np.asarray(actions),
                # advantage: return minus the critic's estimate
                ac.td_error: (rewards - values),
            }
            sess.run([ac.global_step, ac.train_op], feed_dict)
            # report whenever a new longest episode is reached
            if more < step:
                print('Epoch {}\tlength: {:5d}\t'.format(epoch, step))
                more = step
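To run the whole thing, a minimal entry point (my addition; it assumes all the pieces above live in one script):

if __name__ == '__main__':
    train()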