ddpg_tensorflow.py
# From https://github.com/heerad/gymtime/blob/master/ddpg.py
import numpy as np
import gym
from gym import wrappers
import tensorflow as tf
import json, sys, os
from os import path
import random
from collections import deque
#####################################################################################################
## Algorithm
# Deep Deterministic Policy Gradient (DDPG)
# An off-policy actor-critic algorithm that uses additive exploration noise (e.g. an Ornstein-Uhlenbeck process) on top
# of a deterministic policy to generate experiences (s, a, r, s'). It uses minibatches of these experiences from replay
# memory to update the actor (policy) and critic (Q function) parameters.
# Neural networks are used for function approximation.
# Slowly-changing "target" networks are used to improve stability and encourage convergence.
# Parameter updates are made via Adam.
# Assumes continuous action spaces!
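# Core DDPG updates implemented below (restated here for reference):
#   critic:  minimize E[(r + gamma*Q_slow(s', mu_slow(s')) - Q(s,a))^2] over replay minibatches
#   actor:   maximize E[Q(s, mu(s))] via the deterministic policy gradient
#   targets: theta_slow <- tau*theta + (1-tau)*theta_slow after each update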
#####################################################################################################
## Setup
env_to_use = 'Pendulum-v0'
# hyperparameters
gamma = 0.99 # reward discount factor
h1_actor = 8 # hidden layer 1 size for the actor
h2_actor = 8 # hidden layer 2 size for the actor
h3_actor = 8 # hidden layer 3 size for the actor
h1_critic = 8 # hidden layer 1 size for the critic
h2_critic = 8 # hidden layer 2 size for the critic
h3_critic = 8 # hidden layer 3 size for the critic
lr_actor = 1e-3 # learning rate for the actor
lr_critic = 1e-3 # learning rate for the critic
lr_decay = 1 # learning rate decay (per episode)
l2_reg_actor = 1e-6 # L2 regularization factor for the actor
l2_reg_critic = 1e-6 # L2 regularization factor for the critic
dropout_actor = 0 # dropout rate for actor (0 = no dropout)
dropout_critic = 0 # dropout rate for critic (0 = no dropout)
num_episodes = 15000 # number of episodes
max_steps_ep = 10000 # default max number of steps per episode (unless env has a lower hardcoded limit)
tau = 1e-2 # soft target update rate
train_every = 1 # number of steps to run the policy (and collect experience) before updating network weights
replay_memory_capacity = int(1e5) # capacity of experience replay memory
minibatch_size = 1024 # size of minibatch from experience replay memory for updates
initial_noise_scale = 0.1 # scale of the exploration noise process, as a fraction of each action dimension's range (1.0 = the full range)
noise_decay = 0.99 # decay rate (per episode) of the scale of the exploration noise process
exploration_mu = 0.0 # mu parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
exploration_theta = 0.15 # theta parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
exploration_sigma = 0.2 # sigma parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
# game parameters
env = gym.make(env_to_use)
state_dim = np.prod(np.array(env.observation_space.shape)) # Get total number of dimensions in state
action_dim = np.prod(np.array(env.action_space.shape)) # Assuming continuous action space
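# For Pendulum-v0 this gives state_dim = 3 (cos(theta), sin(theta), theta_dot) and action_dim = 1
# (a single torque, bounded by env.action_space.low/high to [-2, 2])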
# set seeds to 0
env.seed(0)
np.random.seed(0)
# prepare monitorings
outdir = '/tmp/ddpg-agent-results'
env = wrappers.Monitor(env, outdir, force=True)
def writefile(fname, s):
    with open(path.join(outdir, fname), 'w') as fh: fh.write(s)
info = {}
info['env_id'] = env.spec.id
info['params'] = dict(
    gamma = gamma,
    h1_actor = h1_actor,
    h2_actor = h2_actor,
    h3_actor = h3_actor,
    h1_critic = h1_critic,
    h2_critic = h2_critic,
    h3_critic = h3_critic,
    lr_actor = lr_actor,
    lr_critic = lr_critic,
    lr_decay = lr_decay,
    l2_reg_actor = l2_reg_actor,
    l2_reg_critic = l2_reg_critic,
    dropout_actor = dropout_actor,
    dropout_critic = dropout_critic,
    num_episodes = num_episodes,
    max_steps_ep = max_steps_ep,
    tau = tau,
    train_every = train_every,
    replay_memory_capacity = replay_memory_capacity,
    minibatch_size = minibatch_size,
    initial_noise_scale = initial_noise_scale,
    noise_decay = noise_decay,
    exploration_mu = exploration_mu,
    exploration_theta = exploration_theta,
    exploration_sigma = exploration_sigma
)
np.set_printoptions(threshold=sys.maxsize) # print full arrays (np.nan is no longer a valid threshold in newer NumPy)
replay_memory = deque(maxlen=replay_memory_capacity) # used for O(1) popleft() operation
def add_to_memory(experience):
    replay_memory.append(experience)
def sample_from_memory(minibatch_size):
    return random.sample(replay_memory, minibatch_size)
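# deque(maxlen=...) evicts the oldest experience once capacity is reached;
# random.sample draws a uniform random minibatch without replacement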
#####################################################################################################
## Tensorflow
tf.reset_default_graph()
# placeholders
state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim])
action_ph = tf.placeholder(dtype=tf.float32, shape=[None,action_dim])
reward_ph = tf.placeholder(dtype=tf.float32, shape=[None])
next_state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim])
is_not_terminal_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # indicators (go into target computation)
is_training_ph = tf.placeholder(dtype=tf.bool, shape=()) # for dropout
# episode counter
episodes = tf.Variable(0.0, trainable=False, name='episodes')
episode_inc_op = episodes.assign_add(1)
# will use this to initialize both the actor network and its slowly-changing target network with the same structure
def generate_actor_network(s, trainable, reuse):
    hidden = tf.layers.dense(s, h1_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense', reuse = reuse)
    hidden_drop = tf.layers.dropout(hidden, rate = dropout_actor, training = trainable & is_training_ph)
    hidden_2 = tf.layers.dense(hidden_drop, h2_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense_1', reuse = reuse)
    hidden_drop_2 = tf.layers.dropout(hidden_2, rate = dropout_actor, training = trainable & is_training_ph)
    hidden_3 = tf.layers.dense(hidden_drop_2, h3_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense_2', reuse = reuse)
    hidden_drop_3 = tf.layers.dropout(hidden_3, rate = dropout_actor, training = trainable & is_training_ph)
    actions_unscaled = tf.layers.dense(hidden_drop_3, action_dim, trainable = trainable, name = 'dense_3', reuse = reuse)
    actions = env.action_space.low + tf.nn.sigmoid(actions_unscaled)*(env.action_space.high - env.action_space.low) # bound the actions to the valid range
    return actions
# actor network
with tf.variable_scope('actor'):
    # Policy's outputted action for each state_ph (for generating actions and training the critic)
    actions = generate_actor_network(state_ph, trainable = True, reuse = False)
# slow target actor network
with tf.variable_scope('slow_target_actor', reuse=False):
    # Slow target policy's outputted action for each next_state_ph (for training the critic)
    # use stop_gradient to treat the output values as constant targets when doing backprop
    slow_target_next_actions = tf.stop_gradient(generate_actor_network(next_state_ph, trainable = False, reuse = False))
# will use this to initialize both the critic network and its slowly-changing target network with the same structure
def generate_critic_network(s, a, trainable, reuse):
    state_action = tf.concat([s, a], axis=1)
    hidden = tf.layers.dense(state_action, h1_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense', reuse = reuse)
    hidden_drop = tf.layers.dropout(hidden, rate = dropout_critic, training = trainable & is_training_ph)
    hidden_2 = tf.layers.dense(hidden_drop, h2_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense_1', reuse = reuse)
    hidden_drop_2 = tf.layers.dropout(hidden_2, rate = dropout_critic, training = trainable & is_training_ph)
    hidden_3 = tf.layers.dense(hidden_drop_2, h3_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense_2', reuse = reuse)
    hidden_drop_3 = tf.layers.dropout(hidden_3, rate = dropout_critic, training = trainable & is_training_ph)
    q_values = tf.layers.dense(hidden_drop_3, 1, trainable = trainable, name = 'dense_3', reuse = reuse)
    return q_values
with tf.variable_scope('critic') as scope:
    # Critic applied to state_ph and a given action (for training critic)
    q_values_of_given_actions = generate_critic_network(state_ph, action_ph, trainable = True, reuse = False)
    # Critic applied to state_ph and the current policy's outputted actions for state_ph (for training actor via deterministic policy gradient)
    q_values_of_suggested_actions = generate_critic_network(state_ph, actions, trainable = True, reuse = True)
# slow target critic network
with tf.variable_scope('slow_target_critic', reuse=False):
    # Slow target critic applied to slow target actor's outputted actions for next_state_ph (for training critic)
    slow_q_values_next = tf.stop_gradient(generate_critic_network(next_state_ph, slow_target_next_actions, trainable = False, reuse = False))
# isolate vars for each network
actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
slow_target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_actor')
critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic')
slow_target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_critic')
# update values for slowly-changing targets towards current actor and critic
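# (Polyak averaging: slow_var <- tau*var + (1 - tau)*slow_var for every network variable)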
update_slow_target_ops = []
for i, slow_target_actor_var in enumerate(slow_target_actor_vars):
    update_slow_target_actor_op = slow_target_actor_var.assign(tau*actor_vars[i]+(1-tau)*slow_target_actor_var)
    update_slow_target_ops.append(update_slow_target_actor_op)
for i, slow_target_var in enumerate(slow_target_critic_vars):
    update_slow_target_critic_op = slow_target_var.assign(tau*critic_vars[i]+(1-tau)*slow_target_var)
    update_slow_target_ops.append(update_slow_target_critic_op)
update_slow_targets_op = tf.group(*update_slow_target_ops, name='update_slow_targets')
# One step TD targets y_i for (s,a) from experience replay
# = r_i + gamma*Q_slow(s',mu_slow(s')) if s' is not terminal
# = r_i if s' terminal
targets = tf.expand_dims(reward_ph, 1) + tf.expand_dims(is_not_terminal_ph, 1) * gamma * slow_q_values_next
# 1-step temporal difference errors
td_errors = targets - q_values_of_given_actions
# critic loss function (mean-square value error with regularization)
critic_loss = tf.reduce_mean(tf.square(td_errors))
for var in critic_vars:
    if 'bias' not in var.name:
        critic_loss += l2_reg_critic * 0.5 * tf.nn.l2_loss(var)
# critic optimizer
critic_train_op = tf.train.AdamOptimizer(lr_critic*lr_decay**episodes).minimize(critic_loss)
# actor loss function (mean Q-values under current policy with regularization)
actor_loss = -1*tf.reduce_mean(q_values_of_suggested_actions)
for var in actor_vars:
    if 'bias' not in var.name:
        actor_loss += l2_reg_actor * 0.5 * tf.nn.l2_loss(var)
# actor optimizer
# the gradient of the mean Q-values wrt actor params is the deterministic policy gradient (keeping critic params fixed)
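# i.e. grad_theta(J) = E[grad_a(Q(s,a))|a=mu(s) * grad_theta(mu(s))]; minimizing actor_loss over
# var_list=actor_vars computes this by backpropagating through the (held-fixed) critic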
actor_train_op = tf.train.AdamOptimizer(lr_actor*lr_decay**episodes).minimize(actor_loss, var_list=actor_vars)
# initialize session
sess = tf.Session()
sess.run(tf.global_variables_initializer())
#####################################################################################################
## Training
total_steps = 0
for ep in range(num_episodes):
    total_reward = 0
    steps_in_ep = 0
    # Initialize exploration noise process
    noise_process = np.zeros(action_dim)
    noise_scale = (initial_noise_scale * noise_decay**ep) * (env.action_space.high - env.action_space.low)
    # Initial state
    observation = env.reset()
    if ep%10 == 0: env.render()
    for t in range(max_steps_ep):
        # choose action based on deterministic policy
        action_for_state, = sess.run(actions,
            feed_dict = {state_ph: observation[None], is_training_ph: False})
        # add temporally-correlated exploration noise to action (using an Ornstein-Uhlenbeck process)
        # print(action_for_state)
        noise_process = exploration_theta*(exploration_mu - noise_process) + exploration_sigma*np.random.randn(action_dim)
        # print(noise_scale*noise_process)
        action_for_state += noise_scale*noise_process
        # take step
        next_observation, reward, done, _info = env.step(action_for_state)
        if ep%10 == 0: env.render()
        total_reward += reward
        add_to_memory((observation, action_for_state, reward, next_observation,
            # is next_observation a terminal state?
            # 0.0 if done and not env.env._past_limit() else 1.0))
            0.0 if done else 1.0))
        # update network weights to fit a minibatch of experience
        if total_steps%train_every == 0 and len(replay_memory) >= minibatch_size:
            # grab N (s,a,r,s') tuples from replay memory
            minibatch = sample_from_memory(minibatch_size)
            # update the critic and actor params using mean-square value error and deterministic policy gradient, respectively
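            # (is_training_ph=True turns dropout on in tf.layers.dropout during these updates;
            # with the default dropout rates of 0 this is a no-op)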
            _, _ = sess.run([critic_train_op, actor_train_op],
                feed_dict = {
                    state_ph: np.asarray([elem[0] for elem in minibatch]),
                    action_ph: np.asarray([elem[1] for elem in minibatch]),
                    reward_ph: np.asarray([elem[2] for elem in minibatch]),
                    next_state_ph: np.asarray([elem[3] for elem in minibatch]),
                    is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch]),
                    is_training_ph: True})
            # update slow actor and critic targets towards current actor and critic
            _ = sess.run(update_slow_targets_op)
        observation = next_observation
        total_steps += 1
        steps_in_ep += 1
        if done:
            # Increment episode counter
            _ = sess.run(episode_inc_op)
            break
    print('Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'%(ep,total_reward,steps_in_ep, noise_scale))
# Finalize and upload results
writefile('info.json', json.dumps(info))
env.close()
gym.upload(outdir)