ddpg_tensorflow.py
# From https://github.com/heerad/gymtime/blob/master/ddpg.py
import numpy as np
import gym
from gym import wrappers
import tensorflow as tf
import json, sys, os
from os import path
import random
from collections import deque
#####################################################################################################
## Algorithm
# Deep Deterministic Policy Gradient (DDPG)
# An off-policy actor-critic algorithm that uses additive exploration noise (e.g. an Ornstein-Uhlenbeck process) on top
# of a deterministic policy to generate experiences (s, a, r, s'). It uses minibatches of these experiences from replay
# memory to update the actor (policy) and critic (Q function) parameters.
# Neural networks are used for function approximation.
# Slowly-changing "target" networks are used to improve stability and encourage convergence.
# Parameter updates are made via Adam.
# Assumes continuous action spaces!
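# Core DDPG updates implemented below (restated here for reference):
#   critic:  minimize E[(r + gamma*Q_slow(s', mu_slow(s')) - Q(s,a))^2] over replay minibatches
#   actor:   maximize E[Q(s, mu(s))] via the deterministic policy gradient
#   targets: theta_slow <- tau*theta + (1-tau)*theta_slow after each update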
#####################################################################################################
## Setup
env_to_use = 'Pendulum-v0'
# hyperparameters
gamma = 0.99 # reward discount factor
h1_actor = 8 # hidden layer 1 size for the actor
h2_actor = 8 # hidden layer 2 size for the actor
h3_actor = 8 # hidden layer 3 size for the actor
h1_critic = 8 # hidden layer 1 size for the critic
h2_critic = 8 # hidden layer 2 size for the critic
h3_critic = 8 # hidden layer 3 size for the critic
lr_actor = 1e-3 # learning rate for the actor
lr_critic = 1e-3 # learning rate for the critic
lr_decay = 1 # learning rate decay (per episode)
l2_reg_actor = 1e-6 # L2 regularization factor for the actor
l2_reg_critic = 1e-6 # L2 regularization factor for the critic
dropout_actor = 0 # dropout rate for actor (0 = no dropout)
dropout_critic = 0 # dropout rate for critic (0 = no dropout)
num_episodes = 15000 # number of episodes
max_steps_ep = 10000 # default max number of steps per episode (unless env has a lower hardcoded limit)
tau = 1e-2 # soft target update rate
train_every = 1 # number of steps to run the policy (and collect experience) before updating network weights
replay_memory_capacity = int(1e5) # capacity of experience replay memory
minibatch_size = 1024 # size of minibatch from experience replay memory for updates
initial_noise_scale = 0.1 # scale of the exploration noise process, as a fraction of each action dimension's range (1.0 = the full range)
noise_decay = 0.99 # decay rate (per episode) of the scale of the exploration noise process
exploration_mu = 0.0 # mu parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
exploration_theta = 0.15 # theta parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
exploration_sigma = 0.2 # sigma parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
# game parameters
env = gym.make(env_to_use)
state_dim = np.prod(np.array(env.observation_space.shape)) # Get total number of dimensions in state
action_dim = np.prod(np.array(env.action_space.shape)) # Assuming continuous action space
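# For Pendulum-v0 this gives state_dim = 3 (cos(theta), sin(theta), theta_dot) and action_dim = 1
# (a single torque, bounded by env.action_space.low/high to [-2, 2])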
# set seeds to 0
env.seed(0)
np.random.seed(0)
# prepare monitorings
outdir = '/tmp/ddpg-agent-results'
env = wrappers.Monitor(env, outdir, force=True)
def writefile(fname, s):
    with open(path.join(outdir, fname), 'w') as fh: fh.write(s)
info = {}
info['env_id'] = env.spec.id
info['params'] = dict(
    gamma = gamma,
    h1_actor = h1_actor,
    h2_actor = h2_actor,
    h3_actor = h3_actor,
    h1_critic = h1_critic,
    h2_critic = h2_critic,
    h3_critic = h3_critic,
    lr_actor = lr_actor,
    lr_critic = lr_critic,
    lr_decay = lr_decay,
    l2_reg_actor = l2_reg_actor,
    l2_reg_critic = l2_reg_critic,
    dropout_actor = dropout_actor,
    dropout_critic = dropout_critic,
    num_episodes = num_episodes,
    max_steps_ep = max_steps_ep,
    tau = tau,
    train_every = train_every,
    replay_memory_capacity = replay_memory_capacity,
    minibatch_size = minibatch_size,
    initial_noise_scale = initial_noise_scale,
    noise_decay = noise_decay,
    exploration_mu = exploration_mu,
    exploration_theta = exploration_theta,
    exploration_sigma = exploration_sigma
)
np.set_printoptions(threshold=sys.maxsize) # print full arrays (np.nan is no longer a valid threshold in newer NumPy)
replay_memory = deque(maxlen=replay_memory_capacity) # used for O(1) popleft() operation
def add_to_memory(experience):
    replay_memory.append(experience)
def sample_from_memory(minibatch_size):
    return random.sample(replay_memory, minibatch_size)
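# deque(maxlen=...) evicts the oldest experience once capacity is reached;
# random.sample draws a uniform random minibatch without replacement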
#####################################################################################################
## Tensorflow
tf.reset_default_graph()
# placeholders
state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim])
action_ph = tf.placeholder(dtype=tf.float32, shape=[None,action_dim])
reward_ph = tf.placeholder(dtype=tf.float32, shape=[None])
next_state_ph = tf.placeholder(dtype=tf.float32, shape=[None,state_dim])
is_not_terminal_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # indicators (go into target computation)
is_training_ph = tf.placeholder(dtype=tf.bool, shape=()) # for dropout
# episode counter
episodes = tf.Variable(0.0, trainable=False, name='episodes')
episode_inc_op = episodes.assign_add(1)
# will use this to initialize both the actor network and its slowly-changing target network with the same structure
def generate_actor_network(s, trainable, reuse):
    hidden = tf.layers.dense(s, h1_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense', reuse = reuse)
    hidden_drop = tf.layers.dropout(hidden, rate = dropout_actor, training = trainable & is_training_ph)
    hidden_2 = tf.layers.dense(hidden_drop, h2_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense_1', reuse = reuse)
    hidden_drop_2 = tf.layers.dropout(hidden_2, rate = dropout_actor, training = trainable & is_training_ph)
    hidden_3 = tf.layers.dense(hidden_drop_2, h3_actor, activation = tf.nn.relu, trainable = trainable, name = 'dense_2', reuse = reuse)
    hidden_drop_3 = tf.layers.dropout(hidden_3, rate = dropout_actor, training = trainable & is_training_ph)
    actions_unscaled = tf.layers.dense(hidden_drop_3, action_dim, trainable = trainable, name = 'dense_3', reuse = reuse)
    actions = env.action_space.low + tf.nn.sigmoid(actions_unscaled)*(env.action_space.high - env.action_space.low) # bound the actions to the valid range
    return actions
# actor network
with tf.variable_scope('actor'):
    # Policy's outputted action for each state_ph (for generating actions and training the critic)
    actions = generate_actor_network(state_ph, trainable = True, reuse = False)
# slow target actor network
with tf.variable_scope('slow_target_actor', reuse=False):
    # Slow target policy's outputted action for each next_state_ph (for training the critic)
    # use stop_gradient to treat the output values as constant targets when doing backprop
    slow_target_next_actions = tf.stop_gradient(generate_actor_network(next_state_ph, trainable = False, reuse = False))
# will use this to initialize both the critic network and its slowly-changing target network with the same structure
def generate_critic_network(s, a, trainable, reuse):
    state_action = tf.concat([s, a], axis=1)
    hidden = tf.layers.dense(state_action, h1_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense', reuse = reuse)
    hidden_drop = tf.layers.dropout(hidden, rate = dropout_critic, training = trainable & is_training_ph)
    hidden_2 = tf.layers.dense(hidden_drop, h2_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense_1', reuse = reuse)
    hidden_drop_2 = tf.layers.dropout(hidden_2, rate = dropout_critic, training = trainable & is_training_ph)
    hidden_3 = tf.layers.dense(hidden_drop_2, h3_critic, activation = tf.nn.relu, trainable = trainable, name = 'dense_2', reuse = reuse)
    hidden_drop_3 = tf.layers.dropout(hidden_3, rate = dropout_critic, training = trainable & is_training_ph)
    q_values = tf.layers.dense(hidden_drop_3, 1, trainable = trainable, name = 'dense_3', reuse = reuse)
    return q_values
with tf.variable_scope('critic') as scope:
    # Critic applied to state_ph and a given action (for training critic)
    q_values_of_given_actions = generate_critic_network(state_ph, action_ph, trainable = True, reuse = False)
    # Critic applied to state_ph and the current policy's outputted actions for state_ph (for training actor via deterministic policy gradient)
    q_values_of_suggested_actions = generate_critic_network(state_ph, actions, trainable = True, reuse = True)
# slow target critic network
with tf.variable_scope('slow_target_critic', reuse=False):
    # Slow target critic applied to slow target actor's outputted actions for next_state_ph (for training critic)
    slow_q_values_next = tf.stop_gradient(generate_critic_network(next_state_ph, slow_target_next_actions, trainable = False, reuse = False))
# isolate vars for each network
actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
slow_target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_actor')
critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic')
slow_target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='slow_target_critic')
# update values for slowly-changing targets towards current actor and critic
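# (Polyak averaging: slow_var <- tau*var + (1 - tau)*slow_var for every network variable)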
update_slow_target_ops = []
for i, slow_target_actor_var in enumerate(slow_target_actor_vars):
    update_slow_target_actor_op = slow_target_actor_var.assign(tau*actor_vars[i]+(1-tau)*slow_target_actor_var)
    update_slow_target_ops.append(update_slow_target_actor_op)
for i, slow_target_var in enumerate(slow_target_critic_vars):
    update_slow_target_critic_op = slow_target_var.assign(tau*critic_vars[i]+(1-tau)*slow_target_var)
    update_slow_target_ops.append(update_slow_target_critic_op)
update_slow_targets_op = tf.group(*update_slow_target_ops, name='update_slow_targets')
# One step TD targets y_i for (s,a) from experience replay
# = r_i + gamma*Q_slow(s',mu_slow(s')) if s' is not terminal
# = r_i if s' terminal
targets = tf.expand_dims(reward_ph, 1) + tf.expand_dims(is_not_terminal_ph, 1) * gamma * slow_q_values_next
# 1-step temporal difference errors
td_errors = targets - q_values_of_given_actions
# critic loss function (mean-square value error with regularization)
critic_loss = tf.reduce_mean(tf.square(td_errors))
for var in critic_vars:
    if 'bias' not in var.name:
        critic_loss += l2_reg_critic * 0.5 * tf.nn.l2_loss(var)
# critic optimizer
critic_train_op = tf.train.AdamOptimizer(lr_critic*lr_decay**episodes).minimize(critic_loss)
# actor loss function (mean Q-values under current policy with regularization)
actor_loss = -1*tf.reduce_mean(q_values_of_suggested_actions)
for var in actor_vars:
    if 'bias' not in var.name:
        actor_loss += l2_reg_actor * 0.5 * tf.nn.l2_loss(var)
# actor optimizer
# the gradient of the mean Q-values wrt actor params is the deterministic policy gradient (keeping critic params fixed)
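# i.e. grad_theta(J) = E[grad_a(Q(s,a))|a=mu(s) * grad_theta(mu(s))]; minimizing actor_loss over
# var_list=actor_vars computes this by backpropagating through the (held-fixed) critic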
actor_train_op = tf.train.AdamOptimizer(lr_actor*lr_decay**episodes).minimize(actor_loss, var_list=actor_vars)
# initialize session
sess = tf.Session()
sess.run(tf.global_variables_initializer())
#####################################################################################################
## Training
total_steps = 0
for ep in range(num_episodes):
    total_reward = 0
    steps_in_ep = 0
    # Initialize exploration noise process
    noise_process = np.zeros(action_dim)
    noise_scale = (initial_noise_scale * noise_decay**ep) * (env.action_space.high - env.action_space.low)
    # Initial state
    observation = env.reset()
    if ep%10 == 0: env.render()
    for t in range(max_steps_ep):
        # choose action based on deterministic policy
        action_for_state, = sess.run(actions,
            feed_dict = {state_ph: observation[None], is_training_ph: False})
        # add temporally-correlated exploration noise to action (using an Ornstein-Uhlenbeck process)
        # print(action_for_state)
        noise_process = exploration_theta*(exploration_mu - noise_process) + exploration_sigma*np.random.randn(action_dim)
        # print(noise_scale*noise_process)
        action_for_state += noise_scale*noise_process
        # take step
        next_observation, reward, done, _info = env.step(action_for_state)
        if ep%10 == 0: env.render()
        total_reward += reward
        add_to_memory((observation, action_for_state, reward, next_observation,
            # is next_observation a terminal state?
            # 0.0 if done and not env.env._past_limit() else 1.0))
            0.0 if done else 1.0))
        # update network weights to fit a minibatch of experience
        if total_steps%train_every == 0 and len(replay_memory) >= minibatch_size:
            # grab N (s,a,r,s') tuples from replay memory
            minibatch = sample_from_memory(minibatch_size)
            # update the critic and actor params using mean-square value error and deterministic policy gradient, respectively
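            # (is_training_ph=True turns dropout on in tf.layers.dropout during these updates;
            # with the default dropout rates of 0 this is a no-op)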
            _, _ = sess.run([critic_train_op, actor_train_op],
                feed_dict = {
                    state_ph: np.asarray([elem[0] for elem in minibatch]),
                    action_ph: np.asarray([elem[1] for elem in minibatch]),
                    reward_ph: np.asarray([elem[2] for elem in minibatch]),
                    next_state_ph: np.asarray([elem[3] for elem in minibatch]),
                    is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch]),
                    is_training_ph: True})
            # update slow actor and critic targets towards current actor and critic
            _ = sess.run(update_slow_targets_op)
        observation = next_observation
        total_steps += 1
        steps_in_ep += 1
        if done:
            # Increment episode counter
            _ = sess.run(episode_inc_op)
            break
    print('Episode %2i, Reward: %7.3f, Steps: %i, Final noise scale: %7.3f'%(ep,total_reward,steps_in_ep, noise_scale))
# Finalize and upload results
writefile('info.json', json.dumps(info))
env.close()
gym.upload(outdir)