
Commit

Update AgileRL tutorials
nicku-a committed Jun 24, 2024
1 parent 1282a0a commit 2fa58e4
Showing 6 changed files with 398 additions and 320 deletions.
213 changes: 112 additions & 101 deletions in tutorials/AgileRL/agilerl_dqn_curriculum.py

Large diffs are not rendered by default.

210 changes: 131 additions & 79 deletions in tutorials/AgileRL/agilerl_maddpg.py
@@ -2,22 +2,23 @@
Authors: Michael (https://github.com/mikepratt1), Nick (https://github.com/nicku-a)
"""

import os

import numpy as np
import supersuit as ss
import torch
from pettingzoo.atari import space_invaders_v2
from tqdm import trange

from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
from agilerl.hpo.mutation import Mutations
from agilerl.hpo.tournament import TournamentSelection
from agilerl.utils.utils import initialPopulation
from tqdm import trange

from pettingzoo.atari import space_invaders_v2
from agilerl.utils.utils import create_population
from agilerl.wrappers.pettingzoo_wrappers import PettingZooVectorizationParallelWrapper

if __name__ == "__main__":
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("===== AgileRL MADDPG Demo =====")

# Define the network configuration
NET_CONFIG = {
@@ -35,15 +36,21 @@
"ALGO": "MADDPG", # Algorithm
# Swap image channels dimension from last to first [H, W, C] -> [C, H, W]
"CHANNELS_LAST": True,
"BATCH_SIZE": 8, # Batch size
"BATCH_SIZE": 32, # Batch size
"O_U_NOISE": True, # Ornstein Uhlenbeck action noise
"EXPL_NOISE": 0.1, # Action noise scale
"MEAN_NOISE": 0.0, # Mean action noise
"THETA": 0.15, # Rate of mean reversion in OU noise
"DT": 0.01, # Timestep for OU noise
"LR_ACTOR": 0.001, # Actor learning rate
"LR_CRITIC": 0.01, # Critic learning rate
"LR_CRITIC": 0.001, # Critic learning rate
"GAMMA": 0.95, # Discount factor
"MEMORY_SIZE": 10000, # Max memory buffer size
"LEARN_STEP": 5, # Learning frequency
"MEMORY_SIZE": 100000, # Max memory buffer size
"LEARN_STEP": 100, # Learning frequency
"TAU": 0.01, # For soft update of target parameters
}
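The Ornstein Uhlenbeck settings above (EXPL_NOISE, MEAN_NOISE, THETA, DT) drive a mean-reverting noise process added to actions during exploration, and TAU controls how quickly the target networks track the online networks. A minimal standalone sketch of both ideas, for illustration only (not AgileRL's internal implementation):

import numpy as np
import torch.nn as nn

def ou_step(x, mu=0.0, theta=0.15, sigma=0.1, dt=0.01):
    # One Euler-Maruyama step of an Ornstein-Uhlenbeck process:
    # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
    return x + theta * (mu - x) * dt + sigma * np.sqrt(dt) * np.random.randn(*np.shape(x))

def soft_update(target_net: nn.Module, online_net: nn.Module, tau=0.01):
    # target <- tau * online + (1 - tau) * target, parameter by parameter
    for t_param, o_param in zip(target_net.parameters(), online_net.parameters()):
        t_param.data.copy_(tau * o_param.data + (1.0 - tau) * t_param.data)

noise = np.zeros(6)  # e.g. one noise value per action dimension
for _ in range(5):
    noise = ou_step(noise)  # drifts back towards the mean over time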

num_envs = 8
# Define the space invaders environment as a parallel environment
env = space_invaders_v2.parallel_env()
if INIT_HP["CHANNELS_LAST"]:
@@ -53,6 +60,7 @@
env = ss.color_reduction_v0(env, mode="B")
env = ss.resize_v1(env, x_size=84, y_size=84)
env = ss.frame_stack_v1(env, 4)
env = PettingZooVectorizationParallelWrapper(env, n_envs=num_envs)
env.reset()
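As a quick sanity check on the preprocessing pipeline above, each agent's observation should come back as four stacked 84x84 grayscale frames, with the vectorisation wrapper adding a leading environment dimension. A hedged sketch of inspecting those shapes (the exact shapes, assumed here to be (num_envs, 84, 84, 4) channels-last, depend on the wrapper):

state, info = env.reset()
for agent_id, obs in state.items():
    print(agent_id, obs.shape)             # expected: (8, 84, 84, 4) with num_envs = 8
    channels_first = np.moveaxis(obs, [-1], [-3])
    print(agent_id, channels_first.shape)  # expected: (8, 4, 84, 84), as the CNN expects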

# Configure the multi-agent algo input arguments
@@ -84,14 +92,15 @@
INIT_HP["AGENT_IDS"] = env.agents

# Create a population ready for evolutionary hyper-parameter optimisation
pop = initialPopulation(
pop = create_population(
INIT_HP["ALGO"],
state_dim,
action_dim,
one_hot,
NET_CONFIG,
INIT_HP,
population_size=INIT_HP["POPULATION_SIZE"],
num_envs=num_envs,
device=device,
)

@@ -109,8 +118,8 @@
tournament_size=2, # Tournament selection size
elitism=True, # Elitism in tournament selection
population_size=INIT_HP["POPULATION_SIZE"], # Population size
evo_step=1,
) # Evaluate using last N fitness scores
eval_loop=1, # Evaluate using last N fitness scores
)
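Elitist tournament selection keeps the best-performing agent unchanged and fills the rest of the next generation by repeatedly sampling small tournaments and copying the winners. An illustrative sketch of the idea (not AgileRL's exact implementation), assuming each agent keeps a fitness history as in the loop below:

import copy
import random

import numpy as np

def tournament_select(pop, tournament_size=2, eval_loop=1):
    # Rank agents by the mean of their last `eval_loop` fitness scores
    fitnesses = [np.mean(agent.fitness[-eval_loop:]) for agent in pop]
    elite = pop[int(np.argmax(fitnesses))]
    new_pop = [copy.deepcopy(elite)]  # elitism: carry the best agent over unchanged
    while len(new_pop) < len(pop):
        contenders = random.sample(range(len(pop)), tournament_size)
        winner = max(contenders, key=lambda i: fitnesses[i])
        new_pop.append(copy.deepcopy(pop[winner]))
    return elite, new_pop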

# Instantiate a mutations object (used for HPO)
mutations = Mutations(
@@ -128,7 +137,7 @@
], # RL hyperparams selected for mutation
mutation_sd=0.1, # Mutation strength
# Define search space for each hyperparameter
min_lr=0.0001,
min_lr=0.00001,
max_lr=0.01,
min_learn_step=1,
max_learn_step=120,
@@ -141,26 +150,32 @@
)
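The Mutations object perturbs network architectures, activations and the RL hyperparameters listed above, keeping each value inside its declared search range. A toy illustration of a single learning-rate mutation under those bounds (purely illustrative, not the library's code):

import random

import numpy as np

def mutate_lr(lr, mutation_sd=0.1, min_lr=0.00001, max_lr=0.01):
    # Multiplicative Gaussian perturbation, clipped to the search space
    new_lr = lr * (1.0 + random.gauss(0.0, mutation_sd))
    return float(np.clip(new_lr, min_lr, max_lr))

print(mutate_lr(0.001))  # stays within [0.00001, 0.01]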

# Define training loop parameters
max_episodes = 5 # Total episodes (default: 6000)
max_steps = 900 # Maximum steps to take in each episode
epsilon = 1.0 # Starting epsilon value
eps_end = 0.1 # Final epsilon value
eps_decay = 0.995 # Epsilon decay
evo_epochs = 20 # Evolution frequency
evo_loop = 1 # Number of evaluation episodes
max_steps = 4500 # Max steps (default: 2000000)
learning_delay = 500 # Steps before starting learning
evo_steps = 10000 # Evolution frequency
eval_steps = None # Evaluation steps per episode - go until done
eval_loop = 1 # Number of evaluation episodes
elite = pop[0] # Assign a placeholder "elite" agent
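Because the environment is vectorised, the counters below are expressed in multiples of num_envs: one call to env.step() advances all eight environments at once, so the inner loop runs evo_steps // num_envs iterations per evolution cycle, and a LEARN_STEP of 100 works out to one learning update roughly every 100 // 8 = 12 vectorised steps. A small sketch of that bookkeeping, assuming the hyperparameters defined above:

num_envs = 8
evo_steps = 10000
learn_step = 100  # INIT_HP["LEARN_STEP"]

inner_iterations = evo_steps // num_envs        # 1250 env.step() calls per evolution cycle
steps_per_cycle = inner_iterations * num_envs   # 10000 environment steps counted per agent
learn_every = max(1, learn_step // num_envs)    # a learning update every 12 vectorised steps
print(inner_iterations, steps_per_cycle, learn_every)  # 1250 10000 12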

# Training loop
for idx_epi in trange(max_episodes):
total_steps = 0

# TRAINING LOOP
print("Training...")
pbar = trange(max_steps, unit="step")
while np.less([agent.steps[-1] for agent in pop], max_steps).all():
pop_episode_scores = []
for agent in pop: # Loop through population
state, info = env.reset() # Reset environment at start of episode
agent_reward = {agent_id: 0 for agent_id in env.agents}
scores = np.zeros(num_envs)
completed_episode_scores = []
steps = 0
if INIT_HP["CHANNELS_LAST"]:
state = {
agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3])
agent_id: np.moveaxis(s, [-1], [-3])
for agent_id, s in state.items()
}
for _ in range(max_steps):

for idx_step in range(evo_steps // num_envs):
agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None
env_defined_actions = (
info["env_defined_actions"]
@@ -169,87 +184,124 @@
)

# Get next action from agent
cont_actions, discrete_action = agent.getAction(
state, epsilon, agent_mask, env_defined_actions
cont_actions, discrete_action = agent.get_action(
states=state,
training=True,
agent_mask=agent_mask,
env_defined_actions=env_defined_actions,
)
if agent.discrete_actions:
action = discrete_action
else:
action = cont_actions

next_state, reward, termination, truncation, info = env.step(
action
) # Act in environment
# Act in environment
next_state, reward, termination, truncation, info = env.step(action)
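# With the vectorised wrapper, reward is a dict of per-agent arrays of shape
# (num_envs,); stacking and transposing gives (num_envs, num_agents), so the
# accumulation below adds every agent's reward for each environment.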

scores += np.sum(np.array(list(reward.values())).transpose(), axis=-1)
total_steps += num_envs
steps += num_envs

# Image processing if necessary for the environment
if INIT_HP["CHANNELS_LAST"]:
state = {agent_id: np.squeeze(s) for agent_id, s in state.items()}
next_state = {
agent_id: np.moveaxis(ns, [-1], [-3])
for agent_id, ns in next_state.items()
}

# Save experiences to replay buffer
memory.save2memory(state, cont_actions, reward, next_state, termination)

# Collect the reward
for agent_id, r in reward.items():
agent_reward[agent_id] += r
memory.save_to_memory(
state,
cont_actions,
reward,
next_state,
termination,
is_vectorised=True,
)

# Learn according to learning frequency
if (memory.counter % agent.learn_step == 0) and (
len(memory) >= agent.batch_size
# Handle learn steps > num_envs
if agent.learn_step > num_envs:
learn_step = agent.learn_step // num_envs
if (
idx_step % learn_step == 0
and len(memory) >= agent.batch_size
and memory.counter > learning_delay
):
# Sample replay buffer
experiences = memory.sample(agent.batch_size)
# Learn according to agent's RL algorithm
agent.learn(experiences)
# Handle num_envs > learn step; learn multiple times per step in env
elif (
len(memory) >= agent.batch_size and memory.counter > learning_delay
):
experiences = memory.sample(
agent.batch_size
) # Sample replay buffer
agent.learn(experiences) # Learn according to agent's RL algorithm
for _ in range(num_envs // agent.learn_step):
# Sample replay buffer
experiences = memory.sample(agent.batch_size)
# Learn according to agent's RL algorithm
agent.learn(experiences)
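# Worked example for the branching above: with LEARN_STEP = 100 and num_envs = 8,
# agent.learn_step > num_envs, so the first branch fires every 100 // 8 = 12
# inner-loop iterations, i.e. after roughly 96 new transitions have been buffered.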

# Update the state
if INIT_HP["CHANNELS_LAST"]:
next_state = {
agent_id: np.expand_dims(ns, 0)
for agent_id, ns in next_state.items()
}
state = next_state

# Stop episode if any agents have terminated
if any(truncation.values()) or any(termination.values()):
break

# Save the total episode reward
score = sum(agent_reward.values())
agent.scores.append(score)

# Update epsilon for exploration
epsilon = max(eps_end, epsilon * eps_decay)

# Now evolve population if necessary
if (idx_epi + 1) % evo_epochs == 0:
# Evaluate population
fitnesses = [
agent.test(
env,
swap_channels=INIT_HP["CHANNELS_LAST"],
max_steps=max_steps,
loop=evo_loop,
)
for agent in pop
]
# Calculate scores and reset noise for finished episodes
reset_noise_indices = []
term_array = np.array(list(termination.values())).transpose()
trunc_array = np.array(list(truncation.values())).transpose()
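# term_array and trunc_array have shape (num_envs, num_agents): an episode in
# environment idx is treated as finished when any of its agents terminates or truncates.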
for idx, (d, t) in enumerate(zip(term_array, trunc_array)):
if np.any(d) or np.any(t):
completed_episode_scores.append(scores[idx])
agent.scores.append(scores[idx])
scores[idx] = 0
reset_noise_indices.append(idx)
agent.reset_action_noise(reset_noise_indices)

pbar.update(evo_steps // len(pop))

agent.steps[-1] += steps
pop_episode_scores.append(completed_episode_scores)

print(f"Episode {idx_epi + 1}/{max_episodes}")
print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}')
print(
f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}'
# Evaluate population
fitnesses = [
agent.test(
env,
swap_channels=INIT_HP["CHANNELS_LAST"],
max_steps=eval_steps,
loop=eval_loop,
)
for agent in pop
]
mean_scores = [
(
np.mean(episode_scores)
if len(episode_scores) > 0
else "0 completed episodes"
)
for episode_scores in pop_episode_scores
]

# Tournament selection and population mutation
elite, pop = tournament.select(pop)
pop = mutations.mutation(pop)
print(f"--- Global steps {total_steps} ---")
print(f"Steps {[agent.steps[-1] for agent in pop]}")
print(f"Scores: {mean_scores}")
print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}')
print(
f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop]}'
)

# Tournament selection and population mutation
elite, pop = tournament.select(pop)
pop = mutations.mutation(pop)

# Update step counter
for agent in pop:
agent.steps.append(agent.steps[-1])

# Save the trained algorithm
path = "./models/MADDPG"
filename = "MADDPG_trained_agent.pt"
os.makedirs(path, exist_ok=True)
save_path = os.path.join(path, filename)
elite.saveCheckpoint(save_path)
elite.save_checkpoint(save_path)

pbar.close()
env.close()
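To reuse the trained elite agent later, the checkpoint can be restored into an agent built with the same configuration. A minimal, hedged round-trip sketch; load_checkpoint is assumed here to mirror save_checkpoint (check the AgileRL docs for the exact restore API in your version):

# Illustrative only: restore the saved weights into an agent with the same config.
# `load_checkpoint` is assumed to be the counterpart of `save_checkpoint` used above.
elite.load_checkpoint(save_path)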
