diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..ab430aa8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "regym/thirdparty/ReferentialGym"] + path = regym/thirdparty/ReferentialGym + url = https://github.com/Near32/ReferentialGym diff --git a/benchmark/DQN/atari_10M/updated_benchmark_nbitsswap.py b/benchmark/DQN/atari_10M/updated_benchmark_nbitsswap.py new file mode 100644 index 00000000..6ddb07da --- /dev/null +++ b/benchmark/DQN/atari_10M/updated_benchmark_nbitsswap.py @@ -0,0 +1,492 @@ +from typing import Dict, Any, Optional, List, Callable +import logging +import yaml +import os +import sys +from typing import Dict + +import torch.multiprocessing + +from tensorboardX import SummaryWriter +from tqdm import tqdm +from functools import partial + + +import torch +import numpy as np +import random +import gym + +import regym +from regym.environments import generate_task, EnvType +from regym.rl_loops.singleagent_loops import rl_loop +from regym.rl_loops.multiagent_loops import marl_loop + +from regym.util.experiment_parsing import initialize_agents + +from regym.util.wrappers import ClipRewardEnv, PreviousRewardActionInfoMultiAgentWrapper + +import ray + +from regym.modules import EnvironmentModule, CurrentAgentsModule +from regym.pubsub_manager import PubSubManager + + +from regym.util.wrappers import baseline_atari_pixelwrap + + +def make_rl_pubsubmanager( + agents, + config, + logger=None, + load_path=None, + save_path=None): + """ + Create a PubSubManager. + :param agents: List of Agents to use in the rl loop. + :param config: Dict that specifies all the important hyperparameters of the network. + - "task" + - "sad" + - "vdn" + - "max_obs_count" + - "sum_writer": str where to save the summary... + + """ + modules = config.pop("modules") + + cam_id = "current_agents" + modules[cam_id] = CurrentAgentsModule( + id=cam_id, + agents=agents + ) + + envm_id = "EnvironmentModule_0" + envm_input_stream_ids = { + #"logger":"modules:logger:ref", + #"logs_dict":"logs_dict", + + "iteration":"signals:iteration", + + "current_agents":f"modules:{cam_id}:ref", + } + modules[envm_id] = EnvironmentModule( + id=envm_id, + config=config, + input_stream_ids=envm_input_stream_ids + ) + + pipelines = config.pop("pipelines") + + pipelines["rl_loop_0"] = [ + envm_id, + ] + + optim_id = "global_optim" + optim_config = { + "modules":modules, + "learning_rate":3e-4, + "optimizer_type":'adam', + "with_gradient_clip":False, + "adam_eps":1e-16, + } + + optim_module = regym.modules.build_OptimizationModule( + id=optim_id, + config=optim_config, + ) + modules[optim_id] = optim_module + + logger_id = "per_epoch_logger" + logger_module = regym.modules.build_PerEpochLoggerModule(id=logger_id) + modules[logger_id] = logger_module + + pipelines[optim_id] = [] + pipelines[optim_id].append(optim_id) + pipelines[optim_id].append(logger_id) + + pbm = PubSubManager( + config=config, + modules=modules, + pipelines=pipelines, + logger=logger, + load_path=load_path, + save_path=save_path, + ) + + return pbm + +class SingleObservationWrapper(gym.Wrapper): + """ + Assumes the :arg env: environment to have a Dict observation space, + that contains the key :arg observation_key:. + This wrapper makes the observation space consisting of solely the :arg observation_key: entry, + while the other entries are put in the infos dictionnary. + Args: + env (gym.Env): Env to wrap. 
+ observation_key (str): key to the actual observation + """ + + def __init__(self, env, observation_key): + super(SingleObservationWrapper, self).__init__(env) + self.observation_key = observation_key + self.observation_space = env.observation_space.spaces[self.observation_key] + + self.action_space = env.action_space + + def reset(self, **kwargs): + observations, infos = self.env.reset(**kwargs) + + new_observations = observations[self.observation_key] + + for k,v in observations.items(): + if k==self.observation_key: continue + infos[k] = np.expand_dims(np.array(v), axis=0) + + return new_observations, infos + + def step(self, action): + next_observations, reward, done, next_infos = self.env.step(action) + + new_next_observations = next_observations[self.observation_key] + + for k,v in next_observations.items(): + if k==self.observation_key: continue + next_infos[k] = np.expand_dims(np.array(v), axis=0) + + return new_next_observations, reward, done, next_infos + + def render(self, mode='human', **kwargs): + env = self.unwrapped + return env.render( + mode=mode, + **kwargs, + ) + + +def env_r2d2_wrap( + env, + env_wrapper, + clip_reward=False, + previous_reward_action=True, + ): + env = env_wrapper(env) + + if clip_reward: + env = ClipRewardEnv(env) + + if previous_reward_action: + env = PreviousRewardActionInfoMultiAgentWrapper(env=env) + + return SingleObservationWrapper(env=env, observation_key="observation") + + +def check_path_for_agent(filepath): + #filepath = os.path.join(path,filename) + agent = None + offset_episode_count = 0 + if os.path.isfile(filepath): + print('==> loading checkpoint {}'.format(filepath)) + agent = torch.load(filepath) + offset_episode_count = agent.episode_count + #setattr(agent, 'episode_count', offset_episode_count) + print('==> loaded checkpoint {}'.format(filepath)) + return agent, offset_episode_count + + +def train_and_evaluate(agent: object, + task: object, + sum_writer: object, + base_path: str, + offset_episode_count: int = 0, + nbr_pretraining_steps: int = 0, + nbr_max_observations: int = 1e7, + test_obs_interval: int = 1e4, + test_nbr_episode: int = 10, + benchmarking_record_episode_interval: int = None, + render_mode="rgb_array", + step_hooks=[]): + pubsub = False + if len(sys.argv) > 2: + pubsub = any(['pubsub' in arg for arg in sys.argv]) + + if pubsub: + import ipdb; ipdb.set_trace() + config = { + "modules": {}, + "pipelines": {}, + } + + config['training'] = True + config['env_configs'] = None + config['task'] = task + + sum_writer_path = os.path.join(sum_writer, 'actor.log') + sum_writer = config['sum_writer'] = SummaryWriter(sum_writer_path, flush_secs=1) + + config['base_path'] = base_path + config['offset_episode_count'] = offset_episode_count + config['nbr_pretraining_steps'] = nbr_pretraining_steps + config['max_obs_count'] = nbr_max_observations + config['test_obs_interval'] = test_obs_interval + config['test_nbr_episode'] = test_nbr_episode + config['benchmarking_record_episode_interval'] = benchmarking_record_episode_interval + config['render_mode'] = render_mode + config['step_hooks'] = step_hooks + config['save_traj_length_divider'] = 1 + config['nbr_players'] = 1 + pubsubmanager = make_rl_pubsubmanager( + agents=[agent], + config=config, + logger=sum_writer, + ) + + pubsubmanager.train() + + trained_agent = agent + else: + async = False + if len(sys.argv) > 2: + async = any(['async' in arg for arg in sys.argv]) + + if async: + trained_agent = rl_loop.async_gather_experience_parallel1( + task, + agent, + training=True, + 
#nbr_pretraining_steps=nbr_pretraining_steps, + max_obs_count=nbr_max_observations, + env_configs=None, + sum_writer=sum_writer, + base_path=base_path, + test_obs_interval=test_obs_interval, + test_nbr_episode=test_nbr_episode, + benchmarking_record_episode_interval=benchmarking_record_episode_interval, + save_traj_length_divider=1, + render_mode=render_mode, + step_hooks=step_hooks, + ) + else: + trained_agent = rl_loop.gather_experience_parallel( + task=task, + agent=agent, + training=True, + #nbr_pretraining_steps=nbr_pretraining_steps, + max_obs_count=nbr_max_observations, + env_configs=None, + sum_writer=sum_writer, + base_path=base_path, + test_obs_interval=test_obs_interval, + test_nbr_episode=test_nbr_episode, + benchmarking_record_episode_interval=benchmarking_record_episode_interval, + save_traj_length_divider=1, + render_mode=render_mode, + step_hooks=step_hooks, + ) + + save_replay_buffer = False + if len(sys.argv) > 2: + save_replay_buffer = any(['save_replay_buffer' in arg for arg in sys.argv]) + + for agent in trained_agents: + agent.save(with_replay_buffer=save_replay_buffer) + print(f"Agent saved at: {agent.save_path}") + + task.env.close() + task.test_env.close() + + return trained_agent + + +def training_process(agent_config: Dict, + task_config: Dict, + benchmarking_interval: int = 1e4, + benchmarking_episodes: int = 10, + benchmarking_record_episode_interval: int = None, + train_observation_budget: int = 1e7, + base_path: str = './', + seed: int = 0): + pubsub = False + test_only = False + if len(sys.argv) > 2: + pubsub = any(['pubsub' in arg for arg in sys.argv]) + test_only = any(['test_only' in arg for arg in sys.argv]) + + if test_only: + base_path = os.path.join(base_path,"TESTING") + else: + base_path = os.path.join(base_path,"TRAINING") + + if pubsub: + base_path = os.path.join(base_path,"PUBSUB") + else: + base_path = os.path.join(base_path,"NOPUBSUB") + + print(f"Final Path: -- {base_path} --") + + if not os.path.exists(base_path): os.makedirs(base_path) + + np.random.seed(seed) + torch.manual_seed(seed) + random.seed(seed) + if hasattr(torch.backends, "cudnn"): + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + pixel_wrapping_fn = partial(baseline_atari_pixelwrap, + size=task_config['observation_resize_dim'], + skip=task_config['nbr_frame_skipping'], + stack=task_config['nbr_frame_stacking'], + grayscale=task_config['grayscale'], + single_life_episode=task_config['single_life_episode'], + nbr_max_random_steps=task_config['nbr_max_random_steps'], + clip_reward=task_config['clip_reward']) + pixel_wrapping_fn = partial(env_r2d2_wrap, + env_wrapper=pixel_wrapping_fn, + previous_reward_action=task_config.get('previous_reward_action', False) + ) + + test_pixel_wrapping_fn = partial(baseline_atari_pixelwrap, + size=task_config['observation_resize_dim'], + skip=task_config['nbr_frame_skipping'], + stack=task_config['nbr_frame_stacking'], + grayscale=task_config['grayscale'], + single_life_episode=False, + nbr_max_random_steps=task_config['nbr_max_random_steps'], + clip_reward=False) + test_pixel_wrapping_fn = partial(env_r2d2_wrap, + env_wrapper=test_pixel_wrapping_fn, + previous_reward_action=task_config.get('previous_reward_action', False) + ) + + video_recording_dirpath = os.path.join(base_path,'videos') + video_recording_render_mode = 'human_comm' + task = generate_task( + task_config['env-id'], + env_type=EnvType.SINGLE_AGENT, + nbr_parallel_env=task_config['nbr_actor'], + wrapping_fn=pixel_wrapping_fn, + 
test_wrapping_fn=test_pixel_wrapping_fn, + seed=seed, + test_seed=100+seed, + gathering=True, + train_video_recording_episode_period=benchmarking_record_episode_interval, + train_video_recording_dirpath=video_recording_dirpath, + train_video_recording_render_mode=video_recording_render_mode, + ) + + agent_config['nbr_actor'] = task_config['nbr_actor'] + + regym.RegymSummaryWriterPath = base_path #regym.RegymSummaryWriter = GlobalSummaryWriter(base_path) + sum_writer = base_path + + save_path1 = os.path.join(base_path,f"./{task_config['agent-id']}.agent") + if task_config.get("reload", 'None')!='None': + import ipdb; ipdb.set_trace() + agent, offset_episode_count = check_path_for_agent(task_config["reload"]) + else: + agent, offset_episode_count = check_path_for_agent(save_path1) + + if agent is None: + agent = initialize_agents( + task=task, + agent_configurations={task_config['agent-id']: agent_config} + )[0] + agent.save_path = save_path1 + #regym.rl_algorithms.algorithms.DQN.dqn.summary_writer = sum_writer + + if test_only: + print(save_path1) + import ipdb; ipdb.set_trace() + agent.training = False + + trained_agent = train_and_evaluate( + agent=agent, + task=task, + sum_writer=sum_writer, + base_path=base_path, + offset_episode_count=offset_episode_count, + nbr_pretraining_steps=int(float(agent_config["nbr_pretraining_steps"])) if "nbr_pretraining_steps" in agent_config else 0, + nbr_max_observations=train_observation_budget, + test_obs_interval=benchmarking_interval, + test_nbr_episode=benchmarking_episodes, + benchmarking_record_episode_interval=benchmarking_record_episode_interval, + #render_mode="human_comm", + ) + + return trained_agent, task + + +def load_configs(config_file_path: str): + all_configs = yaml.load(open(config_file_path)) + + agents_config = all_configs['agents'] + experiment_config = all_configs['experiment'] + envs_config = experiment_config['tasks'] + + return experiment_config, agents_config, envs_config + + +def main(): + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger('NBitsSwap(MNIST) Benchmark') + + config_file_path = sys.argv[1] #'./atari_10M_benchmark_config.yaml' + experiment_config, agents_config, tasks_configs = load_configs(config_file_path) + + # Generate path for experiment + base_path = experiment_config['experiment_id'] + if not os.path.exists(base_path): os.mkdir(base_path) + + for task_config in tasks_configs: + agent_name = task_config['agent-id'] + env_name = task_config['env-id'] + run_name = task_config['run-id'] + path = f'{base_path}/{env_name}/{run_name}/{agent_name}' + print(f"Tentative Path: -- {path} --") + training_process(agents_config[task_config['agent-id']], task_config, + benchmarking_interval=int(float(experiment_config['benchmarking_interval'])), + benchmarking_episodes=int(float(experiment_config['benchmarking_episodes'])), + benchmarking_record_episode_interval=int(float(experiment_config['benchmarking_record_episode_interval'])), + train_observation_budget=int(float(experiment_config['train_observation_budget'])), + base_path=path, + seed=experiment_config['seed']) + +if __name__ == '__main__': + async = False + __spec__ = None + if len(sys.argv) > 2: + async = any(['async' in arg for arg in sys.argv]) + if async: + torch.multiprocessing.freeze_support() + torch.multiprocessing.set_start_method("forkserver", force=True) + #torch.multiprocessing.set_start_method("spawn", force=True) + ray.init() #local_mode=True) + + from regym import CustomManager as Manager + from multiprocessing.managers import 
SyncManager, MakeProxyType, public_methods + + # from regym.rl_algorithms.replay_buffers import SharedPrioritizedReplayStorage + # #SharedPrioritizedReplayStorageProxy = MakeProxyType("SharedPrioritizedReplayStorage", public_methods(SharedPrioritizedReplayStorage)) + # Manager.register("SharedPrioritizedReplayStorage", + # SharedPrioritizedReplayStorage,# SharedPrioritizedReplayStorageProxy) + # exposed=[ + # "get_beta", + # "get_tree_indices", + # "cat", + # "reset", + # "add_key", + # "total", + # "__len__", + # "priority", + # "sequence_priority", + # "update", + # "add", + # "sample", + # ] + # ) + # print("WARNING: SharedPrioritizedReplayStorage class has been registered with the RegymManager.") + + regym.RegymManager = Manager() + regym.RegymManager.start() + + main() diff --git a/benchmark/DQN/atari_10M/updated_bitsswap_benchmark_HER_final_config.yaml b/benchmark/DQN/atari_10M/updated_bitsswap_benchmark_HER_final_config.yaml new file mode 100644 index 00000000..1e9a6ac6 --- /dev/null +++ b/benchmark/DQN/atari_10M/updated_bitsswap_benchmark_HER_final_config.yaml @@ -0,0 +1,653 @@ +extra_hyperparameters: &extra_hyperparameters + lr_account_for_nbr_actor: False + weights_decay_lambda: 1.0 + # R2D2: weights_decay_lambda: 0.0 + weights_entropy_lambda: 0.0 #01 + + use_target_to_gather_data: False + goal_oriented: True + goal_state_shared_arch: False + goal_state_flattening: True + nbr_training_iteration_per_cycle: 40 + nbr_episode_per_cycle: 16 + + # HER: + HER_target_clamping: True + + #################################### + # New hyperparameters: + PER_compute_initial_priority: False + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: True + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + r2d2_bellman_target_SAD: False + + burn_in: True + sequence_replay_unroll_length: 80 + sequence_replay_overlap_length: 40 + sequence_replay_burn_in_length: 20 + + sequence_replay_PER_eta: 0.9 + + #vdn: False + #vdn_nbr_players: 2 + + ##################################### + +LargeCNN: &LargeCNN + #sad: True + + phi_arch: 'CNN' + # R2D2: + actor_arch: 'None' + critic_arch: 'None' + # R2D2: + #critic_arch: 'LSTM-RNN' + + goal_phi_arch: 'None' + + # Phi Body: + # phi_arch_channels: [32, 64, 64] + # phi_arch_kernels: [8, 4, 3] + # phi_arch_strides: [4, 2, 1] + # phi_arch_paddings: [1, 1, 1] + # phi_arch_feature_dim: 512 + # phi_arch_hidden_units: [512,] + #phi_arch_channels: ['BN32', 'BN64', 'BN64'] + phi_arch_channels: [32, 64, 64] + #phi_arch_kernels: [8, 4, 3] + phi_arch_kernels: [3, 3, 3] + #phi_arch_strides: [4, 2, 1] + phi_arch_strides: [2, 2, 1] + phi_arch_paddings: [1, 1, 1] + phi_arch_feature_dim: 512 + phi_arch_hidden_units: [] + + # Actor architecture: + actor_arch_hidden_units: [] + # Critic architecture: + critic_arch_hidden_units: [] + # R2D2: + # #critic_arch_feature_dim: 32 + # critic_arch_hidden_units: [512, 512] + + # Goal Phi Body: + goal_phi_arch_channels: [32, 64, 64] + goal_phi_arch_kernels: [8, 4, 3] + goal_phi_arch_strides: [4, 2, 1] + goal_phi_arch_paddings: [1, 1, 1] + goal_phi_arch_feature_dim: 512 + goal_phi_arch_hidden_units: [512,] + + # Critic architecture: + goal_critic_arch_hidden_units: [] + + extra_inputs_infos: { + 'previous_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'], + }, + 'previous_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'], + }, + 'desired_goal':{ + shape: ['task.observation_shape'], 
+ target_location: ['phi_body', 'extra_inputs'], + }, + ######################## + # WITH SAD: + ######################## + # 'greedy_action':{ + # shape: ['task.action_dim',], + # target_location: ['critic_body', 'extra_inputs'] + # }, + ######################## + ######################## + } + + # Dictionnaries of keys living inside the 'infos' OpenAI Gym's output. + # Value is a tuple where the first element is the expected shape of the extra input, + # and the second item is the location where the input should be stored in the framestate. + # Parsing of the shape will infer where to fetch the value when encountering a string. + + +smallMLP: &smallMLP + #sad: True + # R2D2: + actor_arch: 'None' + # R2D2: + #critic_arch: 'LSTM-RNN' + + # R2D2: + # #critic_arch_feature_dim: 32 + # critic_arch_hidden_units: [512, 512] + + + phi_arch: 'MLP' + critic_arch: 'None' + + goal_phi_arch: 'None' + + # Phi Body: + phi_arch_feature_dim: 256 + phi_arch_hidden_units: [256,] + + # Actor architecture: + actor_arch_hidden_units: [] + # Critic architecture: + critic_arch_hidden_units: [] + + # Goal Phi Body: + goal_phi_arch_feature_dim: 256 + goal_phi_arch_hidden_units: [256,] + + # Critic architecture: + goal_critic_arch_hidden_units: [] + + extra_inputs_infos: { + # 'previous_reward':{ + # shape: [1,], + # target_location: ['critic_body', 'extra_inputs'], + # }, + # 'previous_action':{ + # shape: ['task.action_dim',], + # target_location: ['critic_body', 'extra_inputs'], + # }, + 'desired_goal':{ + shape: ['task.observation_shape'], + target_location: ['phi_body', 'extra_inputs'], + }, + ######################## + # WITH SAD: + ######################## + # 'greedy_action':{ + # shape: ['task.action_dim',], + # target_location: ['critic_body', 'extra_inputs'] + # }, + ######################## + ######################## + } + + # Dictionnaries of keys living inside the 'infos' OpenAI Gym's output. + # Value is a tuple where the first element is the expected shape of the extra input, + # and the second item is the location where the input should be stored in the framestate. + # Parsing of the shape will infer where to fetch the value when encountering a string. 
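Note on the comment block above: each `extra_inputs_infos` entry names a field of the Gym `infos` dict, together with its expected shape and the location in the frame state where it should be stored, and string shapes such as 'task.observation_shape' are resolved against the task at initialisation time. As a rough, hypothetical illustration of that resolution step only (this is not regym's actual parser; `DummyTask` and `resolve_shape` are made-up names):

    # Illustrative sketch, assuming shapes are lists whose string entries of the
    # form 'task.<attr>' are looked up on the task object (made-up names below).
    from dataclasses import dataclass
    from typing import Any, Dict, List, Union

    @dataclass
    class DummyTask:
        action_dim: int = 6
        observation_shape: int = 10

    def resolve_shape(shape: List[Union[int, str]], task: Any) -> List[int]:
        """Replace 'task.<attr>' strings with the attribute value found on `task`."""
        resolved = []
        for dim in shape:
            if isinstance(dim, str) and dim.startswith('task.'):
                resolved.append(getattr(task, dim.split('.', 1)[1]))
            else:
                resolved.append(int(dim))
        return resolved

    extra_inputs_infos: Dict[str, Dict[str, Any]] = {
        'desired_goal': {
            'shape': ['task.observation_shape'],
            'target_location': ['phi_body', 'extra_inputs'],
        },
    }

    for key, info in extra_inputs_infos.items():
        print(key, resolve_shape(info['shape'], DummyTask()), info['target_location'])

Under these assumptions, 'task.observation_shape' resolves to 10 for the dummy task; the real framework would instead read the attribute off the generated task.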
+ + +dqn_LargeCNN_obs84_graclip5m1_b32_tau1m2_lr25m5: &dqn_LargeCNN_obs84_graclip5m1_b32_tau1m2_lr25m5 + double: False + dueling: False + noisy: False + n_step: 1 + + use_PER: False + PER_alpha: 0.6 + PER_beta: 1.0 + + replay_capacity: 1e6 + min_capacity: 1e4 + replay_period: 1 + + use_HER: False + HER_strategy: 'future-4' + + observation_resize_dim: None + goal_resize_dim: None + + discount: 0.99 + use_cuda: True + gradient_clip: 0.5 + batch_size: 32 + tau: 1.0e-2 + learning_rate: 2.5e-4 + adam_eps: 1.0e-8 + + epsstart: 1.0 + epsend: 0.01 #0.1 + epsdecay: 3000 #1000000 + + <<: *LargeCNN + <<: *extra_hyperparameters + + +dqn_smallMLP_obs84_graclip5m1_b32_tau1m2_lr25m5: &dqn_smallMLP_obs84_graclip5m1_b32_tau1m2_lr25m5 + double: False + dueling: False + noisy: False + n_step: 1 + + use_PER: False + PER_alpha: 0.6 + PER_beta: 1.0 + + replay_capacity: 1e6 + min_capacity: 128 #1e4 + replay_period: 240 #240 + + use_HER: False + HER_strategy: 'final-1' + #HER_strategy: 'future-4' + + observation_resize_dim: None + goal_resize_dim: None + + discount: 0.98 #0.99 + use_cuda: True + gradient_clip: 0.5 #0.5 + batch_size: 128 #32 + tau: 2.5e-2 #1.0e-2 + learning_rate: 1.0e-3 #2.5e-4 + adam_eps: 1.0e-8 + + epsstart: 1.0 + epsend: 0.02 #0.1 + epsdecay: 500 #1000000 + epsdecay_strategy: 'None' + + <<: *smallMLP + <<: *extra_hyperparameters + +r2d2_LargeCNN_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_LargeCNN_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + #observation_resize_dim: 21 #56 + + dueling: True + noisy: False + n_step: 3 + + use_PER: True + PER_alpha: 0.9 + PER_beta: 0.6 + + use_HER: False + HER_strategy: 'future-4' + + replay_capacity: 5242880 # in terms of experience #1e6 + min_capacity: 4e5 #in terms of experiences... #1e4 + replay_period: 1 + + actor_models_update_steps_interval: 10 #considering only 1 actor's steps. + + discount: 0.999 + use_cuda: True + gradient_clip: 0.5 + batch_size: 128 + tau: 4.0e-4 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + + epsstart: 1.0 + epsend: 0.1 + epsdecay: 10000 + eps_greedy_alpha: 7.0 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: False + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + + burn_in: False + sequence_replay_unroll_length: 40 + sequence_replay_overlap_length: 10 + sequence_replay_burn_in_length: 0 + + sequence_replay_PER_eta: 0.9 + + <<: *LargeCNN + <<: *extra_hyperparameters + +r2d2_smallMLP_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_smallMLP_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + #observation_resize_dim: 21 #56 + + dueling: True + noisy: False + n_step: 3 + + use_PER: True + PER_alpha: 0.9 + PER_beta: 0.6 + + use_HER: False + HER_strategy: 'future-4' + + replay_capacity: 5242880 # in terms of experience #1e6 + min_capacity: 4e5 #in terms of experiences... #1e4 + replay_period: 1 + + actor_models_update_steps_interval: 10 #considering only 1 actor's steps. 
+ + discount: 0.999 + use_cuda: True + gradient_clip: 0.5 + batch_size: 128 + tau: 4.0e-4 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + + epsstart: 1.0 + epsend: 0.1 + epsdecay: 10000 + eps_greedy_alpha: 7.0 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: False + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + + burn_in: False + sequence_replay_unroll_length: 40 + sequence_replay_overlap_length: 10 + sequence_replay_burn_in_length: 0 + + sequence_replay_PER_eta: 0.9 + + <<: *smallMLP + <<: *extra_hyperparameters + +experiment: + tasks: [{ + #'env-id': '20BitsSwap-v0', + 'env-id': '10BitsSwap-v0', + #'env-id': '15BitsSwap-v0', + + #'run-id': 'B7/B96k_EpPerCycle16_MLP256_GSflat_GSNotShared_final-1_lr1m3/Seed10_venv_dqn_Max+Sk0_St1_ObsNone_ClipReward_Eps5p2End2m2_tau40_GradClip5m1', + #'run-id': 'WithHERTargetClamping/WithProperEpisodeStoring/B7/B96k_EpPerCycle16_MLP256_GSflat_GSNotShared_final-1_lr1m3/Seed10_venv1_r2d2_her_Max+Sk0_St1_ObsNone_ClipReward_Eps5p2End2m2_tau40_GradClip5m1', + # LR 1e-3 : too aggressive... + #'run-id': 'WithHERTargetClamping/RepP1/AdamEPS1m12+LR1m3+GradClip5m1/TrainItPerCycle40/B96k_EpPerCycle16_MLP256_GSflat_GSNotShared_final-1/Seed10_venv1_r2d2_her_Max+Sk0_St1_ObsNone_ClipReward_Eps5p2End2m2_tau40', + # LR 1e-4 : good bootstrap but oscillating... + Grad Clip 5m0 + #'run-id': 'WithHERTargetClamping/RepP1/AdamEPS1m12+LR1m4+GradClip5m0/TrainItPerCycle40/B96k_EpPerCycle16_MLP256_GSflat_GSNotShared_final-1/Seed10_venv1_r2d2_her_Max+Sk0_St1_ObsNone_ClipReward_Eps5p2End2m2_tau40', + # LR 2e-5 : GREAT bootstrap and no oscillation + Grad Clip 5m0 + 'run-id': 'WithHERTargetClamping/RepP1/AdamEPS1m12+LR2m5+GradClip5m0/TrainItPerCycle40/B96k_EpPerCycle16_MLP256_GSflat_GSNotShared_final-1/Seed10_venv1_r2d2_her_Max+Sk0_St1_ObsNone_ClipReward_Eps5p2End2m2_tau40', + # LR 1e-4 + tau 4m4 : Weird... 
+ Grad Clip 5m0 + #'run-id': 'WithHERTargetClamping/RepP1/AdamEPS1m12+LR1m4+GradClip5m0+Tau4m4/TrainItPerCycle40/B96k_EpPerCycle16_MLP256_GSflat_GSNotShared_final-1/Seed10_venv1_r2d2_her_Max+Sk0_St1_ObsNone_ClipReward_Eps5p2End2m2_tau40', + + #'agent-id': '1step_double_HER_dqn_smallMLP_r1e5', + #'agent-id': '1step_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_smallMLP_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L2_O1_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # NO Ent reg: + #'agent-id': '1step_0Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_smallMLP_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L2_O1_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # RepP Actually !: + #'agent-id': '1step_0Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_smallMLP_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepActuallyP1_NOBURNIN_b128_L2_O1_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + # DQN: + #'agent-id': '1step_double_HER_dqnher_smallMLP_r1e5', + # SImilar R2D2: + #'agent-id': '1step_0Ent_r2d2_AdamLR2d0m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma98_smallMLP_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau2d5m2_RepActuallyP1_NOBURNIN_b128_L10_O5_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + 'agent-id': '1step_0Ent_r2d2_AdamLR2d0m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma98_smallMLP_GradClip5m0_r1p5Min3e4_a6m1_b1m0_ovrN_e9m1_tau2d5m2_RepActuallyP1_NOBURNIN_b128_L2_O0_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + #'nbr_actor': 128, + #'nbr_actor': 100, + #'nbr_actor': 64, + #'nbr_actor': 32, + #'nbr_actor': 16, + #'nbr_actor': 8, + 'nbr_actor': 1, + 'nbr_frame_skipping': 0, + 'nbr_frame_stacking': 1, + 'grayscale': False, + 'single_life_episode': False, + 'nbr_max_random_steps': 0, + 'clip_reward': True, + + 'sad': False, + #'sad': True, + 'vdn': False, + #'vdn': True, + #"otherplay": True, + "otherplay": False, + + 'previous_reward_action': False, #True, + #'observation_resize_dim': (56,56), + 'observation_resize_dim': None, + 'goal_resize_dim': None, + + # + 'reload': 'None', + }, + ] + + experiment_id: 'r2d2_BitsSwap_Benchmark' + #benchmarking_episodes: 1 + benchmarking_episodes: 10 + benchmarking_interval: 1.0e10 + #benchmarking_interval: 1.0e3 + benchmarking_record_episode_interval: 1.0e8 + #benchmarking_record_episode_interval: 1.0e1 #1.0e20 + train_observation_budget: 96.0e3 + #train_observation_budget: 1.0e7 + seed: 10 + +agents: + 1step_double_HER_dqnher_smallMLP_r1e5: + <<: *dqn_smallMLP_obs84_graclip5m1_b32_tau1m2_lr25m5 + replay_capacity: 1e5 + double: True + #dueling: True + #noisy: True + n_step: 1 + use_HER: True + + gradient_clip: 5.0 #0.5 + batch_size: 128 #32 + tau: 2.5e-2 #1.0e-2 + #tau: 4.0e-4 + #learning_rate: 2.5e-4 + #learning_rate: 1.0e-4 #2.5e-4 + learning_rate: 2.0e-5 + + #adam_eps: 1.0e-8 + adam_eps: 1.0e-12 + + replay_period: 1 #240 + + nbr_training_iteration_per_cycle: 40 + nbr_episode_per_cycle: 16 + + + + 1step_noisy_double_HER_dqn_smallMLP_r1e5: + <<: *dqn_smallMLP_obs84_graclip5m1_b32_tau1m2_lr25m5 + replay_capacity: 1e5 + double: True + #dueling: True + noisy: True + n_step: 1 + use_HER: True + + 3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_smallMLP_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone: 
&3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_smallMLP_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone + <<: *r2d2_smallMLP_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + use_HER: True + + actor_models_update_steps_interval: 1 #considering only 1 actor's steps. + + vdn: False + vdn_nbr_players: 2 + + batch_size: 32 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + discount: 0.997 + gradient_clip: 5.0 + # ...not specified in r2d2 paper but in Ape-X, + # and r2d2 paper says that missing hyper-param + # are the same as ape-X + + # replay_capacity: 5e4 #163840 #2e13*20 #5242880 # in terms of experience #1e6 + # min_capacity: 2e4 #in terms of experiences... #1e4 + replay_capacity: 1e5 + min_capacity: 128 #1e4 + replay_period: 240 #240 + + PER_compute_initial_priority: False + PER_beta_increase_interval: None #2e5 + + double: True + dueling: True + noisy: False + n_step: 3 + tau: 4.0e-4 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: True + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + r2d2_bellman_target_SAD: False + + burn_in: False + sequence_replay_unroll_length: 100 + sequence_replay_overlap_length: 0 + sequence_replay_burn_in_length: 0 + + + epsstart: 1.0 + epsend: 0.05 + epsdecay: 30000 #1000000 + + # ape-X and r2d2 keep it constant over each actor + # with a different value eps_i = base_eps**(1+\alpha*i/nbr_actors) + # with base_eps=0.4 and \alpha = 7... + eps_greedy_alpha: 7.0 + + #1step_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_smallMLP_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L2_O1_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #1step_0Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_smallMLP_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L2_O1_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #1step_0Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_smallMLP_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepActuallyP1_NOBURNIN_b128_L2_O1_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + + #1step_0Ent_r2d2_AdamLR2d0m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma98_smallMLP_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau2d5m2_RepActuallyP1_NOBURNIN_b128_L10_O5_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #1step_0Ent_r2d2_AdamLR2d0m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma98_smallMLP_GradClip5m0_r1p5Min3e4_a6m1_b1m0_ovrN_e9m1_tau2d5m2_RepActuallyP1_NOBURNIN_b128_L10_O5_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + 1step_0Ent_r2d2_AdamLR2d0m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma98_smallMLP_GradClip5m0_r1p5Min3e4_a6m1_b1m0_ovrN_e9m1_tau2d5m2_RepActuallyP1_NOBURNIN_b128_L2_O0_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + <<: *3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_smallMLP_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone + weights_entropy_lambda: 0.0 + #weights_entropy_lambda: 0.1 + #weights_entropy_lambda: 0.001 #01 + + vdn: False + #vdn: True + vdn_nbr_players: 2 + sad: False + #sad: True + + gamma: 0.98 + #learning_rate: 6.25e-5 + learning_rate: 2.0e-5 + gradient_clip: 5.0 #0.5 + batch_size: 128 #32 + tau: 2.5e-2 #1.0e-2 
+ + nbr_training_iteration_per_cycle: 40 + nbr_episode_per_cycle: 16 + replay_period: 1 + + #adam_eps: 1.5e-5 + #learning_rate: 1.0e-3 + #adam_eps: 1.0e-8 + adam_eps: 1.0e-12 + #adam_eps: 1.0e-15 + + n_step: 1 + #n_step: 3 + #n_step: 7 + + #PER: + #PER_alpha: 0.9 + #PER_beta: 0.6 + PER_alpha: 0.6 + PER_beta: 1.0 + + burn_in: False + #burn_in: True + + sequence_replay_unroll_length: 2 #10 + sequence_replay_overlap_length: 0 #5 + sequence_replay_burn_in_length: 0 + # #sequence_replay_burn_in_length: 10 + + # sequence_replay_unroll_length: 100 + # sequence_replay_overlap_length: 50 + # sequence_replay_burn_in_length: 0 + + epsend: 0.4 + eps_greedy_alpha: 2.0 + + # Architecture: + #critic_arch: 'LSTM-RNN' + #critic_arch_hidden_units: [512, 512] + #critic_arch_hidden_units: [512] + #use_relu_after_rnn: False + + # normal arch: + # critic_arch: 'MLP-LSTM-RNN' + # use_relu_after_rnn: True + # #use_relu_after_rnn: False + # critic_arch_feature_dim: 512 + # critic_arch_hidden_units: [512] + + # Arch2: + # critic_arch: 'MLP-LSTM-RNN2' + # use_relu_after_rnn: False #True + # use_residual_connection: True + # critic_arch_linear_hidden_units: [512, 256] + # critic_arch_feature_dim: 128 + # critic_arch_hidden_units: [128, 128] + + # Arch 3: + # critic_arch: 'MLP-LSTM-RNN2' + # use_relu_after_rnn: True + # critic_arch_linear_hidden_units: [128] + # critic_arch_feature_dim: 64 + # critic_arch_hidden_units: [64] + + #Arch 4: + # critic_arch: 'MLP-LSTM-RNN2' + # use_relu_after_rnn: True + # critic_arch_linear_hidden_units: [512, 256] + # critic_arch_hidden_units: [256] + # critic_arch_linear_post_hidden_units: [256] + # critic_arch_feature_dim: 128 + + # extra_inputs_infos: { + # 'previous_reward':{ + # shape: [1,], + # target_location: ['critic_body', 'extra_inputs'] + # }, + # 'previous_action':{ + # shape: ['task.action_dim',], + # target_location: ['critic_body', 'extra_inputs'] + # }, + # # 'action_mask':{ + # # shape: ['task.action_dim',], + # # target_location: ['critic_body', 'extra_inputs'] + # # }, + # ######################## + # ######################## + # ######################## + # # WITH SAD: + # ######################## + # # 'greedy_action':{ + # # #shape: [23], + # # shape: [108], + # # #shape: [43], + # # #shape: [58,], + # # target_location: ['critic_body', 'extra_inputs'] + # # }, + # ######################## + # ######################## + # # 'legal_actions':{ + # # shape: ['task.action_dim',], + # # target_location: ['head', 'extra_inputs'] + # # }, + + # } + + + + + \ No newline at end of file diff --git a/benchmark/R2D2/CoMaze/benchmark_selfplay_comaze.py b/benchmark/R2D2/CoMaze/benchmark_selfplay_comaze.py index 4f2ced2b..0b4aac22 100644 --- a/benchmark/R2D2/CoMaze/benchmark_selfplay_comaze.py +++ b/benchmark/R2D2/CoMaze/benchmark_selfplay_comaze.py @@ -1,4 +1,8 @@ from typing import Dict, Any, Optional, List, Callable + +import torch +import sklearn + import logging import yaml import os @@ -6,6 +10,8 @@ from typing import Dict import torch.multiprocessing + +from tensorboardX import SummaryWriter from tqdm import tqdm from functools import partial @@ -25,6 +31,292 @@ import ray +from regym.modules import EnvironmentModule, CurrentAgentsModule +from regym.modules import MARLEnvironmentModule, RLAgentModule + +from regym.modules import MultiStepCICMetricModule +from rl_action_policy import RLActionPolicy +from comaze_gym.metrics import MultiStepCIC, RuleBasedActionPolicy + +from regym.modules import MessageTrajectoryMutualInformationMetricModule +from rl_message_policy import 
RLMessagePolicy +from comaze_gym.metrics import MessageTrajectoryMutualInformationMetric, RuleBasedMessagePolicy + +from regym.modules import CoMazeGoalOrderingPredictionModule +from rl_hiddenstate_policy import RLHiddenStatePolicy +from comaze_gym.metrics import GoalOrderingPredictionMetric, RuleBasedHiddenStatePolicy + +from regym.pubsub_manager import PubSubManager + +def make_rl_pubsubmanager( + agents, + config, + ms_cic_metric=None, + m_traj_mutual_info_metric=None, + goal_order_pred_metric=None, + logger=None, + load_path=None, + save_path=None): + """ + Create a PubSubManager. + :param agents: List of Agents to use in the rl loop. + :param config: Dict that specifies all the important hyperparameters of the network. + - "task" + - "sad" + - "vdn" + - "otherplay" + - "max_obs_count" + - "sum_writer": str where to save the summary... + + """ + pipelined = False + if len(sys.argv) > 2: + pipelined = any(['pipelined' in arg for arg in sys.argv]) + print(f"Pipelined: {pipelined}") + + modules = config.pop("modules") + + cam_id = "current_agents" + modules[cam_id] = CurrentAgentsModule( + id=cam_id, + agents=agents + ) + + if pipelined: + envm_id = "MARLEnvironmentModule_0" + envm_input_stream_ids = { + "iteration":"signals:iteration", + "current_agents":f"modules:{cam_id}:ref", + } + + rlam_ids = [ + f"rl_agent_{rlaidx}" + for rlaidx in range(len(agents)) + ] + for aidx, (rlam_id, agent) in enumerate(zip(rlam_ids, agents)): + rlam_config = { + 'agent': agent, + 'actions_stream_id':f"modules:{envm_id}:player_{aidx}:actions", + } + + envm_input_stream_ids[f'player_{aidx}'] = f"modules:{rlam_id}:ref" + + rlam_input_stream_ids = { + "logs_dict":"logs_dict", + "losses_dict":"losses_dict", + "epoch":"signals:epoch", + "mode":"signals:mode", + + "reset_actors":f"modules:{envm_id}:reset_actors", + + "observations":f"modules:{envm_id}:ref:player_{aidx}:observations", + "infos":f"modules:{envm_id}:ref:player_{aidx}:infos", + "actions":f"modules:{envm_id}:ref:player_{aidx}:actions", + "succ_observations":f"modules:{envm_id}:ref:player_{aidx}:succ_observations", + "succ_infos":f"modules:{envm_id}:ref:player_{aidx}:succ_infos", + "rewards":f"modules:{envm_id}:ref:player_{aidx}:rewards", + "dones":f"modules:{envm_id}:ref:player_{aidx}:dones", + } + modules[rlam_id] = RLAgentModule( + id=rlam_id, + config=rlam_config, + input_stream_ids=rlam_input_stream_ids, + ) + + modules[envm_id] = MARLEnvironmentModule( + id=envm_id, + config=config, + input_stream_ids=envm_input_stream_ids + ) + else: + envm_id = "EnvironmentModule_0" + envm_input_stream_ids = { + #"logger":"modules:logger:ref", + #"logs_dict":"logs_dict", + + "iteration":"signals:iteration", + + "current_agents":f"modules:{cam_id}:ref", + } + modules[envm_id] = EnvironmentModule( + id=envm_id, + config=config, + input_stream_ids=envm_input_stream_ids + ) + + ms_cic_id = "MultiStepCIC_player0" + ms_cic_input_stream_ids = { + "logs_dict":"logs_dict", + "losses_dict":"losses_dict", + "epoch":"signals:epoch", + "mode":"signals:mode", + + "vocab_size":"config:vocab_size", + "max_sentence_length":"config:max_sentence_length", + + "trajectories":f"modules:{envm_id}:trajectories", + "filtering_signal":f"modules:{envm_id}:new_trajectories_published", + + "current_agents":"modules:current_agents:ref", + } + + listening_biasing = False + if len(sys.argv) > 2: + listening_biasing = any(['listening_biasing' in arg for arg in sys.argv[2:]]) + + if listening_biasing: + print("WARNING: Biasing for positive listening.") + else: + print("WARNING: NOT biasing for 
positive listening.") + + ms_cic_config = { + "biasing":listening_biasing, + "nbr_players":len(agents), + "player_id":0, + "metric":ms_cic_metric, #if None: default constr. for rule based agent... + #"message_zeroing_out_fn"= ... + } + + if ms_cic_metric is not None: + modules[ms_cic_id] = MultiStepCICMetricModule( + id=ms_cic_id, + config=ms_cic_config, + input_stream_ids=ms_cic_input_stream_ids, + ) + + m_traj_mutinfo_id = "MessageTrajectoryMutualInforMetric_player0" + m_traj_mutinfo_input_stream_ids = { + "logs_dict":"logs_dict", + "losses_dict":"losses_dict", + "epoch":"signals:epoch", + "mode":"signals:mode", + + "vocab_size":"config:vocab_size", + "max_sentence_length":"config:max_sentence_length", + + "trajectories":f"modules:{envm_id}:trajectories", + "filtering_signal":f"modules:{envm_id}:new_trajectories_published", + + "current_agents":"modules:current_agents:ref", + } + + signalling_biasing = False + if len(sys.argv) > 2: + signalling_biasing = any(['signalling_biasing' in arg for arg in sys.argv[2:]]) + + if signalling_biasing: + print("WARNING: Biasing for positive signalling.") + else: + print("WARNING: NOT biasing for positive signalling.") + + m_traj_mutinfo_config = { + "biasing":signalling_biasing, + "nbr_players":len(agents), + "player_id":0, + "metric":m_traj_mutual_info_metric, #if None: default constr. for rule based agent... + #"message_zeroing_out_fn"= ... + } + + if m_traj_mutual_info_metric is not None: + modules[m_traj_mutinfo_id] = MessageTrajectoryMutualInformationMetricModule( + id=m_traj_mutinfo_id, + config=m_traj_mutinfo_config, + input_stream_ids=m_traj_mutinfo_input_stream_ids, + ) + + + goal_order_pred_id = "GoalOrderingPred_player0" + goal_order_pred_input_stream_ids = { + "logs_dict":"logs_dict", + "losses_dict":"losses_dict", + "epoch":"signals:epoch", + "mode":"signals:mode", + + "vocab_size":"config:vocab_size", + "max_sentence_length":"config:max_sentence_length", + + "trajectories":f"modules:{envm_id}:trajectories", + "filtering_signal":f"modules:{envm_id}:new_trajectories_published", + + "current_agents":"modules:current_agents:ref", + } + + goal_ordering_biasing = False + if len(sys.argv) > 2: + goal_ordering_biasing = any(['goal_ordering_biasing' in arg for arg in sys.argv[2:]]) + + if goal_ordering_biasing: + print("WARNING: Biasing for Goal Ordering Prediction.") + else: + print("WARNING: NOT biasing for Goal Ordering Prediction.") + + goal_order_pred_config = { + "biasing":goal_ordering_biasing, + "nbr_players":len(agents), + "player_id":0, + "metric":goal_order_pred_metric, #if None: default constr. for rule based agent... 
+ } + + if goal_order_pred_metric is not None: + modules[goal_order_pred_id] = CoMazeGoalOrderingPredictionModule( + id=goal_order_pred_id, + config=goal_order_pred_config, + input_stream_ids=goal_order_pred_input_stream_ids, + ) + + + + pipelines = config.pop("pipelines") + + pipelines["rl_loop_0"] = [ + envm_id, + ] + if pipelined: + for rlam_id in rlam_ids: + pipelines['rl_loop_0'].append(rlam_id) + + if ms_cic_metric is not None: + pipelines["rl_loop_0"].append(ms_cic_id) + if m_traj_mutual_info_metric is not None: + pipelines["rl_loop_0"].append(m_traj_mutinfo_id) + if goal_order_pred_metric is not None: + pipelines["rl_loop_0"].append(goal_order_pred_id) + + optim_id = "global_optim" + optim_config = { + "modules":modules, + "learning_rate":3e-4, + "optimizer_type":'adam', + "with_gradient_clip":False, + "adam_eps":1e-16, + } + + optim_module = regym.modules.build_OptimizationModule( + id=optim_id, + config=optim_config, + ) + modules[optim_id] = optim_module + + logger_id = "per_epoch_logger" + logger_module = regym.modules.build_PerEpochLoggerModule(id=logger_id) + modules[logger_id] = logger_module + + pipelines[optim_id] = [] + pipelines[optim_id].append(optim_id) + pipelines[optim_id].append(logger_id) + + pbm = PubSubManager( + config=config, + modules=modules, + pipelines=pipelines, + logger=logger, + load_path=load_path, + save_path=save_path, + ) + + return pbm + + def comaze_r2d2_wrap( env, clip_reward=False, @@ -65,52 +357,104 @@ def train_and_evaluate(agents: List[object], test_obs_interval: int = 1e4, test_nbr_episode: int = 10, benchmarking_record_episode_interval: int = None, + render_mode="rgb_array", step_hooks=[], sad=False, - vdn=False): - - async = False + vdn=False, + otherplay=False, + ms_cic_metric=None, + m_traj_mutual_info_metric=None, + goal_order_pred_metric=None): + pubsub = False if len(sys.argv) > 2: - async = any(['async' in arg for arg in sys.argv]) - - if async: - trained_agent = marl_loop.async_gather_experience_parallel1( - #trained_agents = marl_loop.async_gather_experience_parallel( - task, - agents, - training=True, - #nbr_pretraining_steps=nbr_pretraining_steps, - max_obs_count=nbr_max_observations, - env_configs=None, - sum_writer=sum_writer, - base_path=base_path, - test_obs_interval=test_obs_interval, - test_nbr_episode=test_nbr_episode, - benchmarking_record_episode_interval=benchmarking_record_episode_interval, - save_traj_length_divider=1, - step_hooks=step_hooks, - sad=sad, - vdn=vdn, - ) - else: - trained_agents = marl_loop.gather_experience_parallel( - task, - agents, - training=True, - #nbr_pretraining_steps=nbr_pretraining_steps, - max_obs_count=nbr_max_observations, - env_configs=None, - sum_writer=sum_writer, - base_path=base_path, - test_obs_interval=test_obs_interval, - test_nbr_episode=test_nbr_episode, - benchmarking_record_episode_interval=benchmarking_record_episode_interval, - save_traj_length_divider=1, - step_hooks=step_hooks, - sad=sad, - vdn=vdn, + pubsub = any(['pubsub' in arg for arg in sys.argv]) + + if pubsub: + config = { + "modules": {}, + "pipelines": {}, + } + + config['training'] = True + config['env_configs'] = None + config['task'] = task + + sum_writer_path = os.path.join(sum_writer, 'actor.log') + sum_writer = config['sum_writer'] = SummaryWriter(sum_writer_path, flush_secs=1) + + config['base_path'] = base_path + config['offset_episode_count'] = offset_episode_count + config['nbr_pretraining_steps'] = nbr_pretraining_steps + config['max_obs_count'] = nbr_max_observations + config['test_obs_interval'] = 
test_obs_interval + config['test_nbr_episode'] = test_nbr_episode + config['benchmarking_record_episode_interval'] = benchmarking_record_episode_interval + config['render_mode'] = render_mode + config['step_hooks'] = step_hooks + config['save_traj_length_divider'] =1 + config['sad'] = sad + config['vdn'] = vdn + config['otherplay'] = otherplay + config['nbr_players'] = 2 + pubsubmanager = make_rl_pubsubmanager( + agents=agents, + config=config, + ms_cic_metric=ms_cic_metric, + m_traj_mutual_info_metric=m_traj_mutual_info_metric, + goal_order_pred_metric=goal_order_pred_metric, + logger=sum_writer, ) + pubsubmanager.train() + + trained_agents = agents + else: + asynch = False + if len(sys.argv) > 2: + asynch = any(['async' in arg for arg in sys.argv]) + + if asynch: + trained_agent = marl_loop.async_gather_experience_parallel1( + #trained_agents = marl_loop.async_gather_experience_parallel( + task, + agents, + training=True, + #nbr_pretraining_steps=nbr_pretraining_steps, + max_obs_count=nbr_max_observations, + env_configs=None, + sum_writer=sum_writer, + base_path=base_path, + test_obs_interval=test_obs_interval, + test_nbr_episode=test_nbr_episode, + benchmarking_record_episode_interval=benchmarking_record_episode_interval, + save_traj_length_divider=1, + render_mode=render_mode, + step_hooks=step_hooks, + sad=sad, + vdn=vdn, + otherplay=otherplay, + ) + else: + trained_agents = marl_loop.gather_experience_parallel( + task, + agents, + training=True, + #nbr_pretraining_steps=nbr_pretraining_steps, + max_obs_count=nbr_max_observations, + env_configs=None, + sum_writer=sum_writer, + base_path=base_path, + test_obs_interval=test_obs_interval, + test_nbr_episode=test_nbr_episode, + benchmarking_record_episode_interval=benchmarking_record_episode_interval, + save_traj_length_divider=1, + render_mode=render_mode, + step_hooks=step_hooks, + sad=sad, + vdn=vdn, + otherplay=otherplay + ) + save_replay_buffer = False if len(sys.argv) > 2: save_replay_buffer = any(['save_replay_buffer' in arg for arg in sys.argv]) @@ -133,8 +477,124 @@ def training_process(agent_config: Dict, train_observation_budget: int = 1e7, base_path: str = './', seed: int = 0): + + test_only = False + augmented = False + path_suffix = None + use_ms_cic = False + use_m_traj_mutual_info = False + use_goal_order_pred = False + combined_action_space = False + signalling_biasing = False + listening_biasing = False + goal_ordering_biasing = False + pubsub = False + if len(sys.argv) > 2: + pubsub = any(['pubsub' in arg for arg in sys.argv]) + test_only = any(['test_only' in arg for arg in sys.argv]) + use_ms_cic = any(['ms_cic' in arg for arg in sys.argv]) + use_m_traj_mutual_info = any(['mutual_info' in arg for arg in sys.argv]) + use_goal_order_pred = any(['goal_order' in arg for arg in sys.argv]) + combined_action_space = any(['combined_action_space' in arg for arg in sys.argv]) + signalling_biasing = any(['signalling_biasing' in arg for arg in sys.argv[2:]]) + listening_biasing = any(['listening_biasing' in arg for arg in sys.argv[2:]]) + goal_ordering_biasing = any(['goal_ordering_biasing' in arg for arg in sys.argv[2:]]) + + if use_goal_order_pred: + augmented = any(['augmented' in arg for arg in sys.argv[2:] if 'goal_order' in arg]) + + override_seed_argv_idx = [idx for idx, arg in enumerate(sys.argv) if '--seed' in arg] + if len(override_seed_argv_idx): + seed = int(sys.argv[override_seed_argv_idx[0]+1]) + print(f"NEW RANDOM SEED: {seed}") + + override_reload_argv = [idx for idx, arg in enumerate(sys.argv) if '--reload_path' 
in arg] + if len(override_reload_argv): + task_config["reload"] = sys.argv[override_reload_argv[0]+1] + print(f"NEW RELOAD PATH: {task_config['reload']}") + + path_suffix_argv = [idx for idx, arg in enumerate(sys.argv) if '--path_suffix' in arg] + if len(path_suffix_argv): + path_suffix = sys.argv[path_suffix_argv[0]+1] + print(f"ADDITIONAL PATH SUFFIX: {path_suffix}") + + obs_budget_argv = [idx for idx, arg in enumerate(sys.argv) if '--obs_budget' in arg] + if len(obs_budget_argv): + train_observation_budget = int(sys.argv[obs_budget_argv[0]+1]) + print(f"TRAINING OBSERVATION BUDGET: {train_observation_budget}") + + + task_config["otherplay"] = any(['--otherplay' in arg for arg in sys.argv[2:]]) + + ms_cic_metric = None + m_traj_mutual_info_metric = None + goal_order_pred_metric = None + + if test_only: + base_path = os.path.join(base_path,"TESTING") + else: + base_path = os.path.join(base_path,"TRAINING") + + if pubsub: + base_path = os.path.join(base_path,"PUBSUB") + else: + base_path = os.path.join(base_path,"NOPUBSUB") + + if use_ms_cic: + base_path = os.path.join(base_path,f"MS-CIC{'+CombActSpace' if combined_action_space else ''}{'+Biasing-1m4-f1m1' if listening_biasing else ''}") + if use_m_traj_mutual_info: + base_path = os.path.join(base_path,f"MessTraj-MutualInfoMetric{'+CombActSpace' if combined_action_space else ''}{'+Biasing-1m0-f1m1' if signalling_biasing else ''}") + if use_goal_order_pred: + base_path = os.path.join(base_path,f"GoalOrderingPred{'+Biasing-1m0' if goal_ordering_biasing else ''}-NoDropout+RulesPredictionONLY+RNNStatePostProcess{'+AugmentedHiddenStates' if augmented else ''}") + #base_path = os.path.join(base_path,f"GoalOrderingPred{'+Biasing-1m0' if goal_ordering_biasing else ''}-NoDropout+GoalOrderingPredictionONLY+RNNStatePostProcess{'+AugmentedHiddenStates' if augmented else ''}") + #base_path = os.path.join(base_path,f"GoalOrderingPred-AfterEpoch50-{'+Biasing-1m0' if goal_ordering_biasing else ''}-NoDropout+RulesPredictionONLY+RNNStatePostProcess{'+AugmentedHiddenStates' if augmented else ''}") + + rule_based = False + communicating = False + if len(sys.argv) > 2: + rule_based = any(['rule_based' in arg for arg in sys.argv[2:]]) + communicating = any(['communicating_rule_based' in arg for arg in sys.argv[2:]]) + if rule_based: + base_path = os.path.join(base_path,f"{'COMM-' if communicating else ''}RULEBASE") + + if task_config["otherplay"]: + base_path = os.path.join(base_path,"OtherPlay") + + base_path = os.path.join(base_path,f"SEED{seed}") + + if path_suffix is not None: + base_path = os.path.join(base_path, path_suffix) + + print(f"Final Path: -- {base_path} --") + import ipdb; ipdb.set_trace() + + if rule_based: + print("rule-based agents do not usee SAD nor VDN...") + agent_config["sad"] = False + agent_config["vdn"] = False + task_config["sad"] = False + task_config["vdn"] = False + if not os.path.exists(base_path): os.makedirs(base_path) + task_config['final_path'] = base_path + task_config['command_line'] = ' '.join(sys.argv) + print(task_config['command_line']) + yaml.dump( + task_config, + open( + os.path.join(base_path, "task_config.yaml"), 'w', + encoding='utf8', + ), + ) + yaml.dump( + agent_config, + open( + os.path.join(base_path, "agent_config.yaml"), 'w', + encoding='utf8', + ), + ) + np.random.seed(seed) torch.manual_seed(seed) random.seed(seed) @@ -181,13 +641,14 @@ def training_process(agent_config: Dict, agent_config['nbr_actor'] = task_config['nbr_actor'] regym.RegymSummaryWriterPath = base_path #regym.RegymSummaryWriter = 
GlobalSummaryWriter(base_path) - sum_writer = base_path + sum_writer = base_path - #base_path1 = os.path.join(base_path,"1") - #save_path1 = os.path.join(base_path1,f"./{task_config['agent-id']}.agent") save_path1 = os.path.join(base_path,f"./{task_config['agent-id']}.agent") + if task_config.get("reload", 'None')!='None': + agent, offset_episode_count = check_path_for_agent(task_config["reload"]) + else: + agent, offset_episode_count = check_path_for_agent(save_path1) - agent, offset_episode_count = check_path_for_agent(save_path1) if agent is None: agent = initialize_agents( task=task, @@ -195,24 +656,29 @@ def training_process(agent_config: Dict, )[0] agent.save_path = save_path1 - """ - base_path2 = os.path.join(base_path,"2") - save_path2 = os.path.join(base_path2,f"./{task_config['agent-id']}.agent") + if test_only: + print(save_path1) + agent.training = False + + if use_ms_cic: + action_policy = RLActionPolicy( + agent=agent, + combined_action_space=combined_action_space, + ) + if use_m_traj_mutual_info: + message_policy = RLMessagePolicy( + agent=agent, + combined_action_space=combined_action_space, + ) + + if use_goal_order_pred: + hiddenstate_policy = RLHiddenStatePolicy( + agent=agent, + augmented=augmented, + ) - agent2, offset_episode_count = check_path_for_agent(save_path2) - if agent2 is None: - agent2 = initialize_agents( - task=task, - agent_configurations={task_config['agent-id']: agent_config} - )[0] - agent2.save_path = save_path2 - """ - - #agents = [agent, agent2] - if "vdn" in agent_config \ and agent_config["vdn"]: - import ipdb; ipdb.set_trace() agents = [agent] else: player2_harvest = False @@ -220,7 +686,6 @@ def training_process(agent_config: Dict, if len(sys.argv) > 2: player2_harvest = any(['player2_harvest' in arg for arg in sys.argv]) - import ipdb; ipdb.set_trace() agents = [agent, agent.get_async_actor(training=player2_harvest)] # We can create non-training or training async actors. # If traininging, then their experience is added to the replay buffer @@ -228,6 +693,55 @@ def training_process(agent_config: Dict, # -given that it proposes decorrelated data-, but it may # also have unknown disadvantages. Needs proper investigation. 
+ if rule_based: + import importlib + comaze_gym = importlib.import_module("regym.environments.envs.CoMaze.comaze-gym.comaze_gym") + from comaze_gym import build_WrappedActionOnlyRuleBasedAgent, build_WrappedCommunicatingRuleBasedAgent + build_fn = build_WrappedActionOnlyRuleBasedAgent + if communicating: + build_fn = build_WrappedCommunicatingRuleBasedAgent + agents = [ + build_fn( + player_idx=pidx, + action_space_dim=task.action_dim, + ) for pidx in range(2) + ] + + if use_ms_cic: + action_policy = RuleBasedActionPolicy( + wrapped_rule_based_agent=agents[0], + combined_action_space=combined_action_space, + ) + if use_m_traj_mutual_info: + message_policy = RuleBasedMessagePolicy( + wrapped_rule_based_agent=agents[0], + combined_action_space=combined_action_space, + ) + if use_goal_order_pred: + hiddenstate_policy = RuleBasedHiddenStatePolicy( + wrapped_rule_based_agent=agents[0], + ) + + if use_ms_cic: + ms_cic_metric = MultiStepCIC( + action_policy=action_policy, + action_policy_bar=RLActionPolicy( + agent=agent, + combined_action_space=combined_action_space, + ) + ) + if use_m_traj_mutual_info: + m_traj_mutual_info_metric = MessageTrajectoryMutualInformationMetric( + message_policy=message_policy, + ) + if use_goal_order_pred: + goal_order_pred_metric = GoalOrderingPredictionMetric( + hiddenstate_policy=hiddenstate_policy, + label_dim=4*5, + data_save_path=os.path.join(base_path,"GoalOrderingPredModule"), + use_cuda=agent_config['use_cuda'], + ) + trained_agents = train_and_evaluate( agents=agents, task=task, @@ -238,11 +752,15 @@ def training_process(agent_config: Dict, nbr_max_observations=train_observation_budget, test_obs_interval=benchmarking_interval, test_nbr_episode=benchmarking_episodes, - benchmarking_record_episode_interval=None, - #Need to solve an issue with save_traj_with_graph fnbenchmarking_record_episode_interval, - #benchmarking_record_episode_interval=benchmarking_record_episode_interval, - sad=task_config["sad"], - vdn=task_config["vdn"], + #benchmarking_record_episode_interval=None, + benchmarking_record_episode_interval=benchmarking_record_episode_interval, + render_mode="human_comm", + sad=task_config["sad"] if not(rule_based) else False, + vdn=task_config["vdn"] if not(rule_based) else False, + otherplay=task_config.get("otherplay", False), + ms_cic_metric=ms_cic_metric, + m_traj_mutual_info_metric=m_traj_mutual_info_metric, + goal_order_pred_metric=goal_order_pred_metric, ) return trained_agents, task @@ -274,21 +792,21 @@ def main(): env_name = task_config['env-id'] run_name = task_config['run-id'] path = f'{base_path}/{env_name}/{run_name}/{agent_name}' - print(f"Path: -- {path} --") + print(f"Tentative Path: -- {path} --") training_process(agents_config[task_config['agent-id']], task_config, benchmarking_interval=int(float(experiment_config['benchmarking_interval'])), benchmarking_episodes=int(float(experiment_config['benchmarking_episodes'])), - benchmarking_record_episode_interval=int(float(experiment_config['benchmarking_record_episode_interval'])), + benchmarking_record_episode_interval=int(float(experiment_config['benchmarking_record_episode_interval'])) if experiment_config['benchmarking_record_episode_interval']!='None' else None, train_observation_budget=int(float(experiment_config['train_observation_budget'])), base_path=path, seed=experiment_config['seed']) if __name__ == '__main__': - async = False + asynch = False __spec__ = None if len(sys.argv) > 2: - async = any(['async' in arg for arg in sys.argv]) - if async: + asynch = any(['async' in arg 
for arg in sys.argv]) + if asynch: torch.multiprocessing.freeze_support() torch.multiprocessing.set_start_method("forkserver", force=True) #torch.multiprocessing.set_start_method("spawn", force=True) diff --git a/benchmark/R2D2/CoMaze/comaze_communicating_rule_based_benchmark_config.yaml b/benchmark/R2D2/CoMaze/comaze_communicating_rule_based_benchmark_config.yaml new file mode 100644 index 00000000..4768b870 --- /dev/null +++ b/benchmark/R2D2/CoMaze/comaze_communicating_rule_based_benchmark_config.yaml @@ -0,0 +1,698 @@ +extra_hyperparameters: &extra_hyperparameters + lr_account_for_nbr_actor: False + weights_decay_lambda: 0.0 + weights_entropy_lambda: 0.0 #01 + use_target_to_gather_data: False + + #################################### + # New hyperparameters: + PER_compute_initial_priority: False + ##################################### + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: True + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + r2d2_bellman_target_SAD: False + + burn_in: True + sequence_replay_unroll_length: 80 + sequence_replay_overlap_length: 40 + sequence_replay_burn_in_length: 20 + + sequence_replay_PER_eta: 0.9 + + vdn: False + vdn_nbr_players: 2 + +LargeCNNMLP: &LargeCNNMLP + phi_arch: 'CNN' #-LSTM-RNN' + actor_arch: 'None' + critic_arch: 'LSTM-RNN' + + # Phi Body: + phi_arch_channels: ['BN32', 'BN64', 'BN64'] + phi_arch_kernels: [8, 4, 3] + phi_arch_strides: [4, 2, 1] + phi_arch_paddings: [1, 1, 1] + phi_arch_feature_dim: 512 + phi_arch_hidden_units: [] + + extra_inputs_infos: { + 'previous_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'action_mask':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'communication_channel':{ + shape: [11,], + target_location: ['critic_body', 'extra_inputs'] + },'secret_goal_rule':{ + shape: [8,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'legal_actions':{ + shape: ['task.action_dim',], + target_location: ['head', 'extra_inputs'] + }, + + } + + # Dictionnaries of keys living inside the 'infos' OpenAI Gym's output. + # Value is a tuple where the first element is the expected shape of the extra input, + # and the second item is the location where the input should be stored in the framestate. + # Parsing of the shape will infer where to fetch the value when encountering a string. 
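The comment above says that string entries in an `extra_inputs_infos` shape (e.g. `'task.action_dim'`) are resolved against the task when the network is built. The resolution code itself is not part of this diff; the following is only a sketch of what such a lookup could look like (`resolve_shape` is a hypothetical helper):

```python
def resolve_shape(shape, task):
    """Replace string entries such as 'task.action_dim' by the integer
    attribute they name on the task object; keep numeric entries as-is."""
    resolved = []
    for dim in shape:
        if isinstance(dim, str):
            obj_name, attr_name = dim.split('.', 1)
            assert obj_name == 'task', f"unexpected shape reference: {dim}"
            resolved.append(int(getattr(task, attr_name)))
        else:
            resolved.append(int(dim))
    return resolved

# resolve_shape([11], task)                 -> [11]
# resolve_shape(['task.action_dim'], task)  -> [task.action_dim]
```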
+ + # Actor architecture: + actor_arch_hidden_units: [] + # Critic architecture: + #critic_arch_feature_dim: 32 + critic_arch_hidden_units: [512, 512] + +LargeCNNMLP_SAD: &LargeCNNMLP_SAD + sad: True + + phi_arch: 'CNN' #-LSTM-RNN' + actor_arch: 'None' + critic_arch: 'LSTM-RNN' + + # Phi Body: + #phi_arch_channels: ['BN32', 'BN64', 'BN64'] + phi_arch_channels: [32, 64, 64] + #phi_arch_kernels: [8, 4, 3] + phi_arch_kernels: [3, 3, 3] + #phi_arch_strides: [4, 2, 1] + phi_arch_strides: [2, 2, 1] + phi_arch_paddings: [1, 1, 1] + phi_arch_feature_dim: 512 + phi_arch_hidden_units: [] + + extra_inputs_infos: { + 'previous_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'action_mask':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'communication_channel':{ + shape: [11,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'secret_goal_rule':{ + shape: [8,], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + # WITH SAD: + ######################## + 'greedy_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + ######################## + 'legal_actions':{ + shape: ['task.action_dim',], + target_location: ['head', 'extra_inputs'] + }, + + } + + # Dictionnaries of keys living inside the 'infos' OpenAI Gym's output. + # Value is a tuple where the first element is the expected shape of the extra input, + # and the second item is the location where the input should be stored in the framestate. + # Parsing of the shape will infer where to fetch the value when encountering a string. + + # Actor architecture: + actor_arch_hidden_units: [] + # Critic architecture: + #critic_arch_feature_dim: 32 + critic_arch_hidden_units: [512, 512] + + +r2d2_LargeCNNLSTM_IQL_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_LargeCNNLSTM_IQL_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + observation_resize_dim: 56 + + dueling: True + noisy: False + n_step: 3 + + use_PER: True + PER_alpha: 0.9 + PER_beta: 0.6 + + replay_capacity: 5242880 # in terms of experience #1e6 + min_capacity: 4e5 #in terms of experiences... #1e4 + replay_period: 1 + + actor_models_update_steps_interval: 10 #considering only 1 actor's steps. + + discount: 0.999 + use_cuda: False + gradient_clip: 0.5 + batch_size: 128 + tau: 4.0e-4 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + + epsstart: 1.0 + epsend: 0.1 + epsdecay: 10000 + eps_greedy_alpha: 7.0 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: False + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + + + burn_in: False + sequence_replay_unroll_length: 40 + sequence_replay_overlap_length: 10 + sequence_replay_burn_in_length: 0 + + sequence_replay_PER_eta: 0.9 + + <<: *LargeCNNMLP + <<: *extra_hyperparameters + +r2d2_LargeCNNLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_LargeCNNLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + observation_resize_dim: 21 #56 + + dueling: True + noisy: False + n_step: 3 + + use_PER: True + PER_alpha: 0.9 + PER_beta: 0.6 + + replay_capacity: 5242880 # in terms of experience #1e6 + min_capacity: 4e5 #in terms of experiences... #1e4 + replay_period: 1 + + actor_models_update_steps_interval: 10 #considering only 1 actor's steps. 
+ + discount: 0.999 + use_cuda: False + gradient_clip: 0.5 + batch_size: 128 + tau: 4.0e-4 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + + epsstart: 1.0 + epsend: 0.1 + epsdecay: 10000 + eps_greedy_alpha: 7.0 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: False + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + + burn_in: False + sequence_replay_unroll_length: 40 + sequence_replay_overlap_length: 10 + sequence_replay_burn_in_length: 0 + + sequence_replay_PER_eta: 0.9 + + <<: *LargeCNNMLP_SAD + <<: *extra_hyperparameters + + +experiment: + tasks: [{ + #'env-id': 'CoMaze-7x7-Dense-v0', + #'env-id': 'CoMaze-7x7-Dense-Level4-v0', + #'env-id': 'CoMaze-7x7-Dense-Level5-v0', + #'env-id': 'CoMaze-9x9-Dense-Level5-v0', + #'env-id': 'CoMaze-9x9-Dense-Level5-EasySecrets-v0', + 'env-id': 'CoMaze-9x9-Dense-Level5-HardSecrets-v0', + #'env-id': 'CoMaze-9x9-Dense-Level5-UniformSecrets-v0', + #'env-id': 'CoMaze-7x7-Dense-SinglePlayerReward-v0', + #'env-id': 'CoMaze-7x7-Dense-SinglePlayerReward-Level4-v0', + #'env-id': 'CoMaze-7x7-Dense-SinglePlayerReward-Level4-FixedSecretGoalRules-v0', + #'env-id': 'CoMaze-7x7-Dense-FixedActions-v0', + #'env-id': 'CoMaze-7x7-Dense-SinglePlayer-v0', + #'env-id': 'CoMaze-7x7-Dense-Easy-SinglePlayer-v0', + + #'run-id': 'serial/selfplay/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed1_venv64_r2d2_Obs21_EntropyReg0_WeightDecayReg0/', + #'run-id': 'serial/selfplay/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + # More actors? fails to learn: + # maybe the trajectories are being replaced too fast in the replay-buffer: + # the replay occurrence for each trajectory is reduced... + # Reducing nbr actors? /2 should give 2*replay_ratio: does it have any effect? greater data-efficiency + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv32_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + # What about 16 actors then? /4 ==> 4*replay_ratio : great data-efficiency!!! + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv16_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + # What about 64 with fixed actions (multiplayer)? + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + + #VDN: FixedAction + # What about 64 with fixed actions (multiplayer)? 
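The `sequence_replay_*` settings above control how episodes are cut up for the recurrent replay buffer: `unroll_length` is the stored sequence length, `overlap_length` is how much consecutive sequences share, and `burn_in_length` is how many leading steps only warm up the LSTM state before the loss applies (`sequence_replay_PER_eta` then presumably mixes max and mean absolute TD errors into a sequence priority, as in the R2D2 paper). A small, framework-independent sketch of one common slicing convention; the repository's exact bookkeeping may differ:

```python
def slice_episode(episode_length, unroll_length, overlap_length, burn_in_length):
    """Cut an episode into overlapping sequences for recurrent replay.

    Each stored sequence holds `unroll_length` steps, consecutive sequences
    start every `unroll_length - overlap_length` steps, and the first
    `burn_in_length` steps of a sampled sequence only initialise the RNN
    state and are masked out of the loss.
    """
    stride = max(1, unroll_length - overlap_length)
    slices = []
    for start in range(0, episode_length, stride):
        end = min(start + unroll_length, episode_length)
        loss_start = min(start + burn_in_length, end)
        slices.append({'burn_in': (start, loss_start), 'train': (loss_start, end)})
        if end == episode_length:
            break
    return slices

# slice_episode(100, unroll_length=40, overlap_length=10, burn_in_length=0)
# -> training windows [0, 40), [30, 70), [60, 100)
```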
+ #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/debugVDN23/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/debugVDN23-debug1VDNLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + #'run-id': 'serial/selfplay/400MaxSteps/NOSAD/debugVDNLoss+SumBeforeLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + #'run-id': 'serial/selfplay/REPDebugVecEnvInitRZeroed/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/OP-VocabOnly/debugSAD/debugVDNLoss+SumBeforeLoss-LossTermCombReg/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + #'run-id': 'serial/PUBSUBDEBUG/selfplay/REPDebugVecEnvInitRZeroed/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/OP/debugSAD/debugVDNLoss+SumBeforeLoss-LossTermCombReg/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + 'run-id': 'serial/DEBUGRULEBASED+CIC/CommBIS+DEBUGWithMessages+TrainingUncondWithNLL+MultiSGD+KineticActionProperSummingAndLogSoft/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-SQUARE/Penalty+Vocab20/NOOP/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/400MaxSteps/NOSAD/debugNOVDNLoss+NOSumBeforeLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + # Reducing nbr actors? /2 should give 2*replay_ratio: does it have any effect? greater data-efficiency + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/VDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv32_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + # What about 16 actors then? ? + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/VDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv16_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + # Observation space? Visibly better! + #'run-id': 'serial/selfplay/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/SeedRep1_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + #'run-id': 'serial/selfplay/testRecording/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed1_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + # Reducing nbr actor to increase experience replay time: /8 ==> replay-occurences*8? + #'run-id': 'serial/selfplay/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed1_venv8_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # Is the overlap useful? no + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # Is a different sequence length better? 
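Several of the run notes above toggle VDN and mention summing per-player values before the loss ('SumBeforeLoss'). The underlying idea is value decomposition: the joint Q-value is the sum of the two players' individual Q-values, and a single TD error trains both through that sum. A minimal 1-step sketch for two players (illustrative only, not the repository's loss; the configs here actually use `n_step: 3` and sequence replay):

```python
import torch
import torch.nn.functional as F

def vdn_td_loss(q_values_per_player, target_q_values_per_player, rewards, dones, gamma=0.997):
    """Value-decomposition TD loss for one cooperative transition batch.

    q_values_per_player:        list of [batch] tensors, Q_i(s_i, a_i) per player i.
    target_q_values_per_player: list of [batch] tensors, max_a' Q_i^target(s'_i, a') per player i.
    """
    # Joint value is the sum over players ("sum before the loss").
    q_joint = torch.stack(q_values_per_player, dim=0).sum(dim=0)
    with torch.no_grad():
        target_joint = torch.stack(target_q_values_per_player, dim=0).sum(dim=0)
        bellman_target = rewards + gamma * (1.0 - dones) * target_joint
    return F.mse_loss(q_joint, bellman_target)
```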
+ #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # More exploration? + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # Bigger batch-size? + # Different explo: not sufficient, but at least there is maxtrainingreward remains high... + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + # Different archi with relu after rnns? not clear yet whether archi or relu... + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # The above arch is definitively learning as there are picks of in mean episode length occuring. + # Let us try to make it learn faster then: increasing lr and decreasing eps: + #'agent-id': '3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # What about only decreasing eps: the most important element so far!!!!! + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # normal archi but even lower eps: huge gains!!!!! + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + # Increasing nstep return? does not show specific improvement... + #'agent-id': '7step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # burn-in? very sharp progress, but requires more update (since some values are not used in the loss...) + # early catastrophic forgetting phenomenon...! + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_BURNIN_b128_L20_O10_B10_NOZeroInitSt_OnlineSt_StoreOnDone', + + # VDN: + # normal arch: + #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # with entropy weight 0.1: + # Stable learning ! but cannot learn to communicate... 
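Several of the observations above credit most of the gains to lowering the Adam epsilon (the `EPS1m8` / `EPS1m12` suffixes, i.e. `adam_eps`). The effect is mechanical: epsilon sits in the update's denominator, so when the second-moment estimate is small it caps the effective step size. A toy scalar sketch (illustrative only, not the repository's optimizer code):

```python
import math

def adam_step(grad, m, v, t, lr=6.25e-5, beta1=0.9, beta2=0.999, eps=1.0e-12):
    """One scalar Adam update. When sqrt(v_hat) is comparable to eps,
    eps directly shrinks the effective learning rate."""
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad * grad
    m_hat = m / (1 - beta1 ** t)
    v_hat = v / (1 - beta2 ** t)
    update = lr * m_hat / (math.sqrt(v_hat) + eps)
    return update, m, v

# With grad=1e-5 at t=1, sqrt(v_hat)=1e-5: eps=1.5e-5 more than halves the step,
# whereas eps=1e-12 leaves it at roughly lr * 1.0.
```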
+ #'agent-id': '3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # + agent-id observation: + #'agent-id': '3step_VDN_1m1Ent_aID_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + #'agent-id': '3step_VDN_1m3Ent_aID_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # Trying to learn to communicate by enlarging the seq len: + #'agent-id': '3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + # SAD only: + #'agent-id': '3step_SAD_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # seq 100: + #'agent-id': '3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # seq 20: + #'agent-id': '3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # arch 2: + #'agent-id': '3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # + res con : + #'agent-id': '3step_SAD_VDN_aID_NoEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + 'agent-id': '3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + #SAD+VDN: plateauing at 8 in singleplayerreward + #'agent-id': '3step_SAD_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + # + agent id: does not provide any help compared to SAD: + #'agent-id': '3step_SAD_VDN_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_ovrN_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + #'agent-id': 
'3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # arch 4: + #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM4_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m15_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m15_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau1m5_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + + # New archi with extra input via fcn and more fc layers: higher pick than normal arch, but lr is too high visibly, the loss diverges... + #'agent-id': '3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # with low eps and low lr, new arch: + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + #archi 3: very small eps, minimal fcn + extra input on fcn: + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM3_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + + # Normal archi but with a relu: + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNNLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # normal archi but with a relu + only 1 rnn layer: + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # normal archi but with only 1 rnn layer: + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + #'nbr_actor': 128, + 'nbr_actor': 64, + #'nbr_actor': 32, + #'nbr_actor': 16, + #'nbr_actor': 8, + #'nbr_actor': 1, + #'nbr_frame_skipping': 4, + #'nbr_frame_stacking': 4, + #'grayscale': True, + #'single_life_episode': True, #False, + #'nbr_max_random_steps': 30, + 'sad': False, + #'sad': True, + 'vdn': False, + #'vdn': True, + #"otherplay": True, + "otherplay": False, + 'clip_reward': False, + 'previous_reward_action': True, + #'observation_resize_dim': (21,21), #(56,56), + 'observation_resize_dim': (56,56), + }, + ] + experiment_id: 'r2d2_comaze_debug' + 
benchmarking_episodes: 10 + benchmarking_interval: 1.0e3 + benchmarking_record_episode_interval: 1.0e3 #1.0e20 + #benchmarking_record_episode_interval: 1.0e20 + train_observation_budget: 1.0e7 + #seed: 1 + seed: 2 + +agents: + SAD_IQL_paper_1step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate100Steps_EPSgreedyAPEX1m0_4m1OVER3p4_gamma997_LargeCNNLSTM_GradClip5m1_r2p4Min1e4_alpha9m1_beta6m1_over2e4_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L2_O1_B0: + <<: *r2d2_LargeCNNLSTM_IQL_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + actor_models_update_steps_interval: 100 #considering only 1 actor's steps. + + batch_size: 128 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + discount: 0.997 + gradient_clip: 0.5 + # ...not specified in r2d2 paper but in Ape-X, + # and r2d2 paper says that missing hyper-param + # are the same as ape-X + + replay_capacity: 2e4 #163840 #2e13*20 #5242880 # in terms of experience #1e6 + min_capacity: 1e4 #in terms of experiences... #1e4 + + PER_beta_increase_interval: 2e4 + + double: True + dueling: True + noisy: False + + n_step: 1 + tau: 4.0e-4 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: False + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + + burn_in: False + sequence_replay_unroll_length: 2 + sequence_replay_overlap_length: 1 + sequence_replay_burn_in_length: 0 + + + epsstart: 1.0 + epsend: 0.4 + epsdecay: 30000 #1000000 + + # ape-X and r2d2 keep it constant over each actor + # with a different value eps_i = base_eps**(1+\alpha*i/nbr_actors) + # with base_eps=0.4 and \alpha = 7... + eps_greedy_alpha: 7.0 + + SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone: &SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone + <<: *r2d2_LargeCNNLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + actor_models_update_steps_interval: 1 #considering only 1 actor's steps. + + vdn: True + vdn_nbr_players: 2 + + batch_size: 32 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + discount: 0.997 + gradient_clip: 5.0 + # ...not specified in r2d2 paper but in Ape-X, + # and r2d2 paper says that missing hyper-param + # are the same as ape-X + + replay_capacity: 5e4 #163840 #2e13*20 #5242880 # in terms of experience #1e6 + min_capacity: 2e4 #in terms of experiences... #1e4 + + PER_compute_initial_priority: False + PER_beta_increase_interval: None #2e5 + + double: True + dueling: True + noisy: False + n_step: 3 + tau: 4.0e-4 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: True + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + r2d2_bellman_target_SAD: False + + burn_in: False + sequence_replay_unroll_length: 100 + sequence_replay_overlap_length: 0 + sequence_replay_burn_in_length: 0 + + + epsstart: 1.0 + epsend: 0.05 + epsdecay: 30000 #1000000 + + # ape-X and r2d2 keep it constant over each actor + # with a different value eps_i = base_eps**(1+\alpha*i/nbr_actors) + # with base_eps=0.4 and \alpha = 7... 
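The comment just above (and the `eps_greedy_alpha: 7.0` entry that follows it) describes the Ape-X style per-actor exploration schedule: each actor keeps a fixed epsilon rather than annealing it. A tiny sketch of the values it assigns, using the formula exactly as written in the comment (published variants normalise by `nbr_actors - 1` instead):

```python
def per_actor_epsilons(nbr_actors, base_eps=0.4, alpha=7.0):
    """Fixed per-actor exploration rates: eps_i = base_eps ** (1 + alpha * i / nbr_actors)."""
    return [base_eps ** (1.0 + alpha * i / nbr_actors) for i in range(nbr_actors)]

# per_actor_epsilons(4) -> [0.4, ~0.080, ~0.016, ~0.0032]
# i.e. actor 0 explores heavily while the last actors act almost greedily.
```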
+ eps_greedy_alpha: 7.0 + + # New archi with relu on rnn: + #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # with higher lr and lower eps for adam opt: + #3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # only lower eps: + #3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # even lower eps: best version so far! BEST: + #3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #burn-in? + #3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_BURNIN_b128_L20_O10_B10_NOZeroInitSt_OnlineSt_StoreOnDone: + # increase nstep return? + #7step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + + # VDN: + #3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # With agent id: + #3step_VDN_1m3Ent_aID_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + + #3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #3step_SAD_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # With agent id: + #3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # arch 2: + #3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # arch 2 + res. 
con.: + #3step_SAD_VDN_aID_NoEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # replay buffer 5p4 -> 1p5 + 3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # seq 100 + #3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #3step_SAD_VDN_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_ovrN_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + + # arch 3 with very low eps: minimal arch + #3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM3_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + + # New architecture with more FC layers and extra inputs are feed to those (rather than rnn): + #3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # with low eps and low lr: + #3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # old archi with relu on rnn + #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNNLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # old archi with relu on rnn + only one rnn layer: + #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # old archi with only on rnn layer: + #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + <<: *SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone + # DEBUG: + min_capacity: 1e3 + #weights_entropy_lambda: 0.0 + #weights_entropy_lambda: 0.1 + weights_entropy_lambda: 0.001 #01 + + vdn: False + #vdn: True + vdn_nbr_players: 2 + sad: False + #sad: True + + + learning_rate: 6.25e-5 + 
#adam_eps: 1.5e-5 + #learning_rate: 1.0e-3 + #adam_eps: 1.0e-8 + adam_eps: 1.0e-12 + #adam_eps: 1.0e-15 + + #replay_capacity: 5e4 #163840 #2e13*20 #5242880 # in terms of experience #1e6 + replay_capacity: 1e5 + min_capacity: 2e4 #in terms of experiences... #1e4 + + n_step: 3 + #n_step: 7 + + #tau: 4.0e-4 + #tau: 1.0e-5 + + #sequence_replay_overlap_length: 0 + #sequence_replay_overlap_length: 50 + + batch_size: 128 + + burn_in: False + #burn_in: True + + sequence_replay_unroll_length: 20 + sequence_replay_overlap_length: 10 + sequence_replay_burn_in_length: 0 + # #sequence_replay_burn_in_length: 10 + + # sequence_replay_unroll_length: 100 + # sequence_replay_overlap_length: 50 + # sequence_replay_burn_in_length: 0 + + epsend: 0.4 + eps_greedy_alpha: 2.0 + + # Architecture: + #critic_arch: 'LSTM-RNN' + #critic_arch_hidden_units: [512, 512] + #critic_arch_hidden_units: [512] + #use_relu_after_rnn: False + + # normal arch: + # critic_arch: 'MLP-LSTM-RNN' + # use_relu_after_rnn: True + # #use_relu_after_rnn: False + # critic_arch_feature_dim: 512 + # critic_arch_hidden_units: [512] + + # Arch2: + critic_arch: 'MLP-LSTM-RNN2' + use_relu_after_rnn: False #True + use_residual_connection: True + critic_arch_linear_hidden_units: [512, 256] + critic_arch_feature_dim: 128 + critic_arch_hidden_units: [128, 128] + + # Arch 3: + # critic_arch: 'MLP-LSTM-RNN2' + # use_relu_after_rnn: True + # critic_arch_linear_hidden_units: [128] + # critic_arch_feature_dim: 64 + # critic_arch_hidden_units: [64] + + #Arch 4: + # critic_arch: 'MLP-LSTM-RNN2' + # use_relu_after_rnn: True + # critic_arch_linear_hidden_units: [512, 256] + # critic_arch_hidden_units: [256] + # critic_arch_linear_post_hidden_units: [256] + # critic_arch_feature_dim: 128 + + extra_inputs_infos: { + 'previous_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'action_mask':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'communication_channel':{ + #shape: [4], + shape: [21], + #shape: [8], + #shape: [11,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'secret_goal_rule':{ + shape: [8,], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + # WITH AGENT_ID: + ######################## + 'agent_id':{ + shape: [2,], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + ######################## + ######################## + # WITH SAD: + ######################## + 'greedy_action':{ + #shape: [23], + shape: [108], + #shape: [43], + #shape: [58,], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + ######################## + 'legal_actions':{ + shape: ['task.action_dim',], + target_location: ['head', 'extra_inputs'] + }, + + } + + + + + \ No newline at end of file diff --git a/benchmark/R2D2/CoMaze/comaze_cross_play_test_config.yaml b/benchmark/R2D2/CoMaze/comaze_cross_play_test_config.yaml new file mode 100644 index 00000000..05496ac7 --- /dev/null +++ b/benchmark/R2D2/CoMaze/comaze_cross_play_test_config.yaml @@ -0,0 +1,24 @@ +# NOTE: relative path +population_path: 'random_agents/' +save_path: './cross_play_matrix.pickle' +agents: { + 'comm_RB_seed1': 1, + 'comm_RB_seed2': 2, + 'comm_RB_seed3': 3, + } +num_games_per_matchup: 10 +num_matrices: 1 + +seed: 111 +task: { + 'env-id': 'CoMaze-9x9-Dense-Level5-UniformSecrets-v0', + + 'run-id': 
'serial/crossplay/WithBN/ScalingFN_EPS1m3/Seed1_venv64_r2d2_EntropyReg0_WeightDecayReg0/', + 'agent-id': 'SAD_SAD_paper_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_gamma997_LargeCNNLSTM_GradClip5m1_r5p4Min2e4_alpha9m1_beta6m1_over2e4_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0', + + 'nbr_actor': 64, + 'sad': True, + 'clip_reward': False, + 'previous_reward_action': True, + 'observation_resize_dim': (56,56), + } diff --git a/benchmark/R2D2/CoMaze/comaze_r2d2_benchmark_config.yaml b/benchmark/R2D2/CoMaze/comaze_r2d2_benchmark_config.yaml index 23727553..1fee6a46 100644 --- a/benchmark/R2D2/CoMaze/comaze_r2d2_benchmark_config.yaml +++ b/benchmark/R2D2/CoMaze/comaze_r2d2_benchmark_config.yaml @@ -244,7 +244,11 @@ r2d2_LargeCNNLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_LargeCNN experiment: tasks: [{ #'env-id': 'CoMaze-7x7-Dense-v0', - 'env-id': 'CoMaze-7x7-Dense-Level4-v0', + #'env-id': 'CoMaze-7x7-Dense-Level4-v0', + #'env-id': 'CoMaze-7x7-Dense-Level5-v0', + #'env-id': 'CoMaze-9x9-Dense-Level5-v0', + #'env-id': 'CoMaze-9x9-Dense-Level5-EasySecrets-v0', + 'env-id': 'CoMaze-9x9-Dense-Level5-HardSecrets-v0', #'env-id': 'CoMaze-7x7-Dense-SinglePlayerReward-v0', #'env-id': 'CoMaze-7x7-Dense-SinglePlayerReward-Level4-v0', #'env-id': 'CoMaze-7x7-Dense-SinglePlayerReward-Level4-FixedSecretGoalRules-v0', @@ -271,7 +275,9 @@ experiment: #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/debugVDN23-debug1VDNLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', #'run-id': 'serial/selfplay/400MaxSteps/NOSAD/debugVDNLoss+SumBeforeLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', - 'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1/debugOP/debugSAD/debugVDNLoss+SumBeforeLoss-LossTermReg/NoBN/WithReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + #'run-id': 'serial/selfplay/REPDebugVecEnvInitRZeroed/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/OP-VocabOnly/debugSAD/debugVDNLoss+SumBeforeLoss-LossTermCombReg/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + 'run-id': 'serial/DEBUGRULEBASED+CIC/RL+Comm+DebugVDN+DebugSAD+ListeningBiasing/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/NOOP/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/400MaxSteps/NOSAD/debugNOVDNLoss+NOSumBeforeLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', @@ -331,8 +337,15 @@ experiment: # seq 100: #'agent-id': '3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone', # seq 20: - 'agent-id': '3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', - # SAD+VDN: plateauing at 8 in singleplayerreward + 
#'agent-id': '3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # arch 2: + #'agent-id': '3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # + res con : + # definitively an issue with entroppy dropping out... + #'agent-id': '3step_aID_NoEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + #'agent-id': '3step_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + 'agent-id': '3step_SAD_VDN_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + #SAD+VDN: plateauing at 8 in singleplayerreward #'agent-id': '3step_SAD_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', # + agent id: does not provide any help compared to SAD: @@ -375,8 +388,8 @@ experiment: 'sad': True, #'vdn': False, 'vdn': True, - "otherplay": True, - #"otherplay": False, + #"otherplay": True, + "otherplay": False, 'clip_reward': False, 'previous_reward_action': True, #'observation_resize_dim': (21,21), #(56,56), @@ -513,7 +526,13 @@ agents: #3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone: #3step_SAD_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: # With agent id: - 3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # arch 2: + #3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # arch 2 + res. 
con.: + #3step_aID_NoEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #3step_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + 3step_SAD_VDN_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: # seq 100 #3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone: #3step_SAD_VDN_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_ovrN_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: @@ -534,8 +553,8 @@ agents: <<: *SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone # DEBUG: min_capacity: 1e3 - weights_entropy_lambda: 0.0 - #weights_entropy_lambda: 0.1 + #weights_entropy_lambda: 0.0 + weights_entropy_lambda: 0.1 #weights_entropy_lambda: 0.001 #01 #vdn: False @@ -586,18 +605,19 @@ agents: #use_relu_after_rnn: False # normal arch: - critic_arch: 'MLP-LSTM-RNN' - use_relu_after_rnn: True - #use_relu_after_rnn: False - critic_arch_feature_dim: 512 - critic_arch_hidden_units: [512] + # critic_arch: 'MLP-LSTM-RNN' + # use_relu_after_rnn: True + # #use_relu_after_rnn: False + # critic_arch_feature_dim: 512 + # critic_arch_hidden_units: [512] # Arch2: - # critic_arch: 'MLP-LSTM-RNN2' - # use_relu_after_rnn: True - # critic_arch_linear_hidden_units: [512, 256] - # critic_arch_feature_dim: 128 - # critic_arch_hidden_units: [128] + critic_arch: 'MLP-LSTM-RNN2' + use_relu_after_rnn: False #True + use_residual_connection: True + critic_arch_linear_hidden_units: [512, 256] + critic_arch_feature_dim: 128 + critic_arch_hidden_units: [128, 128] # Arch 3: # critic_arch: 'MLP-LSTM-RNN2' @@ -628,7 +648,10 @@ agents: target_location: ['critic_body', 'extra_inputs'] }, 'communication_channel':{ - shape: [11,], + #shape: [4], + shape: [21], + #shape: [8], + #shape: [11,], target_location: ['critic_body', 'extra_inputs'] }, 'secret_goal_rule':{ @@ -647,10 +670,13 @@ agents: ######################## # WITH SAD: ######################## - 'greedy_action':{ - shape: [58,], - target_location: ['critic_body', 'extra_inputs'] - }, + # 'greedy_action':{ + # #shape: [23], + # shape: [108], + # #shape: [43], + # #shape: [58,], + # target_location: ['critic_body', 'extra_inputs'] + # }, ######################## ######################## 'legal_actions':{ diff --git a/benchmark/R2D2/CoMaze/comaze_r2d2_sad_vdn_benchmark_config.yaml b/benchmark/R2D2/CoMaze/comaze_r2d2_sad_vdn_benchmark_config.yaml new file mode 100644 index 00000000..e8e0eba8 --- /dev/null +++ b/benchmark/R2D2/CoMaze/comaze_r2d2_sad_vdn_benchmark_config.yaml @@ -0,0 +1,715 @@ +extra_hyperparameters: &extra_hyperparameters + 
lr_account_for_nbr_actor: False + weights_decay_lambda: 0.0 + weights_entropy_lambda: 0.0 #01 + use_target_to_gather_data: False + + #################################### + # New hyperparameters: + PER_compute_initial_priority: False + ##################################### + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: True + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + r2d2_bellman_target_SAD: False + + burn_in: True + sequence_replay_unroll_length: 80 + sequence_replay_overlap_length: 40 + sequence_replay_burn_in_length: 20 + + sequence_replay_PER_eta: 0.9 + + vdn: False + vdn_nbr_players: 2 + +LargeCNNMLP: &LargeCNNMLP + phi_arch: 'CNN' #-LSTM-RNN' + actor_arch: 'None' + critic_arch: 'LSTM-RNN' + + # Phi Body: + phi_arch_channels: ['BN32', 'BN64', 'BN64'] + phi_arch_kernels: [8, 4, 3] + phi_arch_strides: [4, 2, 1] + phi_arch_paddings: [1, 1, 1] + phi_arch_feature_dim: 512 + phi_arch_hidden_units: [] + + extra_inputs_infos: { + 'previous_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'action_mask':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'communication_channel':{ + shape: [11,], + target_location: ['critic_body', 'extra_inputs'] + },'secret_goal_rule':{ + shape: [8,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'legal_actions':{ + shape: ['task.action_dim',], + target_location: ['head', 'extra_inputs'] + }, + + } + + # Dictionnaries of keys living inside the 'infos' OpenAI Gym's output. + # Value is a tuple where the first element is the expected shape of the extra input, + # and the second item is the location where the input should be stored in the framestate. + # Parsing of the shape will infer where to fetch the value when encountering a string. + + # Actor architecture: + actor_arch_hidden_units: [] + # Critic architecture: + #critic_arch_feature_dim: 32 + critic_arch_hidden_units: [512, 512] + +LargeCNNMLP_SAD: &LargeCNNMLP_SAD + sad: True + + phi_arch: 'CNN' #-LSTM-RNN' + actor_arch: 'None' + critic_arch: 'LSTM-RNN' + + # Phi Body: + #phi_arch_channels: ['BN32', 'BN64', 'BN64'] + phi_arch_channels: [32, 64, 64] + #phi_arch_kernels: [8, 4, 3] + phi_arch_kernels: [3, 3, 3] + #phi_arch_strides: [4, 2, 1] + phi_arch_strides: [2, 2, 1] + phi_arch_paddings: [1, 1, 1] + phi_arch_feature_dim: 512 + phi_arch_hidden_units: [] + + extra_inputs_infos: { + 'previous_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'action_mask':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'communication_channel':{ + shape: [11,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'secret_goal_rule':{ + shape: [8,], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + # WITH SAD: + ######################## + 'greedy_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + ######################## + 'legal_actions':{ + shape: ['task.action_dim',], + target_location: ['head', 'extra_inputs'] + }, + + } + + # Dictionnaries of keys living inside the 'infos' OpenAI Gym's output. 
+ # Value is a tuple where the first element is the expected shape of the extra input, + # and the second item is the location where the input should be stored in the framestate. + # Parsing of the shape will infer where to fetch the value when encountering a string. + + # Actor architecture: + actor_arch_hidden_units: [] + # Critic architecture: + #critic_arch_feature_dim: 32 + critic_arch_hidden_units: [512, 512] + + +r2d2_LargeCNNLSTM_IQL_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_LargeCNNLSTM_IQL_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + observation_resize_dim: 56 + + dueling: True + noisy: False + n_step: 3 + + use_PER: True + PER_alpha: 0.9 + PER_beta: 0.6 + + replay_capacity: 5242880 # in terms of experience #1e6 + min_capacity: 4e5 #in terms of experiences... #1e4 + replay_period: 1 + + actor_models_update_steps_interval: 10 #considering only 1 actor's steps. + + discount: 0.999 + use_cuda: True + gradient_clip: 0.5 + batch_size: 128 + tau: 4.0e-4 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + + epsstart: 1.0 + epsend: 0.1 + epsdecay: 10000 + eps_greedy_alpha: 7.0 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: False + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + + + burn_in: False + sequence_replay_unroll_length: 40 + sequence_replay_overlap_length: 10 + sequence_replay_burn_in_length: 0 + + sequence_replay_PER_eta: 0.9 + + <<: *LargeCNNMLP + <<: *extra_hyperparameters + +r2d2_LargeCNNLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_LargeCNNLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + observation_resize_dim: 21 #56 + + dueling: True + noisy: False + n_step: 3 + + use_PER: True + PER_alpha: 0.9 + PER_beta: 0.6 + + replay_capacity: 5242880 # in terms of experience #1e6 + min_capacity: 4e5 #in terms of experiences... #1e4 + replay_period: 1 + + actor_models_update_steps_interval: 10 #considering only 1 actor's steps. 
+ + discount: 0.999 + use_cuda: True + gradient_clip: 0.5 + batch_size: 128 + tau: 4.0e-4 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + + epsstart: 1.0 + epsend: 0.1 + epsdecay: 10000 + eps_greedy_alpha: 7.0 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: False + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + + burn_in: False + sequence_replay_unroll_length: 40 + sequence_replay_overlap_length: 10 + sequence_replay_burn_in_length: 0 + + sequence_replay_PER_eta: 0.9 + + <<: *LargeCNNMLP_SAD + <<: *extra_hyperparameters + + +experiment: + tasks: [{ + #'env-id': 'CoMaze-7x7-Dense-v0', + #'env-id': 'CoMaze-7x7-Dense-Level4-v0', + #'env-id': 'CoMaze-7x7-Dense-Level5-v0', + #'env-id': 'CoMaze-9x9-Dense-Level5-v0', + #'env-id': 'CoMaze-9x9-Dense-Level5-EasySecrets-v0', + #'env-id': 'CoMaze-9x9-Dense-Level5-HardSecrets-v0', + 'env-id': 'CoMaze-9x9-Dense-Level5-UniformSecrets-v0', + #'env-id': 'CoMaze-7x7-Dense-SinglePlayerReward-v0', + #'env-id': 'CoMaze-7x7-Dense-SinglePlayerReward-Level4-v0', + #'env-id': 'CoMaze-7x7-Dense-SinglePlayerReward-Level4-FixedSecretGoalRules-v0', + #'env-id': 'CoMaze-7x7-Dense-FixedActions-v0', + #'env-id': 'CoMaze-7x7-Dense-SinglePlayer-v0', + #'env-id': 'CoMaze-7x7-Dense-Easy-SinglePlayer-v0', + + #'run-id': 'serial/selfplay/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed1_venv64_r2d2_Obs21_EntropyReg0_WeightDecayReg0/', + #'run-id': 'serial/selfplay/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + # More actors? fails to learn: + # maybe the trajectories are being replaced too fast in the replay-buffer: + # the replay occurrence for each trajectory is reduced... + # Reducing nbr actors? /2 should give 2*replay_ratio: does it have any effect? greater data-efficiency + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv32_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + # What about 16 actors then? /4 ==> 4*replay_ratio : great data-efficiency!!! + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv16_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + # What about 64 with fixed actions (multiplayer)? + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + + #VDN: FixedAction + # What about 64 with fixed actions (multiplayer)? 
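The agent hyperparameter blocks in this file are composed through YAML anchors and merge keys: `&name` defines a reusable mapping, `<<: *name` splices it into another block, and keys written out explicitly in the receiving block override the merged defaults. A self-contained PyYAML check of that precedence (the keys in the snippet are illustrative, not copied from this config):

```python
import yaml

snippet = """
defaults: &defaults
  batch_size: 128
  learning_rate: 6.25e-5
  use_cuda: False

agent:
  <<: *defaults
  use_cuda: True        # explicit key overrides the merged default
"""

config = yaml.safe_load(snippet)
assert config["agent"]["batch_size"] == 128   # inherited through the merge key
assert config["agent"]["use_cuda"] is True    # local override wins
print(config["agent"])
```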
+ #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/debugVDN23/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/debugVDN23-debug1VDNLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + #'run-id': 'serial/selfplay/400MaxSteps/NOSAD/debugVDNLoss+SumBeforeLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + # NOOP: + #'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/NOOP/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv100_r2d2_Obs56_EntropyReg0_WeightDecayReg0', + #'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/NOOP/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0', + #'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/NOOP/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0', + # OP: + #'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/OP/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0', + #'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/OP-VocabOnly/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0', + # + 'run-id': '100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-Vocab20/venv64', + + + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/400MaxSteps/NOSAD/debugNOVDNLoss+NOSumBeforeLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + # Reducing nbr actors? /2 should give 2*replay_ratio: does it have any effect? greater data-efficiency + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/VDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv32_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + # What about 16 actors then? ? + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/VDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv16_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + # Observation space? Visibly better! + #'run-id': 'serial/selfplay/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/SeedRep1_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + #'run-id': 'serial/selfplay/testRecording/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed1_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + # Reducing nbr actor to increase experience replay time: /8 ==> replay-occurences*8? + #'run-id': 'serial/selfplay/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed1_venv8_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # Is the overlap useful? 
no + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # Is a different sequence length better? + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # More exploration? + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # Bigger batch-size? + # Different explo: not sufficient, but at least there is maxtrainingreward remains high... + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + # Different archi with relu after rnns? not clear yet whether archi or relu... + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # The above arch is definitively learning as there are picks of in mean episode length occuring. + # Let us try to make it learn faster then: increasing lr and decreasing eps: + #'agent-id': '3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # What about only decreasing eps: the most important element so far!!!!! + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # normal archi but even lower eps: huge gains!!!!! + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + # Increasing nstep return? does not show specific improvement... + #'agent-id': '7step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # burn-in? very sharp progress, but requires more update (since some values are not used in the loss...) + # early catastrophic forgetting phenomenon...! 
+ #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_BURNIN_b128_L20_O10_B10_NOZeroInitSt_OnlineSt_StoreOnDone', + + # VDN: + # normal arch: + #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # with entropy weight 0.1: + # Stable learning ! but cannot learn to communicate... + #'agent-id': '3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # + agent-id observation: + #'agent-id': '3step_VDN_1m1Ent_aID_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + #'agent-id': '3step_VDN_1m3Ent_aID_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # Trying to learn to communicate by enlarging the seq len: + #'agent-id': '3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + # SAD only: + #'agent-id': '3step_SAD_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # seq 100: + #'agent-id': '3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # seq 20: + #'agent-id': '3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # arch 2: + #'agent-id': '3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # + res con : + #'agent-id': '3step_SAD_VDN_aID_NoEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # Biasing messes up with action policy entropy: + 'agent-id': '3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # Thus, the following increase in the 
lambda value aims to address this: + #'agent-id': '3step_SAD_VDN_aID_1m0Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + #SAD+VDN: plateauing at 8 in singleplayerreward + #'agent-id': '3step_SAD_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + # + agent id: does not provide any help compared to SAD: + #'agent-id': '3step_SAD_VDN_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_ovrN_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # arch 4: + #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM4_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m15_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m15_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau1m5_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + + # New archi with extra input via fcn and more fc layers: higher pick than normal arch, but lr is too high visibly, the loss diverges... 
+ #'agent-id': '3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # with low eps and low lr, new arch: + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + #archi 3: very small eps, minimal fcn + extra input on fcn: + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM3_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + + # Normal archi but with a relu: + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNNLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # normal archi but with a relu + only 1 rnn layer: + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # normal archi but with only 1 rnn layer: + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + #'nbr_actor': 128, + #'nbr_actor': 100, + 'nbr_actor': 64, + #'nbr_actor': 32, + #'nbr_actor': 16, + #'nbr_actor': 8, + #'nbr_frame_skipping': 4, + #'nbr_frame_stacking': 4, + #'grayscale': True, + #'single_life_episode': True, #False, + #'nbr_max_random_steps': 30, + #'sad': False, + 'sad': True, + #'vdn': False, + 'vdn': True, + #"otherplay": True, + "otherplay": False, + 'clip_reward': False, + 'previous_reward_action': True, + #'observation_resize_dim': (21,21), #(56,56), + 'observation_resize_dim': 56, #(56,56), + # + 'reload': 'None', + # NOOP: + #"reload": "/home/kevin/debug_ray/r2d2_comaze_debug/CoMaze-9x9-Dense-Level5-UniformSecrets-v0/serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/NOOP/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone/TRAINING/PUBSUB/./3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone.agent", + # OP: do be deprecated... 
+ #'reload': '/home/kevin/debug_ray/r2d2_comaze_debug/CoMaze-9x9-Dense-Level5-UniformSecrets-v0/serial/PUBSUBDEBUG/selfplay/REPDebugVecEnvInitRZeroed/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/OP/debugPosList/debugSAD/debugVDNLoss+SumBeforeLoss-LossTermCombReg/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/TEST+DEBUGExpSumRLPolicy/NoMessEntropy+CorrectPosTargetEntropy+Masking+MeanEntReg/CorrectRuleBasedAgentInnerStateManag/3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone/GoalOrderingPred-NoDropout+RulesPrediction+BigArch/./3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone.agent', + }, + ] + #experiment_id: 'r2d2_comaze_debug2' + experiment_id: 'r2d2_comaze_data' + benchmarking_episodes: 10 + benchmarking_interval: 1.0e3 + benchmarking_record_episode_interval: 'None' #1.0e1 #1.0e20 + #benchmarking_record_episode_interval: 1.0e20 + #train_observation_budget: 1.0e7 + train_observation_budget: 3.0e6 + seed: 1 + +agents: + SAD_IQL_paper_1step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate100Steps_EPSgreedyAPEX1m0_4m1OVER3p4_gamma997_LargeCNNLSTM_GradClip5m1_r2p4Min1e4_alpha9m1_beta6m1_over2e4_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L2_O1_B0: + <<: *r2d2_LargeCNNLSTM_IQL_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + actor_models_update_steps_interval: 100 #considering only 1 actor's steps. + + batch_size: 128 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + discount: 0.997 + gradient_clip: 0.5 + # ...not specified in r2d2 paper but in Ape-X, + # and r2d2 paper says that missing hyper-param + # are the same as ape-X + + replay_capacity: 2e4 #163840 #2e13*20 #5242880 # in terms of experience #1e6 + min_capacity: 1e4 #in terms of experiences... #1e4 + + PER_beta_increase_interval: 2e4 + + double: True + dueling: True + noisy: False + + n_step: 1 + tau: 4.0e-4 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: False + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + + burn_in: False + sequence_replay_unroll_length: 2 + sequence_replay_overlap_length: 1 + sequence_replay_burn_in_length: 0 + + + epsstart: 1.0 + epsend: 0.4 + epsdecay: 30000 #1000000 + + # ape-X and r2d2 keep it constant over each actor + # with a different value eps_i = base_eps**(1+\alpha*i/nbr_actors) + # with base_eps=0.4 and \alpha = 7... + eps_greedy_alpha: 7.0 + + SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone: &SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone + <<: *r2d2_LargeCNNLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + actor_models_update_steps_interval: 1 #considering only 1 actor's steps. 
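+    # Note (hedged, illustrative only; not regym API): the per-actor epsilon
+    # schedule referred to by the eps_greedy_alpha comments in this file,
+    # eps_i = base_eps**(1 + alpha*i/nbr_actors) with base_eps=0.4 and alpha=7,
+    # can be sketched in Python as follows; the function name is made up here:
+    #
+    #   def actor_epsilon(i, nbr_actors, base_eps=0.4, alpha=7.0):
+    #       # Higher actor index => larger exponent => smaller epsilon,
+    #       # i.e. later actors explore less.
+    #       return base_eps ** (1.0 + alpha * i / nbr_actors)
+    #
+    #   # e.g. with 4 actors: approximately [0.4, 0.08, 0.016, 0.003]
+    #   [actor_epsilon(i, nbr_actors=4) for i in range(4)]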
+ + vdn: True + vdn_nbr_players: 2 + + batch_size: 32 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + discount: 0.997 + gradient_clip: 5.0 + # ...not specified in r2d2 paper but in Ape-X, + # and r2d2 paper says that missing hyper-param + # are the same as ape-X + + replay_capacity: 5e4 #163840 #2e13*20 #5242880 # in terms of experience #1e6 + min_capacity: 2e4 #in terms of experiences... #1e4 + + PER_compute_initial_priority: False + PER_beta_increase_interval: None #2e5 + + double: True + dueling: True + noisy: False + n_step: 3 + tau: 4.0e-4 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: True + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + r2d2_bellman_target_SAD: False + + burn_in: False + sequence_replay_unroll_length: 100 + sequence_replay_overlap_length: 0 + sequence_replay_burn_in_length: 0 + + + epsstart: 1.0 + epsend: 0.05 + epsdecay: 30000 #1000000 + + # ape-X and r2d2 keep it constant over each actor + # with a different value eps_i = base_eps**(1+\alpha*i/nbr_actors) + # with base_eps=0.4 and \alpha = 7... + eps_greedy_alpha: 7.0 + + # New archi with relu on rnn: + #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # with higher lr and lower eps for adam opt: + #3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # only lower eps: + #3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # even lower eps: best version so far! BEST: + #3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #burn-in? + #3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_BURNIN_b128_L20_O10_B10_NOZeroInitSt_OnlineSt_StoreOnDone: + # increase nstep return? 
+ #7step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + + # VDN: + #3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # With agent id: + #3step_VDN_1m3Ent_aID_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + + #3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #3step_SAD_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # With agent id: + #3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # arch 2: + #3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # arch 2 + res. 
con.: + #3step_SAD_VDN_aID_NoEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # replay buffer 5p4 -> 1p5 + 3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # seq 100 + #3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #3step_SAD_VDN_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_ovrN_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + + # arch 3 with very low eps: minimal arch + #3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM3_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + + # New architecture with more FC layers and extra inputs are feed to those (rather than rnn): + #3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # with low eps and low lr: + #3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # old archi with relu on rnn + #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNNLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # old archi with relu on rnn + only one rnn layer: + #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # old archi with only on rnn layer: + #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + <<: *SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone + # DEBUG: + #min_capacity: 1e3 + + #weights_entropy_lambda: 0.0 + #weights_entropy_lambda: 1.0 + weights_entropy_lambda: 0.001 #01 + + #vdn: False + vdn: True + vdn_nbr_players: 2 + #sad: False + sad: True + + + learning_rate: 6.25e-5 + 
#adam_eps: 1.5e-5 + #learning_rate: 1.0e-3 + #adam_eps: 1.0e-8 + adam_eps: 1.0e-12 + #adam_eps: 1.0e-15 + + #replay_capacity: 5e4 #163840 #2e13*20 #5242880 # in terms of experience #1e6 + replay_capacity: 1e5 + min_capacity: 3e4 #in terms of experiences... #1e4 + + n_step: 3 + #n_step: 7 + + #tau: 4.0e-4 + #tau: 1.0e-5 + + #sequence_replay_overlap_length: 0 + #sequence_replay_overlap_length: 50 + + batch_size: 128 + + burn_in: False + #burn_in: True + + sequence_replay_unroll_length: 20 + sequence_replay_overlap_length: 10 + sequence_replay_burn_in_length: 0 + # #sequence_replay_burn_in_length: 10 + + # sequence_replay_unroll_length: 100 + # sequence_replay_overlap_length: 50 + # sequence_replay_burn_in_length: 0 + + epsend: 0.4 + eps_greedy_alpha: 2.0 + + # Architecture: + #critic_arch: 'LSTM-RNN' + #critic_arch_hidden_units: [512, 512] + #critic_arch_hidden_units: [512] + #use_relu_after_rnn: False + + # normal arch: + # critic_arch: 'MLP-LSTM-RNN' + # use_relu_after_rnn: True + # #use_relu_after_rnn: False + # critic_arch_feature_dim: 512 + # critic_arch_hidden_units: [512] + + # Arch2: + critic_arch: 'MLP-LSTM-RNN2' + use_relu_after_rnn: False #True + use_residual_connection: True + critic_arch_linear_hidden_units: [512, 256] + critic_arch_feature_dim: 128 + critic_arch_hidden_units: [128, 128] + + # Arch 3: + # critic_arch: 'MLP-LSTM-RNN2' + # use_relu_after_rnn: True + # critic_arch_linear_hidden_units: [128] + # critic_arch_feature_dim: 64 + # critic_arch_hidden_units: [64] + + #Arch 4: + # critic_arch: 'MLP-LSTM-RNN2' + # use_relu_after_rnn: True + # critic_arch_linear_hidden_units: [512, 256] + # critic_arch_hidden_units: [256] + # critic_arch_linear_post_hidden_units: [256] + # critic_arch_feature_dim: 128 + + extra_inputs_infos: { + 'previous_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'action_mask':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'communication_channel':{ + #shape: [4], + shape: [21], + #shape: [8], + #shape: [11,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'secret_goal_rule':{ + shape: [8,], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + # WITH AGENT_ID: + ######################## + 'agent_id':{ + shape: [2,], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + ######################## + ######################## + # WITH SAD: + ######################## + 'greedy_action':{ + #shape: [23], + shape: [108], + #shape: [43], + #shape: [58,], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + ######################## + 'legal_actions':{ + shape: ['task.action_dim',], + target_location: ['head', 'extra_inputs'] + }, + + } + + + + + diff --git a/benchmark/R2D2/CoMaze/comaze_test_r2d2_sad_vdn_benchmark_config.yaml b/benchmark/R2D2/CoMaze/comaze_test_r2d2_sad_vdn_benchmark_config.yaml new file mode 100644 index 00000000..7ac51f67 --- /dev/null +++ b/benchmark/R2D2/CoMaze/comaze_test_r2d2_sad_vdn_benchmark_config.yaml @@ -0,0 +1,714 @@ +extra_hyperparameters: &extra_hyperparameters + lr_account_for_nbr_actor: False + weights_decay_lambda: 0.0 + weights_entropy_lambda: 0.0 #01 + use_target_to_gather_data: False + + #################################### + # New hyperparameters: + PER_compute_initial_priority: False + ##################################### + + 
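+  # Hedged reading of the sequence-replay settings below (an interpretation,
+  # not a description of regym's storage code): replayed sequences are
+  # sequence_replay_unroll_length transitions long, consecutive sequences
+  # share sequence_replay_overlap_length transitions, and when burn_in is True
+  # the first sequence_replay_burn_in_length steps of each sequence are only
+  # used to warm up the recurrent state, i.e. they are excluded from the loss.
+  # For example, with unroll 80 / overlap 40 / burn-in 20, sequence starts are
+  # spaced 40 steps apart and, per sequence, roughly
+  #   loss_steps = range(sequence_replay_burn_in_length, sequence_replay_unroll_length)  # steps 20..79
+  # contribute to the R2D2 loss, which matches the earlier comment that
+  # burn-in needs more updates since some values are not used in the loss.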
sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: True + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + r2d2_bellman_target_SAD: False + + burn_in: True + sequence_replay_unroll_length: 80 + sequence_replay_overlap_length: 40 + sequence_replay_burn_in_length: 20 + + sequence_replay_PER_eta: 0.9 + + vdn: False + vdn_nbr_players: 2 + +LargeCNNMLP: &LargeCNNMLP + phi_arch: 'CNN' #-LSTM-RNN' + actor_arch: 'None' + critic_arch: 'LSTM-RNN' + + # Phi Body: + phi_arch_channels: ['BN32', 'BN64', 'BN64'] + phi_arch_kernels: [8, 4, 3] + phi_arch_strides: [4, 2, 1] + phi_arch_paddings: [1, 1, 1] + phi_arch_feature_dim: 512 + phi_arch_hidden_units: [] + + extra_inputs_infos: { + 'previous_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'action_mask':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'communication_channel':{ + shape: [11,], + target_location: ['critic_body', 'extra_inputs'] + },'secret_goal_rule':{ + shape: [8,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'legal_actions':{ + shape: ['task.action_dim',], + target_location: ['head', 'extra_inputs'] + }, + + } + + # Dictionnaries of keys living inside the 'infos' OpenAI Gym's output. + # Value is a tuple where the first element is the expected shape of the extra input, + # and the second item is the location where the input should be stored in the framestate. + # Parsing of the shape will infer where to fetch the value when encountering a string. + + # Actor architecture: + actor_arch_hidden_units: [] + # Critic architecture: + #critic_arch_feature_dim: 32 + critic_arch_hidden_units: [512, 512] + +LargeCNNMLP_SAD: &LargeCNNMLP_SAD + sad: True + + phi_arch: 'CNN' #-LSTM-RNN' + actor_arch: 'None' + critic_arch: 'LSTM-RNN' + + # Phi Body: + #phi_arch_channels: ['BN32', 'BN64', 'BN64'] + phi_arch_channels: [32, 64, 64] + #phi_arch_kernels: [8, 4, 3] + phi_arch_kernels: [3, 3, 3] + #phi_arch_strides: [4, 2, 1] + phi_arch_strides: [2, 2, 1] + phi_arch_paddings: [1, 1, 1] + phi_arch_feature_dim: 512 + phi_arch_hidden_units: [] + + extra_inputs_infos: { + 'previous_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'action_mask':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'communication_channel':{ + shape: [11,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'secret_goal_rule':{ + shape: [8,], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + # WITH SAD: + ######################## + 'greedy_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + ######################## + 'legal_actions':{ + shape: ['task.action_dim',], + target_location: ['head', 'extra_inputs'] + }, + + } + + # Dictionnaries of keys living inside the 'infos' OpenAI Gym's output. + # Value is a tuple where the first element is the expected shape of the extra input, + # and the second item is the location where the input should be stored in the framestate. + # Parsing of the shape will infer where to fetch the value when encountering a string. 
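+  # Illustrative sketch only (the helper below is hypothetical, not regym's
+  # actual parser): as stated above, a string entry in 'shape' such as
+  # 'task.action_dim' is resolved against the task before the extra-input
+  # tensor is built, e.g.:
+  #
+  #   def resolve_shape(shape, task):
+  #       # Replace string placeholders like 'task.action_dim' with the
+  #       # integer attribute of the same name on the task object.
+  #       return [getattr(task, s.split('.', 1)[1]) if isinstance(s, str) else s
+  #               for s in shape]
+  #
+  #   resolve_shape(['task.action_dim'], task)  # -> [task.action_dim]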
+ + # Actor architecture: + actor_arch_hidden_units: [] + # Critic architecture: + #critic_arch_feature_dim: 32 + critic_arch_hidden_units: [512, 512] + + +r2d2_LargeCNNLSTM_IQL_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_LargeCNNLSTM_IQL_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + observation_resize_dim: 56 + + dueling: True + noisy: False + n_step: 3 + + use_PER: True + PER_alpha: 0.9 + PER_beta: 0.6 + + replay_capacity: 5242880 # in terms of experience #1e6 + min_capacity: 4e5 #in terms of experiences... #1e4 + replay_period: 1 + + actor_models_update_steps_interval: 10 #considering only 1 actor's steps. + + discount: 0.999 + use_cuda: True + gradient_clip: 0.5 + batch_size: 128 + tau: 4.0e-4 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + + epsstart: 1.0 + epsend: 0.1 + epsdecay: 10000 + eps_greedy_alpha: 7.0 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: False + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + + + burn_in: False + sequence_replay_unroll_length: 40 + sequence_replay_overlap_length: 10 + sequence_replay_burn_in_length: 0 + + sequence_replay_PER_eta: 0.9 + + <<: *LargeCNNMLP + <<: *extra_hyperparameters + +r2d2_LargeCNNLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_LargeCNNLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + observation_resize_dim: 21 #56 + + dueling: True + noisy: False + n_step: 3 + + use_PER: True + PER_alpha: 0.9 + PER_beta: 0.6 + + replay_capacity: 5242880 # in terms of experience #1e6 + min_capacity: 4e5 #in terms of experiences... #1e4 + replay_period: 1 + + actor_models_update_steps_interval: 10 #considering only 1 actor's steps. + + discount: 0.999 + use_cuda: True + gradient_clip: 0.5 + batch_size: 128 + tau: 4.0e-4 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + + epsstart: 1.0 + epsend: 0.1 + epsdecay: 10000 + eps_greedy_alpha: 7.0 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: False + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + + burn_in: False + sequence_replay_unroll_length: 40 + sequence_replay_overlap_length: 10 + sequence_replay_burn_in_length: 0 + + sequence_replay_PER_eta: 0.9 + + <<: *LargeCNNMLP_SAD + <<: *extra_hyperparameters + + +experiment: + tasks: [{ + #'env-id': 'CoMaze-7x7-Dense-v0', + #'env-id': 'CoMaze-7x7-Dense-Level4-v0', + #'env-id': 'CoMaze-7x7-Dense-Level5-v0', + #'env-id': 'CoMaze-9x9-Dense-Level5-v0', + #'env-id': 'CoMaze-9x9-Dense-Level5-EasySecrets-v0', + #'env-id': 'CoMaze-9x9-Dense-Level5-HardSecrets-v0', + 'env-id': 'CoMaze-9x9-Dense-Level5-UniformSecrets-v0', + #'env-id': 'CoMaze-7x7-Dense-SinglePlayerReward-v0', + #'env-id': 'CoMaze-7x7-Dense-SinglePlayerReward-Level4-v0', + #'env-id': 'CoMaze-7x7-Dense-SinglePlayerReward-Level4-FixedSecretGoalRules-v0', + #'env-id': 'CoMaze-7x7-Dense-FixedActions-v0', + #'env-id': 'CoMaze-7x7-Dense-SinglePlayer-v0', + #'env-id': 'CoMaze-7x7-Dense-Easy-SinglePlayer-v0', + + #'run-id': 'serial/selfplay/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed1_venv64_r2d2_Obs21_EntropyReg0_WeightDecayReg0/', + #'run-id': 'serial/selfplay/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + # More actors? fails to learn: + # maybe the trajectories are being replaced too fast in the replay-buffer: + # the replay occurrence for each trajectory is reduced... 
+ # Reducing nbr actors? /2 should give 2*replay_ratio: does it have any effect? greater data-efficiency + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv32_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + # What about 16 actors then? /4 ==> 4*replay_ratio : great data-efficiency!!! + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv16_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + # What about 64 with fixed actions (multiplayer)? + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + + #VDN: FixedAction + # What about 64 with fixed actions (multiplayer)? + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/debugVDN23/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/debugVDN23-debug1VDNLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + #'run-id': 'serial/selfplay/400MaxSteps/NOSAD/debugVDNLoss+SumBeforeLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + # NOOP: + #'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/NOOP/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv100_r2d2_Obs56_EntropyReg0_WeightDecayReg0', + #'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/NOOP/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0', + #'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/NOOP/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0', + # OP: + #'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/OP/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0', + #'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/OP-VocabOnly/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0', + # + #'run-id': '100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-Vocab20/venv64', + # Test: + 'run-id': '100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-Vocab20/venv100', + + + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/400MaxSteps/NOSAD/debugNOVDNLoss+NOSumBeforeLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + # Reducing nbr actors? /2 should give 2*replay_ratio: does it have any effect? greater data-efficiency + #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/VDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv32_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + # What about 16 actors then? ? 
+ #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/VDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv16_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + # Observation space? Visibly better! + #'run-id': 'serial/selfplay/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/SeedRep1_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + #'run-id': 'serial/selfplay/testRecording/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed1_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + # Reducing nbr actor to increase experience replay time: /8 ==> replay-occurences*8? + #'run-id': 'serial/selfplay/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed1_venv8_r2d2_Obs56_EntropyReg0_WeightDecayReg0/', + + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # Is the overlap useful? no + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # Is a different sequence length better? + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # More exploration? + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # Bigger batch-size? + # Different explo: not sufficient, but at least there is maxtrainingreward remains high... + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + # Different archi with relu after rnns? not clear yet whether archi or relu... + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # The above arch is definitively learning as there are picks of in mean episode length occuring. + # Let us try to make it learn faster then: increasing lr and decreasing eps: + #'agent-id': '3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # What about only decreasing eps: the most important element so far!!!!! + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # normal archi but even lower eps: huge gains!!!!! 
+ #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + # Increasing nstep return? does not show specific improvement... + #'agent-id': '7step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # burn-in? very sharp progress, but requires more update (since some values are not used in the loss...) + # early catastrophic forgetting phenomenon...! + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_BURNIN_b128_L20_O10_B10_NOZeroInitSt_OnlineSt_StoreOnDone', + + # VDN: + # normal arch: + #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # with entropy weight 0.1: + # Stable learning ! but cannot learn to communicate... + #'agent-id': '3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # + agent-id observation: + #'agent-id': '3step_VDN_1m1Ent_aID_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + #'agent-id': '3step_VDN_1m3Ent_aID_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # Trying to learn to communicate by enlarging the seq len: + #'agent-id': '3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + # SAD only: + #'agent-id': '3step_SAD_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # seq 100: + #'agent-id': '3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # seq 20: + #'agent-id': '3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # arch 2: + #'agent-id': 
'3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # + res con : + #'agent-id': '3step_SAD_VDN_aID_NoEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + 'agent-id': '3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + #SAD+VDN: plateauing at 8 in singleplayerreward + #'agent-id': '3step_SAD_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + # + agent id: does not provide any help compared to SAD: + #'agent-id': '3step_SAD_VDN_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_ovrN_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # arch 4: + #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM4_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m15_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m15_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau1m5_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + + # New archi with extra input via fcn and more fc layers: higher pick than normal arch, but lr is too high visibly, the loss diverges... 
+ #'agent-id': '3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # with low eps and low lr, new arch: + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + #archi 3: very small eps, minimal fcn + extra input on fcn: + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM3_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + + # Normal archi but with a relu: + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNNLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # normal archi but with a relu + only 1 rnn layer: + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + # normal archi but with only 1 rnn layer: + #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + #'nbr_actor': 128, + 'nbr_actor': 100, + #'nbr_actor': 64, + #'nbr_actor': 32, + #'nbr_actor': 16, + #'nbr_actor': 8, + #'nbr_frame_skipping': 4, + #'nbr_frame_stacking': 4, + #'grayscale': True, + #'single_life_episode': True, #False, + #'nbr_max_random_steps': 30, + #'sad': False, + 'sad': True, + #'vdn': False, + 'vdn': True, + #"otherplay": True, + "otherplay": False, + 'clip_reward': False, + 'previous_reward_action': True, + #'observation_resize_dim': (21,21), #(56,56), + 'observation_resize_dim': 56, #(56,56), + # + 'reload': 'None', + # NOOP: + #"reload": "/home/kevin/debug_ray/r2d2_comaze_debug/CoMaze-9x9-Dense-Level5-UniformSecrets-v0/serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/NOOP/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone/TRAINING/PUBSUB/./3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone.agent", + # OP: do be deprecated... 
+ #'reload': '/home/kevin/debug_ray/r2d2_comaze_debug/CoMaze-9x9-Dense-Level5-UniformSecrets-v0/serial/PUBSUBDEBUG/selfplay/REPDebugVecEnvInitRZeroed/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/OP/debugPosList/debugSAD/debugVDNLoss+SumBeforeLoss-LossTermCombReg/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/TEST+DEBUGExpSumRLPolicy/NoMessEntropy+CorrectPosTargetEntropy+Masking+MeanEntReg/CorrectRuleBasedAgentInnerStateManag/3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone/GoalOrderingPred-NoDropout+RulesPrediction+BigArch/./3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone.agent', + }, + ] + #experiment_id: 'r2d2_comaze_debug2' + experiment_id: 'r2d2_comaze_data' + benchmarking_episodes: 100 + benchmarking_interval: 1.0e3 + benchmarking_record_episode_interval: 'None' #1.0e1 #1.0e20 + #benchmarking_record_episode_interval: 1.0e20 + #train_observation_budget: 1.0e7 + train_observation_budget: 3.3e6 + seed: 1 + +agents: + SAD_IQL_paper_1step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate100Steps_EPSgreedyAPEX1m0_4m1OVER3p4_gamma997_LargeCNNLSTM_GradClip5m1_r2p4Min1e4_alpha9m1_beta6m1_over2e4_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L2_O1_B0: + <<: *r2d2_LargeCNNLSTM_IQL_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + actor_models_update_steps_interval: 100 #considering only 1 actor's steps. + + batch_size: 128 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + discount: 0.997 + gradient_clip: 0.5 + # ...not specified in r2d2 paper but in Ape-X, + # and r2d2 paper says that missing hyper-param + # are the same as ape-X + + replay_capacity: 2e4 #163840 #2e13*20 #5242880 # in terms of experience #1e6 + min_capacity: 1e4 #in terms of experiences... #1e4 + + PER_beta_increase_interval: 2e4 + + double: True + dueling: True + noisy: False + + n_step: 1 + tau: 4.0e-4 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: False + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + + burn_in: False + sequence_replay_unroll_length: 2 + sequence_replay_overlap_length: 1 + sequence_replay_burn_in_length: 0 + + + epsstart: 1.0 + epsend: 0.4 + epsdecay: 30000 #1000000 + + # ape-X and r2d2 keep it constant over each actor + # with a different value eps_i = base_eps**(1+\alpha*i/nbr_actors) + # with base_eps=0.4 and \alpha = 7... + eps_greedy_alpha: 7.0 + + SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone: &SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone + <<: *r2d2_LargeCNNLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + actor_models_update_steps_interval: 1 #considering only 1 actor's steps. 
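+    # Hedged note on the vdn flag set just below (an interpretation based on
+    # Value-Decomposition Networks and the 'SumBeforeLoss' run comments above,
+    # not a guaranteed description of regym's loss code): the two players'
+    # chosen-action Q-values would be summed into a joint value before the
+    # n-step/R2D2 TD loss is applied, roughly:
+    #
+    #   q_joint = q_p0.gather(-1, a_p0) + q_p1.gather(-1, a_p1)
+    #   loss = td_loss(q_joint, bellman_target)   # names illustrative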
+ + vdn: True + vdn_nbr_players: 2 + + batch_size: 32 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + discount: 0.997 + gradient_clip: 5.0 + # ...not specified in r2d2 paper but in Ape-X, + # and r2d2 paper says that missing hyper-param + # are the same as ape-X + + replay_capacity: 5e4 #163840 #2e13*20 #5242880 # in terms of experience #1e6 + min_capacity: 2e4 #in terms of experiences... #1e4 + + PER_compute_initial_priority: False + PER_beta_increase_interval: None #2e5 + + double: True + dueling: True + noisy: False + n_step: 3 + tau: 4.0e-4 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: True + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + r2d2_bellman_target_SAD: False + + burn_in: False + sequence_replay_unroll_length: 100 + sequence_replay_overlap_length: 0 + sequence_replay_burn_in_length: 0 + + + epsstart: 1.0 + epsend: 0.05 + epsdecay: 30000 #1000000 + + # ape-X and r2d2 keep it constant over each actor + # with a different value eps_i = base_eps**(1+\alpha*i/nbr_actors) + # with base_eps=0.4 and \alpha = 7... + eps_greedy_alpha: 7.0 + + # New archi with relu on rnn: + #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # with higher lr and lower eps for adam opt: + #3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # only lower eps: + #3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # even lower eps: best version so far! BEST: + #3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #burn-in? + #3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_BURNIN_b128_L20_O10_B10_NOZeroInitSt_OnlineSt_StoreOnDone: + # increase nstep return? 
+ #7step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + + # VDN: + #3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # With agent id: + #3step_VDN_1m3Ent_aID_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + + #3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #3step_SAD_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # With agent id: + #3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # arch 2: + #3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # arch 2 + res. 
con.: + #3step_SAD_VDN_aID_NoEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # replay buffer 5p4 -> 1p5 + 3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # seq 100 + #3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + #3step_SAD_VDN_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_ovrN_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + + # arch 3 with very low eps: minimal arch + #3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM3_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + + # New architecture with more FC layers and extra inputs are feed to those (rather than rnn): + #3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # with low eps and low lr: + #3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # old archi with relu on rnn + #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNNLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # old archi with relu on rnn + only one rnn layer: + #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + # old archi with only on rnn layer: + #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + <<: *SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone + # DEBUG: + #min_capacity: 1e3 + + #weights_entropy_lambda: 0.0 + #weights_entropy_lambda: 0.1 + weights_entropy_lambda: 0.001 #01 + + #vdn: False + vdn: True + vdn_nbr_players: 2 + #sad: False + sad: True + + + learning_rate: 6.25e-5 + 
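+  # Worked example of the per-actor epsilon schedule quoted in the comment above (a sketch
+  # only: it assumes 4 actors and that `epsend` plays the role of base_eps in
+  # eps_i = base_eps**(1 + eps_greedy_alpha * i / nbr_actors)):
+  # with the epsend: 0.4 / eps_greedy_alpha: 2.0 overrides below, the four actors explore
+  # with roughly eps_0=0.40, eps_1=0.25, eps_2=0.16, eps_3=0.10, i.e. each actor keeps a
+  # distinct, constant exploration rate instead of sharing a single decayed epsilon.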
#adam_eps: 1.5e-5 + #learning_rate: 1.0e-3 + #adam_eps: 1.0e-8 + adam_eps: 1.0e-12 + #adam_eps: 1.0e-15 + + #replay_capacity: 5e4 #163840 #2e13*20 #5242880 # in terms of experience #1e6 + replay_capacity: 1e5 + min_capacity: 3e4 #in terms of experiences... #1e4 + + n_step: 3 + #n_step: 7 + + #tau: 4.0e-4 + #tau: 1.0e-5 + + #sequence_replay_overlap_length: 0 + #sequence_replay_overlap_length: 50 + + batch_size: 128 + + burn_in: False + #burn_in: True + + sequence_replay_unroll_length: 20 + sequence_replay_overlap_length: 10 + sequence_replay_burn_in_length: 0 + # #sequence_replay_burn_in_length: 10 + + # sequence_replay_unroll_length: 100 + # sequence_replay_overlap_length: 50 + # sequence_replay_burn_in_length: 0 + + epsend: 0.4 + eps_greedy_alpha: 2.0 + + # Architecture: + #critic_arch: 'LSTM-RNN' + #critic_arch_hidden_units: [512, 512] + #critic_arch_hidden_units: [512] + #use_relu_after_rnn: False + + # normal arch: + # critic_arch: 'MLP-LSTM-RNN' + # use_relu_after_rnn: True + # #use_relu_after_rnn: False + # critic_arch_feature_dim: 512 + # critic_arch_hidden_units: [512] + + # Arch2: + critic_arch: 'MLP-LSTM-RNN2' + use_relu_after_rnn: False #True + use_residual_connection: True + critic_arch_linear_hidden_units: [512, 256] + critic_arch_feature_dim: 128 + critic_arch_hidden_units: [128, 128] + + # Arch 3: + # critic_arch: 'MLP-LSTM-RNN2' + # use_relu_after_rnn: True + # critic_arch_linear_hidden_units: [128] + # critic_arch_feature_dim: 64 + # critic_arch_hidden_units: [64] + + #Arch 4: + # critic_arch: 'MLP-LSTM-RNN2' + # use_relu_after_rnn: True + # critic_arch_linear_hidden_units: [512, 256] + # critic_arch_hidden_units: [256] + # critic_arch_linear_post_hidden_units: [256] + # critic_arch_feature_dim: 128 + + extra_inputs_infos: { + 'previous_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'action_mask':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'communication_channel':{ + #shape: [4], + shape: [21], + #shape: [8], + #shape: [11,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'secret_goal_rule':{ + shape: [8,], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + # WITH AGENT_ID: + ######################## + 'agent_id':{ + shape: [2,], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + ######################## + ######################## + # WITH SAD: + ######################## + 'greedy_action':{ + #shape: [23], + shape: [108], + #shape: [43], + #shape: [58,], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + ######################## + 'legal_actions':{ + shape: ['task.action_dim',], + target_location: ['head', 'extra_inputs'] + }, + + } + + + + + diff --git a/benchmark/R2D2/CoMaze/cross_play_evaluation.py b/benchmark/R2D2/CoMaze/cross_play_evaluation.py new file mode 100644 index 00000000..6d681c37 --- /dev/null +++ b/benchmark/R2D2/CoMaze/cross_play_evaluation.py @@ -0,0 +1,314 @@ +from typing import List, Dict, Tuple, Union, Optional +import pickle +from functools import partial +from itertools import product +import os +import argparse +import logging +import yaml + +from tqdm import tqdm +import matplotlib +import matplotlib.pyplot as plt +import seaborn as sns +import logging +import numpy as np +import gym + +import comaze_gym +from comaze_gym.utils.wrappers import 
comaze_wrap
+
+import torch  # needed further below by load_agents() to deserialize agent checkpoints
+
+import regym
+from regym.environments import generate_task, EnvType
+from regym.util.wrappers import ClipRewardEnv, PreviousRewardActionInfoMultiAgentWrapper
+from regym.rl_algorithms import build_Random_Agent
+
+
+DESCRIPTION = \
+'''
+This script performs "cross-play", as specified in the paper
+"'Other-Play' for Zero-Shot Coordination.", by Hu et al.
+
+Cross-play is a "cheap proxy to evaluate whether a training method has
+potential for zero-shot coordination with human players".
+'''
+
+from regym.rl_loops.multiagent_loops.marl_loop import test_agent
+
+
+def cross_play(population: Dict[str, 'Agent'],
+               task: 'Task',
+               num_games_per_matchup: int,
+               num_matrices: int,
+               save_path: str=None,
+               show_progress: bool=True) \
+        -> Tuple[np.ndarray, np.ndarray, float, float]:
+    '''
+    Cross-play is a "cheap proxy to evaluate whether a training method has
+    potential for zero-shot coordination with human players". It plays every
+    agent in :param: population against each other (including themselves),
+    computing a matrix of average joint/pairwise performance for each pair of agents.
+    The average value in this cross-play matrix is known as the cross-play value.
+
+    A high average cross-play value indicates that agents in the population
+    perform well when matched against agents other than themselves
+    (i.e., they can coordinate with other agents).
+
+    For each pair of agents, pairwise performance is computed over :param:
+    num_games_per_matchup episodes. A total of :param: num_matrices cross-play
+    evaluation matrices are computed and averaged over.
+
+    :param population: Dict mapping agent names to the agents to be benchmarked
+    :param task: Task whose underlying environment will be used to test
+                 :param: population on
+    :param num_games_per_matchup: Number of episodes to be played by each pair
+                                  of agents in :param: population to obtain
+                                  their joint performance value.
+    :param num_matrices: Number of cross-play matrices to compute, useful to
+                         obtain variance estimations on cross-play values
+    :param save_path: If present, the mean resulting cross-play matrix will be
+                      pickled and dumped in a file under this path
+    :param show_progress: Whether to output progression bars to stdout to indicate
+                          cross-play computation progress
+    :returns: This function returns 4 values:
+              - A mean cross-play matrix, containing pairwise
+                agent performances computed over :param: num_games_per_matchup and
+                averaged over :param: num_matrices.
+ - A matrix containing the element-wise standard deviation of each + pairwise performance over :param: num_matrices + - Average cross-play value of mean cross-play matrix + - Average standard deviation over mean cross-play matrix + ''' + cross_play_matrices = compute_cross_play_matrices( + num_matrices, population, task, num_games_per_matchup, + show_progress) + + mean_cross_play_value = np.mean(cross_play_matrices) + std_cross_play_value = np.std(cross_play_matrices) + mean_cross_play_matrix = np.mean(cross_play_matrices, axis=0) + std_cross_play_matrix = np.std(cross_play_matrices, axis=0) + + if save_path: pickle.dump(mean_cross_play_matrix, open(save_path, 'wb')) + + return (mean_cross_play_matrix, std_cross_play_matrix, + mean_cross_play_value, std_cross_play_value) + + +def compute_cross_play_matrices(num_matrices: int, + population:Dict[str,regym.rl_algorithms.agents.agent.Agent], + task: 'Task', + num_games_per_matchup: int, + show_progress: bool) -> List[np.ndarray]: + ''' + Computes a list, of length :param: num_matrices, of cross-play matrices + on :param: task. Each cross-play matrix is of shape NxN, + where `n = len(population)`. For each cross-play matrix, each entry + is computed using :param: num_games_per_matchup number of episodes. + ''' + cross_play_matrices: List[np.ndarray] = [] + iterator = range(num_matrices) + if show_progress: + description = (f'Computing {num_matrices} cross-play matrices ' + f'for {len(population)} agents on task {task.name} ' + f'with {num_games_per_matchup} games per matchup.') + iterator = tqdm(iterator, desc=description) + iterator.set_description(description) + for s in iterator: + cross_play_matrix = compute_cross_play_evaluation_matrix( + population=population, + task=task, + num_games_per_matchup=num_games_per_matchup, + show_progress=show_progress + ) + cross_play_matrices.append(cross_play_matrix) + return cross_play_matrices + + +def compute_cross_play_evaluation_matrix(population:Dict[str,regym.rl_algorithms.agents.agent.Agent], + task: 'Task', + num_games_per_matchup: int, + show_progress: bool=True) -> np.ndarray: + ''' + Computes a cross-play matrix of shape NxN, where `n = len(population)`. + Entry (i,j) represents the average performance of agents + (population[i], population[j]) on :param: task over :param: num_games_per_matchup. + ''' + cross_play_matrix = np.zeros((len(population), len(population))) + agentIndices2Name = dict(zip(range(len(population)), population.keys())) + matchups_agent_indices = list(product(range(len(population)), range(len(population)))) + if show_progress: + description = ('Computing cross play matrix with ' + f'{len(matchups_agent_indices)} pairwise combinations. 
'
+                       f'with {num_games_per_matchup} num games per matchup')
+        matchups_agent_indices = tqdm(matchups_agent_indices, desc=description)
+    for i, j in matchups_agent_indices:
+        i_name = agentIndices2Name[i]
+        j_name = agentIndices2Name[j]
+        pairwise_performance = compute_pairwise_performance(
+            agent_vector=[population[i_name], population[j_name]],
+            task=task,
+            num_episodes=num_games_per_matchup
+        )
+        cross_play_matrix[i, j] = pairwise_performance
+    return cross_play_matrix
+
+
+def compute_pairwise_performance(agent_vector: List[regym.rl_algorithms.agents.agent.Agent],
+                                 task: 'Task',  # TODO: change upstream
+                                 num_episodes: int) -> float:
+    '''
+    Computes the average episode reward obtained by :param: agent_vector on
+    :param: task over :param: num_episodes.
+    '''
+    trajectory_metrics = test_agent(
+        env=task.env, agents=agent_vector, nbr_episode=num_episodes,
+        update_count=None, sum_writer=None, iteration=None, base_path=None,
+        requested_metrics=['mean_total_return']
+    )
+    return trajectory_metrics['mean_total_return']
+
+
+def check_input_validity(num_games_per_matchup, num_matrices):
+    if int(num_games_per_matchup) <= 0:
+        raise ValueError(f'CLI Argument "num_games_per_matchup" must be strictly positive (Given: {num_games_per_matchup})')
+    if int(num_matrices) <= 0:
+        raise ValueError(f'CLI Argument "num_matrices" must be strictly positive (Given: {num_matrices})')
+
+
+def plot_cross_play_matrix(
+    population:Dict[str,regym.rl_algorithms.agents.agent.Agent],
+    cross_play_matrix: Union[List, np.ndarray],
+    cross_play_value_variance: Optional[float]=None,
+    show_annotations: bool=True,
+    cbar: bool=True,
+    ax: Optional[plt.Axes] = None,
+    )-> plt.Axes:
+    '''
+    Plots the :param: cross_play_matrix on a heatmap.
+
+    Cells are coloured by their pairwise-performance value (viridis colormap).
+    If :param: ax is not present, one is created for you.
+
+    :param cross_play_matrix: Pairwise-performance matrix to plot. Values are expected to lie within [0, 1]
+    :param ax: Ax where the plot should be plotted. Optional
+    :param show_annotations: Flag determining whether values inside of the heatmap should be written
+    :returns: ax where the cross_play_matrix has been plotted
+    '''
+    if not ax: ax = plt.subplot(111)
+
+    sns.set(font_scale=2.5)
+    sns.heatmap(cross_play_matrix, annot=show_annotations, ax=ax, square=True,
+                cmap=sns.color_palette('viridis', 50),
+                cbar=cbar, cbar_kws={'label': 'Pairwise performance'})
+    ax.set_xlabel('Agent ID', size=20)
+    ax.set_ylabel('Agent ID', size=20)
+    ax.set_ylim(len(cross_play_matrix) + 0.2, -0.2)  # Required seaborn hack
+
+    plt.xticks(np.arange(len(population)), list(population.keys()), rotation=45)
+    plt.yticks(np.arange(len(population)), list(population.keys()), rotation=45)
+
+    title = f'Cross-play matrix. 
Cross-play value: {np.mean(cross_play_matrix)}' + if cross_play_value_variance: title = '{} +- {:.2}'.format(title, cross_play_value_variance) + + ax.set_title(title) + return ax + + +def create_task_for_r2d2(task_config): + def comaze_r2d2_wrap( + env, + clip_reward=False, + previous_reward_action=True + ): + env = comaze_wrap(env) + + if clip_reward: + env = ClipRewardEnv(env) + + if previous_reward_action: + env = PreviousRewardActionInfoMultiAgentWrapper(env=env) + return env + + pixel_wrapping_fn = partial( + comaze_r2d2_wrap, + clip_reward=task_config['clip_reward'], + previous_reward_action=task_config.get('previous_reward_action', False) + ) + test_pixel_wrapping_fn = pixel_wrapping_fn + task = generate_task(task_config['env-id'], + env_type=EnvType.MULTIAGENT_SIMULTANEOUS_ACTION, + nbr_parallel_env=task_config['nbr_actor'], + wrapping_fn=pixel_wrapping_fn, + test_wrapping_fn=test_pixel_wrapping_fn, + gathering=False + ) + return task + +def load_agents(agents_dict:Dict[str,str])->Dict[str,regym.rl_algorithms.agents.agent.Agent]: + ''' + For rule-based agents, the paths are replaced by int to be used as seeds. + Player indices have to be set again upon matchup pairings. + ''' + for agent_name in agents_dict: + if 'RB' in agent_name: + import importlib + comaze_gym = importlib.import_module("regym.environments.envs.CoMaze.comaze-gym.comaze_gym") + from comaze_gym import build_WrappedActionOnlyRuleBasedAgent, build_WrappedCommunicatingRuleBasedAgent + build_fn = build_WrappedActionOnlyRuleBasedAgent + if 'comm' in agent_name: + build_fn = build_WrappedCommunicatingRuleBasedAgent + agents_dict[agent_name] = build_fn( + player_idx=1, + action_space_dim=task.action_dim, + seed=int(agents_dict[agent_name]), + ) + else: + agents_dict[agent_name] = torch.load(agents_dict[agent_name]) + agents_dict[agent_name].training = False + return agents_dict + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=DESCRIPTION) + parser.add_argument('--config', required=True, help='Path to file containing cross-play parameters') + args = parser.parse_args() + + cross_play_config = yaml.load(open(args.config)) + task_config = cross_play_config['task'] + + ''' + if not os.path.isdir(cross_play_config['population_path']): + raise ValueError(f"CLI Argument 'population_path' does not point to an existing directory (Given: {cross_play_config['population_path']})") + ''' + + task = create_task_for_r2d2(task_config) + + loaded_population = load_agents(cross_play_config['agents']) + + # Making sure that parameters for evaluation are sound + check_input_validity( + int(cross_play_config['num_games_per_matchup']), + int(cross_play_config['num_matrices']) + ) + + (mean_cross_play_matrix, std_cross_play_matrix, + mean_cross_play_value, std_cross_play_value) = cross_play( + loaded_population, + task, + cross_play_config['num_games_per_matchup'], + cross_play_config['num_matrices'], + save_path=cross_play_config['save_path'], + show_progress=True, + ) + + matplotlib.use('TkAgg') + plot_cross_play_matrix( + population=loaded_population, + cross_play_matrix=mean_cross_play_matrix, + cross_play_value_variance=std_cross_play_value + ) + plt.show() + + import ipdb; ipdb.set_trace() diff --git a/benchmark/R2D2/CoMaze/rl_action_policy.py b/benchmark/R2D2/CoMaze/rl_action_policy.py new file mode 100644 index 00000000..aef87973 --- /dev/null +++ b/benchmark/R2D2/CoMaze/rl_action_policy.py @@ -0,0 +1,83 @@ +from typing import List, Dict, Optional + +import torch + +from regym.rl_algorithms.agents.agent 
import Agent +from comaze_gym.metrics import ActionPolicy + +class RLActionPolicy(ActionPolicy): + def __init__( + self, + agent:Agent, + combined_action_space:bool = False): + """ + + :param combined_action_space: + If True, then the message and actions performed + by the current agent are treated as belonging to + the same OpenAI's Discrete action space of size + n= #messages * #actions. + Else, n = # actions : directional actions. + """ + super(RLActionPolicy, self).__init__( + model=agent + ) + self.combined_action_space = combined_action_space + + def clone(self, training=False): + return RLActionPolicy( + agent=self.model.clone(training=training), + combined_action_space=self.combined_action_space + ) + + def reset(self, batch_size:int, training:Optional[bool]=False): + self.model.set_nbr_actor(batch_size, vdn=False, training=training) + + def save_inner_state(self): + self.saved_inner_state = self.model.get_rnn_states() + + def restore_inner_state(self): + self.model.set_rnn_states(self.saved_inner_state) + + def get_nbr_actor(self): + return self.model.get_nbr_actor() + + def forward(self, x:object): + """ + :param x: + Object representing the observation of the current agent. + e.g.: the object can be a kwargs argument containing + expected argument to the model. + + Here, x:Dict containing the keys: + -'state': torch.Tensor containing the environment state. + -'infos': Dict containing the entry 'abstract_repr' that is + actually used by the :param model:RuleBasedAgentWrapper. + + :return log_a: + torch.Tensor of logits over actions + (as a Discrete OpenAI's action space). + + Here, depending on :attr combined_action_space:, + we either marginalized over possible messages or not. + """ + + #log_p_a = self.model.take_action(**x) + pred_dict = self.model.query_action(**x) + log_p_a = pred_dict['log_a'] + # batch_size x action_space_dim + + batch_size = log_p_a.shape[0] + action_space_dim = log_p_a.shape[-1] + + if self.combined_action_space: + return log_p_a + + # Otherwise, we sum over the messages dimension (excluding the NOOP action): + self.vocab_size = (action_space_dim-1)//5 + # There are 5 possible directional actions: + #log_p_a = log_p_a[...,:-1].reshape((batch_size, 5, self.vocab_size)).sum(dim=-1).log_softmax(dim=1) + log_p_a = log_p_a[...,:-1].reshape((batch_size, 5, self.vocab_size)).exp().sum(dim=-1).log_softmax(dim=1) + # batch_size x 5 + + return log_p_a diff --git a/benchmark/R2D2/CoMaze/rl_hiddenstate_policy.py b/benchmark/R2D2/CoMaze/rl_hiddenstate_policy.py new file mode 100644 index 00000000..abf9cfeb --- /dev/null +++ b/benchmark/R2D2/CoMaze/rl_hiddenstate_policy.py @@ -0,0 +1,162 @@ +from typing import List, Dict, Optional + +import torch + +from regym.rl_algorithms.agents.agent import Agent +from comaze_gym.metrics import MessagePolicy + +from regym.rl_algorithms.utils import copy_hdict + +def extract_subtrees( + in_dict: Dict, + node_id: str): + ''' + Extracts a copy of subtree whose root is named :param node_id: from :param in_dict:. 
+ ''' + queue = [in_dict] + pointer = None + + subtrees = [] + + while len(queue): + pointer = queue.pop(0) + if not isinstance(pointer, dict): continue + for k in pointer.keys(): + if node_id==k: + subtrees.append( + copy_hdict(pointer[k]) + ) + else: + queue.append(pointer[k]) + + return subtrees + +class RLHiddenStatePolicy(MessagePolicy): + def __init__( + self, + agent:Agent, + augmented:bool=False): + """ + + """ + super(RLHiddenStatePolicy, self).__init__( + model=agent, + ) + self.player_idx = 0 + self.augmented = augmented + + self.secretgoalStr2id = {"RED":0, "YELLOW":1, "BLUE":2, "GREEN":3} + + + def get_hiddens(self, info=None, from_pred=None): + if from_pred is None: + rnn_states = self.model.get_rnn_states() + else: + rnn_states = from_pred['next_rnn_states'] + # Extract 'hidden''s list: + hiddens = extract_subtrees(in_dict=rnn_states, node_id='hidden') + # List[List[Tensor]] + + vdn = self.model.kwargs.get('vdn', False) + vdn_nbr_players = self.model.kwargs.get('vdn_nbr_players', 2) + + nbr_rnn_modules = len(hiddens[0]) + batch_size = hiddens[0][0].shape[0] + + mult = 0 + if vdn and batch_size!=1: + batch_size = batch_size // vdn_nbr_players + mult = self.player_idx + + hiddens = torch.stack( + [ + torch.cat( + [hiddens[0][part_id][mult*batch_size+actor_id].reshape(-1) for part_id in range(nbr_rnn_modules)], + dim=0, + ) + for actor_id in range(batch_size) + ], + dim=0, + ) + # batch_size x nbr_parts*hidden_dims + if self.augmented: + extras = [] + for actor_id in range(batch_size): + if info is not None: + abs_repr = info[actor_id]['abstract_repr'] + + reached_goals_str = abs_repr['reached_goals'] + rg_hs = torch.zeros((4*3)) + startidx = 0 + for goal_str in reached_goals_str: + rg_hs[startidx+self.secretgoalStr2id[goal_str]] = 1.0 + startidx += 4 + + player_id = abs_repr['player_id'] + secretGoalRule = abs_repr['secretGoalRule'][player_id] + sgr_hs = torch.zeros((4*2)) + startidx = 0 + sgr_hs[ startidx+self.secretgoalStr2id[secretGoalRule.earlierGoal.color] ] = 1.0 + startidx += 4 + sgr_hs[ startidx+self.secretgoalStr2id[secretGoalRule.laterGoal.color] ] = 1.0 + + extra = torch.cat([rg_hs, sgr_hs], dim=0).reshape((1,-1)) + else: + extra = torch.zeros((1,4*3+4*2)) + extras.append(extra) + extras = torch.cat(extras, dim=0).to(hiddens.device) + hiddens = torch.cat([hiddens, extras], dim=1) + # batch_size x (nbr_parts*hidden_dims + extra_dim) + + return hiddens + + def get_hidden_state_dim(self): + hiddens = self.get_hiddens() + return hiddens.shape[-1] + + def clone(self, training=False): + return RLHiddenStatePolicy( + agent=self.model.clone(training=training), + augmented=self.augmented, + ) + + def reset(self, batch_size:int, training:Optional[bool]=False): + self.model.set_nbr_actor(batch_size, vdn=False, training=training) + + def save_inner_state(self): + self.saved_inner_state = self.model.get_rnn_states() + + def restore_inner_state(self): + self.model.set_rnn_states(self.saved_inner_state) + + def get_nbr_actor(self): + return self.model.get_nbr_actor() + + def forward(self, x:object): + """ + :param x: + Object representing the observation of the current agent. + e.g.: the object can be a kwargs argument containing + expected argument to the model. + + Here, x:Dict containing the keys: + -'state': torch.Tensor containing the environment state. + -'infos': Dict containing the entry 'abstract_repr' that is + actually used by the :param model:RuleBasedAgentWrapper. + + :return log_m: + torch.Tensor of logits over messages + (as a Discrete OpenAI's action space). 
+ + Here, depending on :attr combined_action_space:, + we either marginalized over possible actions or not. + """ + + #log_p_a = self.model.query_action(**x) + pred_dict = self.model.query_action(**x) + # batch_size x action_space_dim + + hiddens = self.get_hiddens(info=x.get('infos', None), from_pred=pred_dict) + # batch_size x nbr_parts*hidden_dims + extra_dim if self.augmented + + return hiddens diff --git a/benchmark/R2D2/CoMaze/rl_message_policy.py b/benchmark/R2D2/CoMaze/rl_message_policy.py new file mode 100644 index 00000000..0d14abd0 --- /dev/null +++ b/benchmark/R2D2/CoMaze/rl_message_policy.py @@ -0,0 +1,82 @@ +from typing import List, Dict, Optional + +import torch + +from regym.rl_algorithms.agents.agent import Agent +from comaze_gym.metrics import MessagePolicy + +class RLMessagePolicy(MessagePolicy): + def __init__( + self, + agent:Agent, + combined_action_space:bool = False): + """ + + :param combined_action_space: + If True, then the message and actions performed + by the current agent are treated as belonging to + the same OpenAI's Discrete action space of size + n= #messages * #actions. + Else, n = # actions : directional actions. + """ + super(RLMessagePolicy, self).__init__( + model=agent + ) + self.combined_action_space = combined_action_space + + def clone(self, training=False): + return RLMessagePolicy( + agent=self.model.clone(training=training), + combined_action_space=self.combined_action_space + ) + + def reset(self, batch_size:int, training:Optional[bool]=False): + self.model.set_nbr_actor(batch_size, vdn=False, training=training) + + def save_inner_state(self): + self.saved_inner_state = self.model.get_rnn_states() + + def restore_inner_state(self): + self.model.set_rnn_states(self.saved_inner_state) + + def get_nbr_actor(self): + return self.model.get_nbr_actor() + + def forward(self, x:object): + """ + :param x: + Object representing the observation of the current agent. + e.g.: the object can be a kwargs argument containing + expected argument to the model. + + Here, x:Dict containing the keys: + -'state': torch.Tensor containing the environment state. + -'infos': Dict containing the entry 'abstract_repr' that is + actually used by the :param model:RuleBasedAgentWrapper. + + :return log_m: + torch.Tensor of logits over messages + (as a Discrete OpenAI's action space). + + Here, depending on :attr combined_action_space:, + we either marginalized over possible actions or not. + """ + + #log_p_a = self.model.take_action(**x) + pred_dict = self.model.query_action(**x) + log_p_a = pred_dict['log_a'] + # batch_size x action_space_dim + + batch_size = log_p_a.shape[0] + action_space_dim = log_p_a.shape[-1] + + if self.combined_action_space: + return log_p_a + + # Otherwise, we sum over the messages dimension (excluding the NOOP action): + self.vocab_size = (action_space_dim-1)//5 + # There are 5 possible directional actions: + log_p_m = log_p_a[...,:-1].reshape((batch_size, 5, self.vocab_size)).exp().sum(dim=1).log_softmax(dim=1) + # batch_size x vocab_size + + return log_p_m diff --git a/benchmark/R2D2/CoMaze/run.sh b/benchmark/R2D2/CoMaze/run.sh new file mode 100644 index 00000000..95055e86 --- /dev/null +++ b/benchmark/R2D2/CoMaze/run.sh @@ -0,0 +1,9 @@ +# There are 3 to 4 import_ipdb that will pop out just to sanity check that the argument are taken into account. +# Just enter 'c' to continue... +# If there is a break after the environments have been created, then it is an actual issue (unless have forgotten to remove a debugging set_trace(), sorry...) 
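+
+# Cross-play evaluation of saved checkpoints lives in this directory as well; it only needs a
+# YAML config passed via --config (the filename below is an illustrative placeholder):
+#python ./cross_play_evaluation.py --config ./cross_play_config.yaml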
+ +# With communicating rule-based agent: +python -m ipdb -c c ./benchmark_selfplay_comaze.py comaze_communicating_rule_based_benchmark_config.yaml --pubsub --communicating_rule_based --use_ms_cic + +# With action-only rule-based agent: +#python -m ipdb -c c ./benchmark_selfplay_comaze.py comaze_communicating_rule_based_benchmark_config.yaml --pubsub --rule_based --use_ms_cic \ No newline at end of file diff --git a/benchmark/R2D2/SymbolicBehaviourBenchmark/benchmark_selfplay_s2b.py b/benchmark/R2D2/SymbolicBehaviourBenchmark/benchmark_selfplay_s2b.py new file mode 100644 index 00000000..7912561c --- /dev/null +++ b/benchmark/R2D2/SymbolicBehaviourBenchmark/benchmark_selfplay_s2b.py @@ -0,0 +1,875 @@ +from typing import Dict, Any, Optional, List, Callable + +import torch +import sklearn + +import logging +import yaml +import os +import sys +from typing import Dict + +import torch.multiprocessing + +from tensorboardX import SummaryWriter +from tqdm import tqdm +from functools import partial + + +import torch +import numpy as np +import random + +import regym +from regym.environments import generate_task, EnvType +from regym.rl_loops.multiagent_loops import marl_loop +from regym.util.experiment_parsing import initialize_agents + +import symbolic_behaviour_benchmark +from symbolic_behaviour_benchmark.utils.wrappers import s2b_wrap +from symbolic_behaviour_benchmark.rule_based_agents import build_WrappedPositionallyDisentangledSpeakerAgent + +from regym.util.wrappers import ClipRewardEnv, PreviousRewardActionInfoMultiAgentWrapper + +import ray + +from regym.modules import EnvironmentModule, CurrentAgentsModule +from regym.modules import MARLEnvironmentModule, RLAgentModule + +from regym.modules import ReconstructionFromHiddenStateModule, MultiReconstructionFromHiddenStateModule +from rl_hiddenstate_policy import RLHiddenStatePolicy + +from regym.pubsub_manager import PubSubManager + +def make_rl_pubsubmanager( + agents, + config, + logger=None, + load_path=None, + save_path=None, + speaker_rec=False, + listener_rec=False, + listener_comm_rec=False, + ): + """ + Create a PubSubManager. + :param agents: List of Agents to use in the rl loop. + :param config: Dict that specifies all the important hyperparameters of the network. + - "task" + - "sad" + - "vdn" + - "otherplay" + - "max_obs_count" + - "sum_writer": str where to save the summary... 
+ + """ + pipelined = False + use_multi_rec = False + if len(sys.argv) > 2: + pipelined = any(['pipelined' in arg for arg in sys.argv]) + if len(sys.argv) >2: + use_multi_rec = any(['multi_rec' in arg for arg in sys.argv]) + + modules = config.pop("modules") + + cam_id = "current_agents" + modules[cam_id] = CurrentAgentsModule( + id=cam_id, + agents=agents + ) + + if pipelined: + envm_id = "MARLEnvironmentModule_0" + envm_input_stream_ids = { + "iteration":"signals:iteration", + "current_agents":f"modules:{cam_id}:ref", + } + + rlam_ids = [ + f"rl_agent_{rlaidx}" + for rlaidx in range(len(agents)) + ] + for aidx, (rlam_id, agent) in enumerate(zip(rlam_ids, agents)): + rlam_config = { + 'agent': agent, + 'actions_stream_id':f"modules:{envm_id}:player_{aidx}:actions", + } + + envm_input_stream_ids[f'player_{aidx}'] = f"modules:{rlam_id}:ref" + + rlam_input_stream_ids = { + "logs_dict":"logs_dict", + "losses_dict":"losses_dict", + "epoch":"signals:epoch", + "mode":"signals:mode", + + "reset_actors":f"modules:{envm_id}:reset_actors", + + "observations":f"modules:{envm_id}:ref:player_{aidx}:observations", + "infos":f"modules:{envm_id}:ref:player_{aidx}:infos", + "actions":f"modules:{envm_id}:ref:player_{aidx}:actions", + "succ_observations":f"modules:{envm_id}:ref:player_{aidx}:succ_observations", + "succ_infos":f"modules:{envm_id}:ref:player_{aidx}:succ_infos", + "rewards":f"modules:{envm_id}:ref:player_{aidx}:rewards", + "dones":f"modules:{envm_id}:ref:player_{aidx}:dones", + } + modules[rlam_id] = RLAgentModule( + id=rlam_id, + config=rlam_config, + input_stream_ids=rlam_input_stream_ids, + ) + + modules[envm_id] = MARLEnvironmentModule( + id=envm_id, + config=config, + input_stream_ids=envm_input_stream_ids + ) + else: + envm_id = "EnvironmentModule_0" + envm_input_stream_ids = { + #"logger":"modules:logger:ref", + #"logs_dict":"logs_dict", + + "iteration":"signals:iteration", + + "current_agents":f"modules:{cam_id}:ref", + } + modules[envm_id] = EnvironmentModule( + id=envm_id, + config=config, + input_stream_ids=envm_input_stream_ids + ) + + def build_signal_to_reconstruct_from_trajectory_fn( + traj: List[List[Any]], + player_id:int, + ) -> List[torch.Tensor]: + labels = [] + for exp in traj[player_id]: + labels.append(torch.from_numpy((1+exp[0])*0.5)) + return labels + def build_comm_to_reconstruct_from_trajectory_fn( + traj: List[List[Any]], + player_id:int, + ) -> List[torch.Tensor]: + likelihoods = [] + previous_com = None + for exp in traj[player_id]: + current_com = torch.from_numpy(exp[-1]['communication_channel']) + if previous_com is None: previous_com = current_com + + target_pred = torch.cat([previous_com, current_com], dim=-1) + likelihoods.append(target_pred) + + previous_com = current_com + return likelihoods + + + rec_p0_id = "Reconstruction_player0" + rec_p0_input_stream_ids = { + "logs_dict":"logs_dict", + "losses_dict":"losses_dict", + "epoch":"signals:epoch", + "mode":"signals:mode", + + "trajectories":f"modules:{envm_id}:trajectories", + "filtering_signal":f"modules:{envm_id}:new_trajectories_published", + + "current_agents":"modules:current_agents:ref", + } + + speaker_rec_biasing = False + if len(sys.argv) > 2: + speaker_rec_biasing = any(['speaker_rec_biasing' in arg for arg in sys.argv[2:]]) + + if speaker_rec_biasing: + print("WARNING: Biasing for Speaker's Reconstruction.") + else: + print("WARNING: NOT biasing Speaker's Reconstruction.") + + rec_p0_config = { + "biasing":speaker_rec_biasing, + "nbr_players":len(agents), + "player_id":0, + 'use_cuda':True, + 
"signal_to_reconstruct_dim": 4*3, + "hiddenstate_policy": RLHiddenStatePolicy(agent=agents[0]), + "build_signal_to_reconstruct_from_trajectory_fn": build_signal_to_reconstruct_from_trajectory_fn, + } + + if speaker_rec and not(use_multi_rec): + modules[rec_p0_id] = ReconstructionFromHiddenStateModule( + id=rec_p0_id, + config=rec_p0_config, + input_stream_ids=rec_p0_input_stream_ids, + ) + + rec_p1_id = "Reconstruction_player1" + rec_p1_input_stream_ids = { + "logs_dict":"logs_dict", + "losses_dict":"losses_dict", + "epoch":"signals:epoch", + "mode":"signals:mode", + + "trajectories":f"modules:{envm_id}:trajectories", + "filtering_signal":f"modules:{envm_id}:new_trajectories_published", + + "current_agents":"modules:current_agents:ref", + } + + listener_rec_biasing = False + if len(sys.argv) > 2: + listener_rec_biasing = any(['listener_rec_biasing' in arg for arg in sys.argv[2:]]) + + if listener_rec_biasing: + print("WARNING: Biasing for Listener's Reconstruction.") + else: + print("WARNING: NOT biasing Listener's Reconstruction.") + + rec_p1_config = { + "biasing":listener_rec_biasing, + "nbr_players":len(agents), + "player_id":1, + 'use_cuda':True, + "signal_to_reconstruct_dim": 4*3, + "hiddenstate_policy": RLHiddenStatePolicy(agent=agents[-1]), + "build_signal_to_reconstruct_from_trajectory_fn": build_signal_to_reconstruct_from_trajectory_fn, + } + + if listener_rec and not(use_multi_rec): + modules[rec_p1_id] = ReconstructionFromHiddenStateModule( + id=rec_p1_id, + config=rec_p1_config, + input_stream_ids=rec_p1_input_stream_ids, + ) + + comm_rec_p1_id = "CommReconstruction_player1" + comm_rec_p1_input_stream_ids = { + "logs_dict":"logs_dict", + "losses_dict":"losses_dict", + "epoch":"signals:epoch", + "mode":"signals:mode", + + "trajectories":f"modules:{envm_id}:trajectories", + "filtering_signal":f"modules:{envm_id}:new_trajectories_published", + + "current_agents":"modules:current_agents:ref", + } + + listener_comm_rec_biasing = False + if len(sys.argv) > 2: + listener_comm_rec_biasing = any(['listener_comm_rec_biasing' in arg for arg in sys.argv[2:]]) + + if listener_comm_rec_biasing: + print("WARNING: Biasing for Listener's Communication Reconstruction.") + else: + print("WARNING: NOT biasing Listener's Communication Reconstruction.") + + def comm_accuracy_pre_process_fn( + pred:torch.Tensor, + target:torch.Tensor, + ): + # Reshape into (sentence_length, vocab_size): + target = target.reshape(-1, 7) + pred = pred.reshape(-1, 7) + + # Retrieve target idx: + target_idx = target.max(dim=-1, keepdim=True)[1] + + pred_distr = pred.softmax(dim=-1) + + #acc = ((pred-0.1<=target).float()+(pred+0.1>=target).float())==2).gather( + ''' + acc = (pred_distr>=0.5).float().gather( + dim=-1, + index=target_idx, + ) + ''' + pred_idx = pred.max(dim=-1, keepdim=True)[1] + acc = (target_idx == pred_idx).float().reshape(1,-1) + # (1, sentence_length) + + return acc + + comm_rec_p1_config = { + "biasing":listener_comm_rec_biasing, + "nbr_players":len(agents), + "player_id":1, + 'use_cuda':True, + "signal_to_reconstruct_dim": 7*2, + "hiddenstate_policy": RLHiddenStatePolicy(agent=agents[-1]), + "build_signal_to_reconstruct_from_trajectory_fn": build_comm_to_reconstruct_from_trajectory_fn, + "accuracy_pre_process_fn":comm_accuracy_pre_process_fn, + } + + if listener_comm_rec and not(use_multi_rec): + modules[comm_rec_p1_id] = ReconstructionFromHiddenStateModule( + id=comm_rec_p1_id, + config=comm_rec_p1_config, + input_stream_ids=comm_rec_p1_input_stream_ids, + ) + + if use_multi_rec: + if speaker_rec: 
+ raise NotImplementedError + multi_rec_p1_id = 'multi_rec_p1' + rec_dicts = {} + rec_p1_config = { + "signal_to_reconstruct_dim":4*3, + "build_signal_to_reconstruct_from_trajectory_fn":build_signal_to_reconstruct_from_trajectory_fn, + } + rec_dicts[rec_p1_id] = rec_p1_config + comm_rec_p1_config = { + "signal_to_reconstruct_dim": 7*2, + "build_signal_to_reconstruct_from_trajectory_fn": build_comm_to_reconstruct_from_trajectory_fn, + "accuracy_pre_process_fn":comm_accuracy_pre_process_fn, + } + rec_dicts[comm_rec_p1_id] = comm_rec_p1_config + modules[multi_rec_p1_id] = MultiReconstructionFromHiddenStateModule( + id=multi_rec_p1_id, + config={ + "biasing":listener_rec_biasing or listener_comm_rec_biasing, + "nbr_players":len(agents), + "player_id":1, + "use_cuda":True, + "hiddenstate_policy":RLHiddenStatePolicy(agent=agents[-1]), + "rec_dicts": rec_dicts, + }, + input_stream_ids=rec_p1_input_stream_ids, + ) + + + pipelines = config.pop("pipelines") + + pipelines["rl_loop_0"] = [ + envm_id, + ] + if pipelined: + for rlam_id in rlam_ids: + pipelines['rl_loop_0'].append(rlam_id) + if use_multi_rec and (listener_rec or listener_comm_rec): + pipelines["rl_loop_0"].append(multi_rec_p1_id) + else: + if speaker_rec: + pipelines["rl_loop_0"].append(rec_p0_id) + if listener_rec: + pipelines["rl_loop_0"].append(rec_p1_id) + if listener_comm_rec: + pipelines["rl_loop_0"].append(comm_rec_p1_id) + + + + optim_id = "global_optim" + optim_config = { + "modules":modules, + "learning_rate":3e-4, + "optimizer_type":'adam', + "with_gradient_clip":False, + "adam_eps":1e-16, + } + + optim_module = regym.modules.build_OptimizationModule( + id=optim_id, + config=optim_config, + ) + modules[optim_id] = optim_module + + logger_id = "per_epoch_logger" + logger_module = regym.modules.build_PerEpochLoggerModule(id=logger_id) + modules[logger_id] = logger_module + + pipelines[optim_id] = [] + pipelines[optim_id].append(optim_id) + pipelines[optim_id].append(logger_id) + + pbm = PubSubManager( + config=config, + modules=modules, + pipelines=pipelines, + logger=logger, + load_path=load_path, + save_path=save_path, + ) + + return pbm + + +def s2b_r2d2_wrap( + env, + clip_reward=False, + previous_reward_action=True, + otherplay=False + ): + env = s2b_wrap( + env, + combined_actions=True, + dict_obs_space=False, + ) + + if clip_reward: + env = ClipRewardEnv(env) + + if previous_reward_action: + env = PreviousRewardActionInfoMultiAgentWrapper(env=env) + + return env + + +def check_path_for_agent(filepath): + #filepath = os.path.join(path,filename) + agent = None + offset_episode_count = 0 + if os.path.isfile(filepath): + print('==> loading checkpoint {}'.format(filepath)) + agent = torch.load(filepath) + offset_episode_count = agent.episode_count + #setattr(agent, 'episode_count', offset_episode_count) + print('==> loaded checkpoint {}'.format(filepath)) + return agent, offset_episode_count + + +def train_and_evaluate(agents: List[object], + task: object, + sum_writer: object, + base_path: str, + offset_episode_count: int = 0, + nbr_pretraining_steps: int = 0, + nbr_max_observations: int = 1e7, + test_obs_interval: int = 1e4, + test_nbr_episode: int = 10, + benchmarking_record_episode_interval: int = None, + render_mode="rgb_array", + step_hooks=[], + sad=False, + vdn=False, + otherplay=False, + speaker_rec=False, + listener_rec=False, + listener_comm_rec=False, + ): + pubsub = False + if len(sys.argv) > 2: + pubsub = any(['pubsub' in arg for arg in sys.argv]) + + if pubsub: + config = { + "modules": {}, + "pipelines": {}, + 
} + + config['training'] = True + config['env_configs'] = None + config['task'] = task + + sum_writer_path = os.path.join(sum_writer, 'actor.log') + sum_writer = config['sum_writer'] = SummaryWriter(sum_writer_path, flush_secs=1) + + config['base_path'] = base_path + config['offset_episode_count'] = offset_episode_count + config['nbr_pretraining_steps'] = nbr_pretraining_steps + config['max_obs_count'] = nbr_max_observations + config['test_obs_interval'] = test_obs_interval + config['test_nbr_episode'] = test_nbr_episode + config['benchmarking_record_episode_interval'] = benchmarking_record_episode_interval + config['render_mode'] = render_mode + config['step_hooks'] = step_hooks + config['save_traj_length_divider'] =1 + config['sad'] = sad + config['vdn'] = vdn + config['otherplay'] = otherplay + config['nbr_players'] = 2 + pubsubmanager = make_rl_pubsubmanager( + agents=agents, + config=config, + speaker_rec=speaker_rec, + listener_rec=listener_rec, + listener_comm_rec=listener_comm_rec, + logger=sum_writer, + ) + + pubsubmanager.train() + + trained_agents = agents + else: + asynch = False + if len(sys.argv) > 2: + asynch = any(['async' in arg for arg in sys.argv]) + + if asynch: + trained_agent = marl_loop.async_gather_experience_parallel1( + #trained_agents = marl_loop.async_gather_experience_parallel( + task, + agents, + training=True, + #nbr_pretraining_steps=nbr_pretraining_steps, + max_obs_count=nbr_max_observations, + env_configs=None, + sum_writer=sum_writer, + base_path=base_path, + test_obs_interval=test_obs_interval, + test_nbr_episode=test_nbr_episode, + benchmarking_record_episode_interval=benchmarking_record_episode_interval, + save_traj_length_divider=1, + render_mode=render_mode, + step_hooks=step_hooks, + sad=sad, + vdn=vdn, + otherplay=otherplay, + ) + else: + trained_agents = marl_loop.gather_experience_parallel( + task, + agents, + training=True, + #nbr_pretraining_steps=nbr_pretraining_steps, + max_obs_count=nbr_max_observations, + env_configs=None, + sum_writer=sum_writer, + base_path=base_path, + test_obs_interval=test_obs_interval, + test_nbr_episode=test_nbr_episode, + benchmarking_record_episode_interval=benchmarking_record_episode_interval, + save_traj_length_divider=1, + render_mode=render_mode, + step_hooks=step_hooks, + sad=sad, + vdn=vdn, + otherplay=otherplay + ) + + save_replay_buffer = False + if len(sys.argv) > 2: + save_replay_buffer = any(['save_replay_buffer' in arg for arg in sys.argv]) + + for agent in trained_agents: + agent.save(with_replay_buffer=save_replay_buffer) + print(f"Agent saved at: {agent.save_path}") + + task.env.close() + task.test_env.close() + + return trained_agents + + +def training_process(agent_config: Dict, + task_config: Dict, + benchmarking_interval: int = 1e4, + benchmarking_episodes: int = 10, + benchmarking_record_episode_interval: int = None, + train_observation_budget: int = 1e7, + base_path: str = './', + seed: int = 0): + + test_only = False + path_suffix = None + pubsub = False + speaker_rec = False + listener_rec = False + listener_comm_rec = False + speaker_rec_biasing = False + listener_rec_biasing = False + listener_comm_rec_biasing = False + + use_rule_based_agent = False + use_speaker_rule_based_agent = False + if len(sys.argv) > 2: + pubsub = any(['pubsub' in arg for arg in sys.argv]) + test_only = any(['test_only' in arg for arg in sys.argv]) + + speaker_rec = any(['use_speaker_rec' in arg for arg in sys.argv[2:]]) + listener_rec = any(['use_listener_rec' in arg for arg in sys.argv[2:]]) + listener_comm_rec = 
any(['use_listener_comm_rec' in arg for arg in sys.argv[2:]]) + + speaker_rec_biasing = any(['speaker_rec_biasing' in arg for arg in sys.argv[2:]]) + listener_rec_biasing = any(['listener_rec_biasing' in arg for arg in sys.argv[2:]]) + listener_comm_rec_biasing = any(['listener_comm_rec_biasing' in arg for arg in sys.argv[2:]]) + + use_rule_based_agent = any(['rule_based_agent' in arg for arg in sys.argv[2:]]) + use_speaker_rule_based_agent = any(['speaker_rule_based_agent' in arg for arg in sys.argv[2:]]) + if use_rule_based_agent: + agent_config['vdn'] = False + agent_config['sad'] = False + task_config['vdn'] = False + task_config['vdn'] = False + + override_seed_argv_idx = [idx for idx, arg in enumerate(sys.argv) if '--seed' in arg] + if len(override_seed_argv_idx): + seed = int(sys.argv[override_seed_argv_idx[0]+1]) + print(f"NEW RANDOM SEED: {seed}") + + override_reload_argv = [idx for idx, arg in enumerate(sys.argv) if '--reload_path' in arg] + if len(override_reload_argv): + task_config["reload"] = sys.argv[override_reload_argv[0]+1] + print(f"NEW RELOAD PATH: {task_config['reload']}") + + path_suffix_argv = [idx for idx, arg in enumerate(sys.argv) if '--path_suffix' in arg] + if len(path_suffix_argv): + path_suffix = sys.argv[path_suffix_argv[0]+1] + print(f"ADDITIONAL PATH SUFFIX: {path_suffix}") + + obs_budget_argv = [idx for idx, arg in enumerate(sys.argv) if '--obs_budget' in arg] + if len(obs_budget_argv): + train_observation_budget = int(sys.argv[obs_budget_argv[0]+1]) + print(f"TRAINING OBSERVATION BUDGET: {train_observation_budget}") + + + task_config["otherplay"] = any(['--otherplay' in arg for arg in sys.argv[2:]]) + + if test_only: + base_path = os.path.join(base_path,"TESTING") + else: + base_path = os.path.join(base_path,"TRAINING") + + if use_rule_based_agent: + base_path = os.path.join(base_path, f"WithPosDis{'Speaker' if use_speaker_rule_based_agent else 'Listener'}RBAgent") + + if pubsub: + base_path = os.path.join(base_path,"PUBSUB") + else: + base_path = os.path.join(base_path,"NOPUBSUB") + + if speaker_rec: + base_path = os.path.join(base_path,f"SpeakerReconstruction{'+Biasing-1p3' if speaker_rec_biasing else ''}-BigArch") + if listener_rec: + base_path = os.path.join(base_path,f"ListenerReconstruction{'+Biasing-1p0' if listener_rec_biasing else ''}-BigArch") + if listener_comm_rec: + base_path = os.path.join(base_path,f"ListenerCommunicationChannelReconstruction{'+Biasing-1p0' if listener_comm_rec_biasing else ''}-BigArch") + + + if task_config["otherplay"]: + base_path = os.path.join(base_path,"OtherPlay") + + base_path = os.path.join(base_path,f"SEED{seed}") + + if path_suffix is not None: + base_path = os.path.join(base_path, path_suffix) + + print(f"Final Path: -- {base_path} --") + import ipdb; ipdb.set_trace() + + if not os.path.exists(base_path): os.makedirs(base_path) + + task_config['final_path'] = base_path + task_config['command_line'] = ' '.join(sys.argv) + print(task_config['command_line']) + yaml.dump( + task_config, + open( + os.path.join(base_path, "task_config.yaml"), 'w', + encoding='utf8', + ), + ) + yaml.dump( + agent_config, + open( + os.path.join(base_path, "agent_config.yaml"), 'w', + encoding='utf8', + ), + ) + + np.random.seed(seed) + torch.manual_seed(seed) + random.seed(seed) + if hasattr(torch.backends, "cudnn"): + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + pixel_wrapping_fn = partial( + s2b_r2d2_wrap, + clip_reward=task_config['clip_reward'], + 
previous_reward_action=task_config.get('previous_reward_action', False), + otherplay=task_config.get("otherplay", False), + ) + + test_pixel_wrapping_fn = pixel_wrapping_fn + """ + partial( + baseline_atari_pixelwrap, + size=task_config['observation_resize_dim'], + skip=task_config['nbr_frame_skipping'], + stack=task_config['nbr_frame_stacking'], + grayscale=task_config['grayscale'], + single_life_episode=False, + nbr_max_random_steps=task_config['nbr_max_random_steps'], + clip_reward=False, + previous_reward_action=task_config.get('previous_reward_action', False) + ) + """ + video_recording_dirpath = os.path.join(base_path,'videos') + video_recording_render_mode = 'human_comm' + task = generate_task(task_config['env-id'], + env_type=EnvType.MULTIAGENT_SIMULTANEOUS_ACTION, + nbr_parallel_env=task_config['nbr_actor'], + wrapping_fn=pixel_wrapping_fn, + test_wrapping_fn=test_pixel_wrapping_fn, + env_config=task_config['env-config'], + test_env_config=task_config['env-config'], + seed=seed, + test_seed=100+seed, + gathering=True, + train_video_recording_episode_period=benchmarking_record_episode_interval, + train_video_recording_dirpath=video_recording_dirpath, + train_video_recording_render_mode=video_recording_render_mode, + ) + + agent_config['nbr_actor'] = task_config['nbr_actor'] + + regym.RegymSummaryWriterPath = base_path #regym.RegymSummaryWriter = GlobalSummaryWriter(base_path) + sum_writer = base_path + + save_path1 = os.path.join(base_path,f"./{task_config['agent-id']}.agent") + if task_config.get("reload", 'None')!='None': + agent, offset_episode_count = check_path_for_agent(task_config["reload"]) + else: + agent, offset_episode_count = check_path_for_agent(save_path1) + + if agent is None: + agent = initialize_agents( + task=task, + agent_configurations={task_config['agent-id']: agent_config} + )[0] + agent.save_path = save_path1 + + if test_only: + print(save_path1) + agent.training = False + + if "vdn" in agent_config \ + and agent_config["vdn"]: + agents = [agent] + else: + if use_rule_based_agent: + if use_speaker_rule_based_agent: + rb_agent = build_WrappedPositionallyDisentangledSpeakerAgent( + player_idx=0, + action_space_dim=task.env.action_space.n, + vocab_size=task.env.unwrapped_env.unwrapped.vocab_size, + max_sentence_length=task.env.unwrapped_env.unwrapped.max_sentence_length, + nbr_communication_rounds=task.env.unwrapped_env.unwrapped.nbr_communication_rounds, + nbr_latents=task.env.unwrapped_env.unwrapped.nbr_latents, + + ) + agents = [rb_agent, agent] + else: + rb_agent = build_WrappedPositionallyDisentangledListenerAgent( + player_idx=1, + action_space_dim=task.env.action_space.n, + vocab_size=task.env.unwrapped_env.unwrapped.vocab_size, + max_sentence_length=task.env.unwrapped_env.unwrapped.max_sentence_length, + nbr_communication_rounds=task.env.unwrapped_env.unwrapped.nbr_communication_rounds, + nbr_latents=task.env.unwrapped_env.unwrapped.nbr_latents, + + ) + agents = [agent, rb_agent] + else: + player2_harvest = False + + if len(sys.argv) > 2: + player2_harvest = any(['player2_harvest' in arg for arg in sys.argv]) + + agents = [agent, agent.get_async_actor(training=player2_harvest)] + # We can create non-training or training async actors. + # If traininging, then their experience is added to the replay buffer + # of the main agent, which might have some advantanges + # -given that it proposes decorrelated data-, but it may + # also have unknown disadvantages. Needs proper investigation. 
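+    # train_and_evaluate below dispatches on the CLI flags: with 'pubsub' among the arguments it
+    # builds the PubSubManager pipeline defined above, otherwise it falls back to marl_loop's
+    # (a)synchronous gather_experience_parallel loops; in every case the trained agents are
+    # saved at the end (with their replay buffer when 'save_replay_buffer' is passed).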
+ + + trained_agents = train_and_evaluate( + agents=agents, + task=task, + sum_writer=sum_writer, + base_path=base_path, + offset_episode_count=offset_episode_count, + nbr_pretraining_steps=int(float(agent_config["nbr_pretraining_steps"])) if "nbr_pretraining_steps" in agent_config else 0, + nbr_max_observations=train_observation_budget, + test_obs_interval=benchmarking_interval, + test_nbr_episode=benchmarking_episodes, + #benchmarking_record_episode_interval=None, + benchmarking_record_episode_interval=benchmarking_record_episode_interval, + render_mode="human_comm", + sad=task_config["sad"], + vdn=task_config["vdn"], + otherplay=task_config.get("otherplay", False), + speaker_rec=speaker_rec, + listener_rec=listener_rec, + listener_comm_rec=listener_comm_rec, + ) + + return trained_agents, task + + +def load_configs(config_file_path: str): + all_configs = yaml.load(open(config_file_path)) + + agents_config = all_configs['agents'] + experiment_config = all_configs['experiment'] + envs_config = experiment_config['tasks'] + + return experiment_config, agents_config, envs_config + + +def main(): + logging.basicConfig(level=logging.INFO) + logger = logging.getLogger('Symbolic Behaviour Benchmark') + + config_file_path = sys.argv[1] #'./atari_10M_benchmark_config.yaml' + experiment_config, agents_config, tasks_configs = load_configs(config_file_path) + + # Generate path for experiment + base_path = experiment_config['experiment_id'] + if not os.path.exists(base_path): os.makedirs(base_path) + + for task_config in tasks_configs: + agent_name = task_config['agent-id'] + env_name = task_config['env-id'] + run_name = task_config['run-id'] + path = f'{base_path}/{env_name}/{run_name}/{agent_name}' + print(f"Tentative Path: -- {path} --") + training_process(agents_config[task_config['agent-id']], task_config, + benchmarking_interval=int(float(experiment_config['benchmarking_interval'])), + benchmarking_episodes=int(float(experiment_config['benchmarking_episodes'])), + benchmarking_record_episode_interval=int(float(experiment_config['benchmarking_record_episode_interval'])) if experiment_config['benchmarking_record_episode_interval']!='None' else None, + train_observation_budget=int(float(experiment_config['train_observation_budget'])), + base_path=path, + seed=experiment_config['seed']) + +if __name__ == '__main__': + asynch = False + __spec__ = None + if len(sys.argv) > 2: + asynch = any(['async' in arg for arg in sys.argv]) + if asynch: + torch.multiprocessing.freeze_support() + torch.multiprocessing.set_start_method("forkserver", force=True) + #torch.multiprocessing.set_start_method("spawn", force=True) + ray.init() #local_mode=True) + + from regym import CustomManager as Manager + from multiprocessing.managers import SyncManager, MakeProxyType, public_methods + + # from regym.rl_algorithms.replay_buffers import SharedPrioritizedReplayStorage + # #SharedPrioritizedReplayStorageProxy = MakeProxyType("SharedPrioritizedReplayStorage", public_methods(SharedPrioritizedReplayStorage)) + # Manager.register("SharedPrioritizedReplayStorage", + # SharedPrioritizedReplayStorage,# SharedPrioritizedReplayStorageProxy) + # exposed=[ + # "get_beta", + # "get_tree_indices", + # "cat", + # "reset", + # "add_key", + # "total", + # "__len__", + # "priority", + # "sequence_priority", + # "update", + # "add", + # "sample", + # ] + # ) + # print("WARNING: SharedPrioritizedReplayStorage class has been registered with the RegymManager.") + + regym.RegymManager = Manager() + regym.RegymManager.start() + + main() diff 
--git a/benchmark/R2D2/SymbolicBehaviourBenchmark/rl_hiddenstate_policy.py b/benchmark/R2D2/SymbolicBehaviourBenchmark/rl_hiddenstate_policy.py new file mode 100644 index 00000000..95902b76 --- /dev/null +++ b/benchmark/R2D2/SymbolicBehaviourBenchmark/rl_hiddenstate_policy.py @@ -0,0 +1,125 @@ +from typing import List, Dict, Optional + +import torch +import torch.nn as nn + +from regym.rl_algorithms.agents.agent import Agent +from regym.rl_algorithms.utils import copy_hdict + +def extract_subtrees( + in_dict: Dict, + node_id: str): + ''' + Extracts a copy of subtree whose root is named :param node_id: from :param in_dict:. + ''' + queue = [in_dict] + pointer = None + + subtrees = [] + + while len(queue): + pointer = queue.pop(0) + if not isinstance(pointer, dict): continue + for k in pointer.keys(): + if node_id==k: + subtrees.append( + copy_hdict(pointer[k]) + ) + else: + queue.append(pointer[k]) + + return subtrees + +class RLHiddenStatePolicy(nn.Module): + def __init__( + self, + agent:Agent + ): + """ + + """ + super(RLHiddenStatePolicy, self).__init__() + self.model = agent + # TODO remove or update the following as it does not matter + # since we are only ever using one such actor... + self.player_idx = 0 + + def get_hiddens(self, info=None, from_pred=None): + if from_pred is None: + rnn_states = self.model.get_rnn_states() + else: + rnn_states = from_pred['next_rnn_states'] + # Extract 'hidden''s list: + hiddens = extract_subtrees(in_dict=rnn_states, node_id='hidden') + # List[List[Tensor]] + + vdn = self.model.kwargs.get('vdn', False) + vdn_nbr_players = self.model.kwargs.get('vdn_nbr_players', 2) + + nbr_rnn_modules = len(hiddens[0]) + batch_size = hiddens[0][0].shape[0] + + mult = 0 + if vdn and batch_size!=1: + batch_size = batch_size // vdn_nbr_players + mult = self.player_idx + + hiddens = torch.stack( + [ + torch.cat( + [hiddens[0][part_id][mult*batch_size+actor_id].reshape(-1) for part_id in range(nbr_rnn_modules)], + dim=0, + ) + for actor_id in range(batch_size) + ], + dim=0, + ) + # batch_size x nbr_parts*hidden_dims + + return hiddens + + def get_hidden_state_dim(self): + hiddens = self.get_hiddens() + return hiddens.shape[-1] + + def clone(self, training=False): + return RLHiddenStatePolicy( + agent=self.model.clone(training=training), + ) + + def reset(self, batch_size:int, training:Optional[bool]=False): + self.model.set_nbr_actor(batch_size, vdn=False, training=training) + + def save_inner_state(self): + self.saved_inner_state = self.model.get_rnn_states() + + def restore_inner_state(self): + self.model.set_rnn_states(self.saved_inner_state) + + def get_nbr_actor(self): + return self.model.get_nbr_actor() + + def forward(self, x:object): + """ + :param x: + Object representing the observation of the current agent. + e.g.: the object can be a kwargs argument containing + expected argument to the model. + + Here, x:Dict containing the keys: + -'state': torch.Tensor containing the environment state. + -'infos': Dict containing the entry 'abstract_repr' that is + actually used by the :param model:RuleBasedAgentWrapper. + + :return hiddens: + torch.Tensor concatenations of hidden states. 
+ """ + + #log_p_a = self.model.query_action(**x) + pred_dict = self.model.query_action(**x) + # batch_size x action_space_dim + + hiddens = self.get_hiddens(info=x.get('infos', None), from_pred=pred_dict) + # batch_size x nbr_parts*hidden_dims + extra_dim if self.augmented + + return hiddens diff --git a/benchmark/R2D2/SymbolicBehaviourBenchmark/s2b_r2d2_sad_vdn_benchmark_config.yaml b/benchmark/R2D2/SymbolicBehaviourBenchmark/s2b_r2d2_sad_vdn_benchmark_config.yaml new file mode 100644 index 00000000..3aee9e05 --- /dev/null +++ b/benchmark/R2D2/SymbolicBehaviourBenchmark/s2b_r2d2_sad_vdn_benchmark_config.yaml @@ -0,0 +1,483 @@ +extra_hyperparameters: &extra_hyperparameters + lr_account_for_nbr_actor: False + weights_decay_lambda: 0.0 + weights_entropy_lambda: 0.0 #01 + use_target_to_gather_data: False + + #################################### + # New hyperparameters: + PER_compute_initial_priority: False + ##################################### + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: True + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + r2d2_bellman_target_SAD: False + + burn_in: True + sequence_replay_unroll_length: 80 + sequence_replay_overlap_length: 40 + sequence_replay_burn_in_length: 20 + + sequence_replay_PER_eta: 0.9 + + vdn: False + vdn_nbr_players: 2 + +LargeMLP: &LargeMLP + phi_arch: 'MLP' + actor_arch: 'None' + critic_arch: 'LSTM-RNN' + + # Phi Body: + #phi_arch_channels: ['BN32', 'BN64', 'BN64'] + #phi_arch_kernels: [8, 4, 3] + #phi_arch_strides: [4, 2, 1] + #phi_arch_paddings: [1, 1, 1] + phi_arch_feature_dim: 128 + phi_arch_hidden_units: [512,256] + + extra_inputs_infos: { + 'previous_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + + 'communication_channel':{ + shape: [55,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'other_agent_id':{ + shape: [10,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'role_id':{ + shape: [2,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_game_result':{ + shape: [2,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_game_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'] + }, + + 'action_mask':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'legal_actions':{ + shape: ['task.action_dim',], + target_location: ['head', 'extra_inputs'] + }, + + } + + # Dictionnaries of keys living inside the 'infos' OpenAI Gym's output. + # Value is a tuple where the first element is the expected shape of the extra input, + # and the second item is the location where the input should be stored in the framestate. + # Parsing of the shape will infer where to fetch the value when encountering a string. 
+ + # Actor architecture: + actor_arch_hidden_units: [] + # Critic architecture: + #critic_arch_feature_dim: 32 + critic_arch_hidden_units: [128, 128] + +LargeMLP_SAD: &LargeMLP_SAD + sad: True + + phi_arch: 'MLP' #-LSTM-RNN' + actor_arch: 'None' + critic_arch: 'LSTM-RNN' + + # Phi Body: + #phi_arch_channels: ['BN32', 'BN64', 'BN64'] + #phi_arch_channels: [32, 64, 64] + #phi_arch_kernels: [8, 4, 3] + #phi_arch_kernels: [3, 3, 3] + #phi_arch_strides: [4, 2, 1] + #phi_arch_strides: [2, 2, 1] + #phi_arch_paddings: [1, 1, 1] + phi_arch_feature_dim: 128 + phi_arch_hidden_units: [512, 256] + + extra_inputs_infos: { + 'previous_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + + 'communication_channel':{ + shape: [55,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'round_id':{ + shape: [4,], + target_location: ['critic_body', 'extra_inputs'] + }, + + 'other_agent_id':{ + shape: [10,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'role_id':{ + shape: [2,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_game_result':{ + shape: [2,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_game_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'] + }, + + 'action_mask':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'legal_actions':{ + shape: ['task.action_dim',], + target_location: ['head', 'extra_inputs'] + }, + + ######################## + # WITH SAD: + ######################## + 'greedy_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + ######################## + + } + + # Dictionnaries of keys living inside the 'infos' OpenAI Gym's output. + # Value is a tuple where the first element is the expected shape of the extra input, + # and the second item is the location where the input should be stored in the framestate. + # Parsing of the shape will infer where to fetch the value when encountering a string. + + # Actor architecture: + actor_arch_hidden_units: [] + # Critic architecture: + #critic_arch_feature_dim: 32 + critic_arch_hidden_units: [128, 128] + + +r2d2_LargeMLPLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_LargeMLPLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + #observation_resize_dim: 21 #56 + + dueling: True + noisy: False + n_step: 3 + + use_PER: True + PER_alpha: 0.9 + PER_beta: 0.6 + + replay_capacity: 5242880 # in terms of experience #1e6 + min_capacity: 4e5 #in terms of experiences... #1e4 + replay_period: 1 + + actor_models_update_steps_interval: 10 #considering only 1 actor's steps. 
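+  # (presumably: how often, counted in a single actor's steps, the acting copies of the model are refreshed from the learner)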
+ + discount: 0.999 + use_cuda: True + gradient_clip: 0.5 + batch_size: 128 + tau: 4.0e-4 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + + epsstart: 1.0 + epsend: 0.1 + epsdecay: 10000 + eps_greedy_alpha: 7.0 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: False + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + + burn_in: False + sequence_replay_unroll_length: 40 + sequence_replay_overlap_length: 10 + sequence_replay_burn_in_length: 0 + + sequence_replay_PER_eta: 0.9 + + <<: *LargeMLP_SAD + <<: *extra_hyperparameters + + +experiment: + tasks: [{ + 'env-id': 'SymbolicBehaviourBenchmark-ReceptiveConstructiveTestEnv-v0', + 'env-config': { + "nbr_communication_rounds": 3, + "vocab_size": 6, + "max_sentence_length": 1, + "nbr_latents": 3, + "min_nbr_values_per_latent": 2, + "max_nbr_values_per_latent": 5, + "nbr_object_centric_samples": 1, + "nbr_distractors": 3, + "use_communication_channel_permutations": False, + "allow_listener_query": False, + }, + + 'run-id': 'Train-Reward1/venv64/V6-MSL1-NCR3-L3Min2Max5-Distr3-NoCommPerm', + + 'agent-id': '3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L150_O0_B0_NOZeroInitSt_OnlineSt_StoreOnDone', + + #'nbr_actor': 128, + #'nbr_actor': 100, + 'nbr_actor': 64, + #'nbr_actor': 32, + #'nbr_actor': 16, + #'nbr_actor': 8, + #'nbr_frame_skipping': 4, + #'nbr_frame_stacking': 4, + #'grayscale': True, + #'single_life_episode': True, #False, + #'nbr_max_random_steps': 30, + #'sad': False, + 'sad': True, + #'vdn': False, + 'vdn': True, + #"otherplay": True, + "otherplay": False, + 'clip_reward': False, + 'previous_reward_action': True, + #'observation_resize_dim': (21,21), #(56,56), + 'observation_resize_dim': 56, #(56,56), + # + 'reload': 'None', + }, + ] + experiment_id: 'r2d2_s2b_debug' + benchmarking_episodes: 10 + benchmarking_interval: 1.0e3 + benchmarking_record_episode_interval: 'None' #1.0e1 #1.0e20 + #benchmarking_record_episode_interval: 1.0e20 + train_observation_budget: 1.0e7 + seed: 1 + +agents: + SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeMLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone: &SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeMLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone + <<: *r2d2_LargeMLPLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0 + actor_models_update_steps_interval: 1 #considering only 1 actor's steps. + + vdn: True + vdn_nbr_players: 2 + + batch_size: 32 + learning_rate: 6.25e-5 + adam_eps: 1.5e-5 + discount: 0.997 + gradient_clip: 5.0 + # ...not specified in r2d2 paper but in Ape-X, + # and r2d2 paper says that missing hyper-param + # are the same as ape-X + + replay_capacity: 5e4 #163840 #2e13*20 #5242880 # in terms of experience #1e6 + min_capacity: 2e4 #in terms of experiences... 
#1e4 + + PER_compute_initial_priority: False + PER_beta_increase_interval: None #2e5 + + double: True + dueling: True + noisy: False + n_step: 3 + tau: 4.0e-4 + + sequence_replay_use_online_states: True + sequence_replay_use_zero_initial_states: False + sequence_replay_store_on_terminal: True + + r2d2_loss_masking: True + r2d2_loss_masking_n_step_regularisation: True + r2d2_bellman_target_SAD: False + + burn_in: False + sequence_replay_unroll_length: 100 + sequence_replay_overlap_length: 0 + sequence_replay_burn_in_length: 0 + + + epsstart: 1.0 + epsend: 0.05 + epsdecay: 30000 #1000000 + + # ape-X and r2d2 keep it constant over each actor + # with a different value eps_i = base_eps**(1+\alpha*i/nbr_actors) + # with base_eps=0.4 and \alpha = 7... + eps_greedy_alpha: 7.0 + + 3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L150_O0_B0_NOZeroInitSt_OnlineSt_StoreOnDone: + <<: *SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeMLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone + # DEBUG: + min_capacity: 1e3 + + #weights_entropy_lambda: 0.0 + #weights_entropy_lambda: 0.1 + weights_entropy_lambda: 0.001 #01 + + #vdn: False + vdn: True + vdn_nbr_players: 2 + #sad: False + sad: True + + + learning_rate: 6.25e-5 + #adam_eps: 1.5e-5 + #learning_rate: 1.0e-3 + #adam_eps: 1.0e-8 + adam_eps: 1.0e-12 + #adam_eps: 1.0e-15 + + replay_capacity: 5e4 #163840 #2e13*20 #5242880 # in terms of experience #1e6 + #replay_capacity: 1e5 + #min_capacity: 3e4 #in terms of experiences... #1e4 + + n_step: 3 + #n_step: 7 + + #tau: 4.0e-4 + #tau: 1.0e-5 + + #sequence_replay_overlap_length: 0 + #sequence_replay_overlap_length: 50 + + batch_size: 128 + + burn_in: False + #burn_in: True + + sequence_replay_unroll_length: 150 + sequence_replay_overlap_length: 0 + sequence_replay_burn_in_length: 0 + # #sequence_replay_burn_in_length: 10 + + # sequence_replay_unroll_length: 100 + # sequence_replay_overlap_length: 50 + # sequence_replay_burn_in_length: 0 + + epsend: 0.4 + eps_greedy_alpha: 2.0 + + # Architecture: + #critic_arch: 'LSTM-RNN' + #critic_arch_hidden_units: [512, 512] + #critic_arch_hidden_units: [512] + #use_relu_after_rnn: False + + # normal arch: + # critic_arch: 'MLP-LSTM-RNN' + # use_relu_after_rnn: True + # #use_relu_after_rnn: False + # critic_arch_feature_dim: 512 + # critic_arch_hidden_units: [512] + + # Arch2: + critic_arch: 'MLP-LSTM-RNN2' + use_relu_after_rnn: False #True + use_residual_connection: True + critic_arch_linear_hidden_units: [512, 256] + critic_arch_feature_dim: 128 + critic_arch_hidden_units: [128, 128] + + # Arch 3: + # critic_arch: 'MLP-LSTM-RNN2' + # use_relu_after_rnn: True + # critic_arch_linear_hidden_units: [128] + # critic_arch_feature_dim: 64 + # critic_arch_hidden_units: [64] + + #Arch 4: + # critic_arch: 'MLP-LSTM-RNN2' + # use_relu_after_rnn: True + # critic_arch_linear_hidden_units: [512, 256] + # critic_arch_hidden_units: [256] + # critic_arch_linear_post_hidden_units: [256] + # critic_arch_feature_dim: 128 + + extra_inputs_infos: { + 'previous_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_action':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + + 'communication_channel':{ + shape: [7,], + target_location: 
['critic_body', 'extra_inputs'] + }, + 'round_id':{ + shape: [4,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'other_agent_id':{ + shape: [10,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'role_id':{ + shape: [2,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_game_result':{ + shape: [2,], + target_location: ['critic_body', 'extra_inputs'] + }, + 'previous_game_reward':{ + shape: [1,], + target_location: ['critic_body', 'extra_inputs'] + }, + + 'action_mask':{ + shape: ['task.action_dim',], + target_location: ['critic_body', 'extra_inputs'] + }, + 'legal_actions':{ + shape: ['task.action_dim',], + target_location: ['head', 'extra_inputs'] + }, + + ######################## + # WITH SAD: + ######################## + 'greedy_action':{ + shape: [31,], #[6223,], + target_location: ['critic_body', 'extra_inputs'] + }, + ######################## + ######################## + + } + + + + diff --git a/regym/__init__.py b/regym/__init__.py index b439ac86..673300e0 100755 --- a/regym/__init__.py +++ b/regym/__init__.py @@ -26,3 +26,4 @@ def set(self, val): from . import rl_loops from . import rl_algorithms from . import logging_server +from . import pubsub_manager \ No newline at end of file diff --git a/regym/environments/envs/CoMaze/comaze_install.sh b/regym/environments/envs/CoMaze/comaze_install.sh new file mode 100755 index 00000000..b4dc9e89 --- /dev/null +++ b/regym/environments/envs/CoMaze/comaze_install.sh @@ -0,0 +1,4 @@ +git clone https://github.com/Near32/comaze-gym +cd comaze-gym +pip install -e . + diff --git a/regym/environments/envs/gym_envs/n_bits_swap_env.py b/regym/environments/envs/gym_envs/n_bits_swap_env.py index 8d9bbe93..6159ae6b 100644 --- a/regym/environments/envs/gym_envs/n_bits_swap_env.py +++ b/regym/environments/envs/gym_envs/n_bits_swap_env.py @@ -47,7 +47,16 @@ def reset(self): self.state = self.np_random.randint(2, size=self.n) if not self.fixed_goal: self.goal = self.np_random.randint(2, size=self.n) - return self._get_obs() + + info = {'latents': + { 's': [self.state.copy()], + 'succ_s': [self.state.copy()], + 'achieved_goal': [self.state.copy()], + 'desired_goal': [self.goal.copy()], + } + } + + return self._get_obs(), info def step(self, action): assert(action < self.n) @@ -60,10 +69,10 @@ def step(self, action): terminal = True if reward >= -0.5 or self.nbr_steps >= self.max_episode_steps else False info = {'latents': - { 's': init_state.copy(), - 'succ_s': self.state.copy(), - 'achieved_goal': self.state.copy(), - 'desired_goal': self.goal.copy() + { 's': [init_state.copy()], + 'succ_s': [self.state.copy()], + 'achieved_goal': [self.state.copy()], + 'desired_goal': [self.goal.copy()], } } diff --git a/regym/environments/parse_environment.py b/regym/environments/parse_environment.py index f9f2c987..7613e4ef 100755 --- a/regym/environments/parse_environment.py +++ b/regym/environments/parse_environment.py @@ -1,3 +1,5 @@ +from typing import Dict, Any + import gym from .gym_parser import parse_gym_environment @@ -12,7 +14,9 @@ def generate_task(env_name: str, env_type: EnvType = EnvType.SINGLE_AGENT, nbr_parallel_env: int = 1, wrapping_fn: object = None, - test_wrapping_fn: object = None, + test_wrapping_fn: object = None, + env_config: Dict[str,Any] = {}, + test_env_config: Dict[str,Any] = {}, seed: int = 0, test_seed: int = 1, train_video_recording_episode_period: int = None, @@ -51,7 +55,7 @@ def generate_task(env_name: str, env = None if is_gym_environment and is_unity_environment: raise ValueError(f'{env_name} 
exists as both a Gym and an Unity environment. Rename Unity environment to remove duplicate problem.') elif is_gym_environment: - env = gym.make(env_name) + env = gym.make(env_name, **env_config) env.seed(seed) if wrapping_fn is not None: env = wrapping_fn(env=env) @@ -59,8 +63,8 @@ def generate_task(env_name: str, elif is_unity_environment: task = parse_unity_environment(env_name, env_type) else: raise ValueError(f'Environment \'{env_name}\' was not recognized as either a Gym nor a Unity environment') - env_creator = EnvironmentCreator(env_name, is_unity_environment, is_gym_environment, wrapping_fn=wrapping_fn) - test_env_creator = EnvironmentCreator(env_name, is_unity_environment, is_gym_environment, wrapping_fn=test_wrapping_fn) + env_creator = EnvironmentCreator(env_name, is_unity_environment, is_gym_environment, wrapping_fn=wrapping_fn, env_config=env_config) + test_env_creator = EnvironmentCreator(env_name, is_unity_environment, is_gym_environment, wrapping_fn=test_wrapping_fn, env_config=test_env_config) task = Task(task.name, #ParallelEnv(env_creator, nbr_parallel_env, seed=seed), diff --git a/regym/environments/utils.py b/regym/environments/utils.py index f6da84b4..3459edee 100755 --- a/regym/environments/utils.py +++ b/regym/environments/utils.py @@ -1,15 +1,16 @@ import gym class EnvironmentCreator(): - def __init__(self, environment_name_cli, is_unity_environment, is_gym_environment, wrapping_fn=None): + def __init__(self, environment_name_cli, is_unity_environment, is_gym_environment, wrapping_fn=None, env_config={}): self.environment_name = environment_name_cli self.is_unity_environment = is_unity_environment self.is_gym_environment = is_gym_environment self.wrapping_fn = wrapping_fn + self.env_config = env_config def __call__(self, worker_id=None, seed=0): - if self.is_gym_environment: - env = gym.make(self.environment_name) + if self.is_gym_environment: + env = gym.make(self.environment_name, **self.env_config) env.seed(seed) if self.wrapping_fn is not None: env = self.wrapping_fn(env=env) return env diff --git a/regym/environments/vec_env.py b/regym/environments/vec_env.py index 17eaa6b3..89e3d8ba 100644 --- a/regym/environments/vec_env.py +++ b/regym/environments/vec_env.py @@ -55,7 +55,12 @@ def action_space(self): if self.env_processes[0] is None: self.launch_env_process(idx=0) return self.env_processes[0].action_space - + @property + def unwrapped_env(self): + if self.env_processes[0] is None: + self.launch_env_process(idx=0) + return self.env_processes[0] + def seed(self, seed): self.seed = seed @@ -123,6 +128,16 @@ def get_from_queue(self, idx, exhaust_first_when_failure=False): def put_action_in_queue(self, action, idx): self.env_queues[idx]['out'] = self.env_processes[idx].step(action) + def render(self, render_mode="rgb_array", env_indices=None) : + if env_indices is None: env_indices = range(self.nbr_parallel_env) + + observations = [] + for idx in env_indices: + obs = self.env_processes[idx].render(render_mode) + observations.append(obs) + + return observations + def reset(self, env_configs=None, env_indices=None) : if env_indices is None: env_indices = range(self.nbr_parallel_env) @@ -181,7 +196,12 @@ def reset(self, env_configs=None, env_indices=None) : self.dones[idx] = False self.init_reward = [] - return copy.deepcopy([per_env_obs, per_env_infos]) + output_dict = { + "observations":per_env_obs, + "info":per_env_infos + } + + return copy.deepcopy(output_dict) def step(self, action_vector, only_progress_non_terminated=True): observations = [] @@ -213,13 
+233,17 @@ def step(self, action_vector, only_progress_non_terminated=True): obs, r, done, info = experience if len(self.init_reward) Module: + return CoMazeGoalOrderingPredictionModule( + id=id, + config=config, + input_stream_ids=input_stream_ids + ) + + +class CoMazeGoalOrderingPredictionModule(Module): + def __init__( + self, + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None + ): + """ + """ + default_input_stream_ids = { + "logs_dict":"logs_dict", + "losses_dict":"losses_dict", + "epoch":"signals:epoch", + "mode":"signals:mode", + + "vocab_size":"config:vocab_size", + "max_sentence_length":"config:max_sentence_length", + + "trajectories":"modules:environment_module:trajectories", + "filtering_signal":"modules:environment_module:new_trajectories_published", + + "current_agents":"modules:current_agents:ref", + + # "observations":"modules:environment_module:observations", + # "infos":"modules:environment_module:info", + # "actions":"modules:environment_module:actions", + # "dones":"modules:environment_module:done", + } + + if input_stream_ids is None: + input_stream_ids = default_input_stream_ids + else: + for default_id, default_stream in default_input_stream_ids.items(): + if default_id not in input_stream_ids: + input_stream_ids[default_id] = default_stream + + super(CoMazeGoalOrderingPredictionModule, self).__init__( + id=id, + type="CoMazeGoalOrderingPredictionModule", + config=config, + input_stream_ids=input_stream_ids + ) + + self.biasing = self.config.get('biasing', False) + self.nbr_players = self.config.get('nbr_players', 2) + self.player_id = self.config.get('player_id', 0) + self.metric = self.config['metric'] + + self.iteration = 0 + self.sampling_fraction = 5 + self.sampling_period = 10.0 + + self.secretgoalStr2id = {"RED":0, "YELLOW":1, "BLUE":2, "GREEN":3} + #self.id2SecretgoalStr = dict(zip(self.secretgoalStr2id.values(), self.secretgoalStr2id.keys())) + """ + self.labels = {} + label_id = 0 + for g1 in range(5): + for g2 in range(5): + for g3 in range(5): + for g4 in range(5): + self.labels[[g1,g2,g3,g4]] = label_id + label_id += 1 + """ + + def parameters(self): + return self.metric.prediction_net.parameters() + + def build_goal_ordering_label(self, info_dict): + reached_goals_str = info_dict['abstract_repr']['reached_goals'] + reached_goals_ids = [self.secretgoalStr2id[g] for g in reached_goals_str] + ''' + Issues: + 1) not necessarily reaching all goals + ''' + # Adding dummy goal if unreached: + dummy_goal_id = 4 + while len(reached_goals_ids)<4: reached_goals_ids.append(dummy_goal_id) + + return torch.Tensor(reached_goals_ids).reshape((1,4)) + + """label = self.labels[reached_goals_ids] + return label*torch.ones(1) + """ + + def build_rules_label(self, info_dict): + rules_labels = torch.zeros(1,4) + for pid, sgr in enumerate(info_dict['abstract_repr']["secretGoalRule"]): + # earlier: + rules_labels[:,pid*2] = self.secretgoalStr2id[sgr.earlierGoal.color] + # later: + rules_labels[:,pid*2+1] = self.secretgoalStr2id[sgr.laterGoal.color] + + return rules_labels + + def compute(self, input_streams_dict:Dict[str,object]) -> Dict[str,object] : + """ + """ + outputs_stream_dict = {} + + logs_dict = input_streams_dict["logs_dict"] + mode = input_streams_dict["mode"] + epoch = input_streams_dict["epoch"] + + filtering_signal = input_streams_dict["filtering_signal"] + trajectories = input_streams_dict["trajectories"] + #compute = True + compute = True #(epoch >= 50) + + self.iteration += 1 + #if (compute and np.random.random() < 
1.0/self.sampling_period) or filtering_signal: + if compute and (((self.iteration % self.sampling_period) == 0) or filtering_signal): + if filtering_signal: + self.actions = [ + [ + exp[1] + for exp in traj[self.player_id] + ] + for traj in trajectories + ] + # (batch_size, timestep, (, keys if 'sad'==True) 1) + + # Formatting of kwargs: + # - 'state': observations + # - 'infos': infos + self.x = [ + [ + { + 'state':exp[0], # for _ in range(self.nbr_players)], # see environment_module for indices... + 'infos':[exp[6]], # for _ in range(self.nbr_players)], + 'as_logit':True, + } + for exp in traj[self.player_id] + ] + for traj in trajectories + ] + # (batch_size, timestep, keys:values) + + self.goal_ordering_labels = [ + self.build_goal_ordering_label(traj[self.player_id][-1][6]) + for traj in trajectories + ] + + self.rules_labels = [ + self.build_rules_label(traj[self.player_id][-1][6]) + for traj in trajectories + ] + + x = self.x + goal_ordering_labels = self.goal_ordering_labels + rules_labels = self.rules_labels + else: + if not hasattr(self, 'x'): + return outputs_stream_dict + + if filtering_signal: + indices = list(range(len(self.x))) + else: + indices = np.random.choice(list(range(len(self.x))), size=len(self.x)//self.sampling_fraction, replace=False) + + x = [traj for idx, traj in enumerate(self.x) if idx in indices] + goal_ordering_labels = [labels for idx, labels in enumerate(self.goal_ordering_labels) if idx in indices] + rules_labels = [labels for idx, labels in enumerate(self.rules_labels) if idx in indices] + + mask = None + + ## Measure: + output_dict = self.metric.compute_goal_ordering_prediction_loss( + x=x, + y=goal_ordering_labels, + yp=rules_labels, + mask=mask, + biasing=self.biasing, + ) + L_gop = output_dict['l_gop'] + # batch_size + gop_accuracy = output_dict['per_actor_gop_accuracy'] + # batch_size + q1_correct_gop = output_dict['per_actor_acc_distr_q1'] + # batch_size + + + L_rp = output_dict['l_rp'] + # batch_size + rp_accuracy = output_dict['per_actor_rp_accuracy'] + # batch_size + q1_correct_rp = output_dict['per_actor_rp_acc_distr_q1'] + # batch_size + + logs_dict[f"{mode}/{self.id}/GoalOrderingPredictionAccuracy/{'Eval' if filtering_signal else 'Sample'}"] = gop_accuracy.mean() + logs_dict[f"{mode}/{self.id}/CorrectGoalOrderingPrediction-Q1/{'Eval' if filtering_signal else 'Sample'}"] = q1_correct_gop.mean() + + logs_dict[f"{mode}/{self.id}/RulesPredictionAccuracy/{'Eval' if filtering_signal else 'Sample'}"] = rp_accuracy.mean() + logs_dict[f"{mode}/{self.id}/RulesPrediction-Q1/{'Eval' if filtering_signal else 'Sample'}"] = q1_correct_rp.mean() + + losses_dict = input_streams_dict["losses_dict"] + #losses_dict[f"{mode}/{self.id}/GoalOrderingPredictionLoss/{'Eval' if filtering_signal else 'Sample'}"] = [1.0, L_gop] + losses_dict[f"{mode}/{self.id}/RulesPredictionLoss/{'Eval' if filtering_signal else 'Sample'}"] = [1.0, L_rp] + + return outputs_stream_dict + diff --git a/regym/modules/current_agents_module.py b/regym/modules/current_agents_module.py new file mode 100644 index 00000000..bc068376 --- /dev/null +++ b/regym/modules/current_agents_module.py @@ -0,0 +1,39 @@ +from typing import Dict, List + +from regym.modules.module import Module + + +class CurrentAgentsModule(Module): + def __init__(self, + id="current_agents", + agents:List[object]=None): + """ + This is a placeholder for the agents. It must not be part of any pipeline. + + :param id: str defining the ID of the module. + :param agents: List of Agents. 
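+        Other modules can reference these agents through the stream id 'modules:current_agents:ref', as the EnvironmentModule's default input streams do.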
+ """ + + super(CurrentAgentsModule, self).__init__( + id=id, + type="CurrentAgentsModule", + config=None, + input_stream_ids=None + ) + + self.agents = agents + + def set_agents(self, agents): + self.agents = agents + + def parameters(self): + params = [] + for agent in self.agents: + params += agent.parameters() + return params + + def get_input_stream_ids(self): + raise NotImplementedError + + def compute(self, input_streams_dict:Dict[str,object]) -> Dict[str,object] : + raise NotImplementedError \ No newline at end of file diff --git a/regym/modules/environment_module.py b/regym/modules/environment_module.py new file mode 100644 index 00000000..7b594239 --- /dev/null +++ b/regym/modules/environment_module.py @@ -0,0 +1,465 @@ +from typing import Dict, List + +import os +import math +import copy +import time +from tqdm import tqdm +import numpy as np + +import regym +from tensorboardX import SummaryWriter +from regym.util.wrappers import VDNVecEnvWrapper +from regym.util.wrappers import SADVecEnvWrapper + +from regym.rl_algorithms.utils import _extract_from_rnn_states + +from torch.multiprocessing import Process +import ray + +import sys +import gc +import pdb +class ForkedPdb(pdb.Pdb): + """A Pdb subclass that may be used + from a forked multiprocessing child + """ + def interaction(self, *args, **kwargs): + _stdin = sys.stdin + try: + sys.stdin = open('/dev/stdin') + pdb.Pdb.interaction(self, *args, **kwargs) + finally: + sys.stdin = _stdin +#forkedPdb = ForkedPdb() + +from regym.modules.module import Module +from regym.rl_loops.multiagent_loops.marl_loop import test_agent + + +def build_EnvironmentModule( + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None) -> Module: + return EnvironmentModule( + id=id, + config=config, + input_stream_ids=input_stream_ids + ) + + +class EnvironmentModule(Module): + def __init__(self, + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None): + + default_input_stream_ids = { + #"logger":"modules:logger:ref", + #"logs_dict":"logs_dict", + + "iteration":"signals:iteration", + + "current_agents":"modules:current_agents:ref", + } + + if input_stream_ids is None: + input_stream_ids = default_input_stream_ids + else: + for default_id, default_stream in default_input_stream_ids.items(): + if default_id not in input_stream_ids.keys(): + input_stream_ids[default_id] = default_stream + + super(EnvironmentModule, self).__init__( + id=id, + type="EnvironmentModule", + config=config, + input_stream_ids=input_stream_ids + ) + + self.init = False + + self.task = self.config['task'] + self.env = self.task.env + + if self.config.get('sad', False): + self.env = SADVecEnvWrapper(self.env, nbr_actions=self.task.action_dim, otherplay=self.config.get('otherplay', False)) + if self.config.get('vdn', False): + self.env = VDNVecEnvWrapper(self.env, nbr_players=self.config['nbr_players']) + + self.test_env = self.task.test_env + if self.config.get('sad', False): + self.test_env = SADVecEnvWrapper(self.test_env, nbr_actions=self.task.action_dim, otherplay=self.config.get('otherplay', False)) + if self.config.get('vdn', False): + self.test_env = VDNVecEnvWrapper(self.test_env, nbr_players=self.config['nbr_players']) + + + def initialisation(self, input_streams_dict: Dict[str,object]) -> None: + self.init = True + print("Initialization of Environment Module: ...") + + self.observations = None + self.info = None + + self.agents = input_streams_dict["current_agents"].agents + self.sad = self.config.get('sad', False) + self.vdn = 
self.config.get('vdn', False) + + + self.obs_key = "observations" + self.info_key = "info" + self.action_key = "actions" + self.reward_key = "reward" + self.done_key = "done" + self.succ_obs_key = "succ_observations" + self.succ_info_key = "succ_info" + if self.vdn: + self.obs_key = "vdn_observations" + self.info_key = "vdn_info" + self.action_key = "vdn_actions" + self.reward_key = "vdn_reward" + self.done_key = "vdn_done" + self.succ_obs_key = "vdn_succ_observations" + self.succ_info_key = "vdn_succ_info" + + self.nbr_actors = self.env.get_nbr_envs() + self.nbr_players = self.config['nbr_players'] + + self.done = [False]*self.nbr_actors + + for agent in self.agents: + agent.set_nbr_actor(self.nbr_actors) + + self.per_actor_per_player_trajectories = [ + [ + list() for p in range(self.nbr_players) + ] + for a in range(self.nbr_actors) + ] + self.trajectories = list() + self.total_returns = list() + self.positive_total_returns = list() + self.total_int_returns = list() + self.episode_lengths = list() + + self.obs_count = self.agents[0].get_experience_count() if hasattr(self.agents[0], "get_experience_count") else 0 + self.episode_count = 0 + self.episode_count_record = 0 + self.sample_episode_count = 0 + + if isinstance(self.config['sum_writer'], str): + sum_writer_path = os.path.join(self.config['sum_writer'], 'actor.log') + self.sum_writer = SummaryWriter(sum_writer_path, flush_secs=1) + else: + self.sum_writer = self.config['sum_writer'] + + for agent in self.agents: + agent_algo = getattr(agent, "algorithm", None) + if agent_algo is None: continue + if agent.training: + agent_algo.summary_writer = self.sum_writer + else: + agent_algo.summary_writer = None + + self.epoch = 0 + + self.pbar = tqdm( + total=self.config['max_obs_count'], + position=0, + ) + self.pbar.update(self.obs_count) + + print("Initialization of Environment Module: DONE") + + def compute(self, input_streams_dict:Dict[str,object]) -> Dict[str,object] : + """ + """ + outputs_stream_dict = {} + outputs_stream_dict["new_trajectories_published"] = False + + if not self.init: + self.initialisation(input_streams_dict) + + if self.observations is None: + env_reset_output_dict = self.env.reset(env_configs=self.config.get('env_configs', None)) + self.observations = env_reset_output_dict[self.obs_key] + self.info = env_reset_output_dict[self.info_key] + if self.vdn: + self.nonvdn_observations = env_reset_output_dict["observations"] + self.nonvdn_info = env_reset_output_dict["info"] + + actions = [ + agent.take_action( + state=self.observations[agent_idx], + infos=self.info[agent_idx] + ) \ + if agent.training else \ + agent.query_action( + state=self.observations[agent_idx], + infos=self.info[agent_idx] + ) + for agent_idx, agent in enumerate(self.agents) + ] + + env_output_dict = self.env.step(actions) + succ_observations = env_output_dict[self.succ_obs_key] + reward = env_output_dict[self.reward_key] + done = env_output_dict[self.done_key] + succ_info = env_output_dict[self.succ_info_key] + + if self.vdn: + nonvdn_actions = env_output_dict['actions'] + nonvdn_succ_observations = env_output_dict['succ_observations'] + nonvdn_reward = env_output_dict['reward'] + nonvdn_done = env_output_dict['done'] + nonvdn_succ_info = env_output_dict['succ_info'] + + if self.config['training']: + for agent_idx, agent in enumerate(self.agents): + if agent.training: + agent.handle_experience( + s=self.observations[agent_idx], + a=actions[agent_idx], + r=reward[agent_idx], + succ_s=succ_observations[agent_idx], + done=done, + 
infos=self.info[agent_idx], + ) + + if self.sad and isinstance(actions[0], dict): + actions = [ + a["action"] + for a in actions + ] + + for actor_index in range(self.nbr_actors): + self.obs_count += 1 + self.pbar.update(1) + + for hook in self.config['step_hooks']: + for agent in self.agents: + hook(self.env, agent, self.obs_count) + + # Bookkeeping of the actors whose episode just ended: + done_condition = ('real_done' in succ_info[0][actor_index] \ + and succ_info[0][actor_index]['real_done']) \ + or ('real_done' not in succ_info[0][actor_index] \ + and done[actor_index]) + if done_condition: + self.update_count = self.agents[0].get_update_count() + self.episode_count += 1 + self.episode_count_record += 1 + env_reset_output_dict = self.env.reset(env_configs=self.config.get('env_configs', None), env_indices=[actor_index]) + succ_observations = env_reset_output_dict[self.obs_key] + succ_info = env_reset_output_dict[self.info_key] + if self.vdn: + nonvdn_succ_observations = env_reset_output_dict['observations'] + nonvdn_succ_info = env_reset_output_dict['info'] + + for agent_idx, agent in enumerate(self.agents): + agent.reset_actors(indices=[actor_index]) + + # Logging: + self.trajectories.append(self.per_actor_per_player_trajectories[actor_index]) + + # Only care about logging player 0: + player_id = 0 + traj = self.trajectories[-1][player_id] + self.total_returns.append(sum([ exp[2] for exp in traj])) + self.positive_total_returns.append(sum([ exp[2] if exp[2]>0 else 0.0 for exp in traj])) + self.total_int_returns.append(sum([ exp[3] for exp in traj])) + self.episode_lengths.append(len(traj)) + + if self.sum_writer is not None: + self.sum_writer.add_scalar('Training/TotalReturn', self.total_returns[-1], self.episode_count) + self.sum_writer.add_scalar('PerObservation/TotalReturn', self.total_returns[-1], self.obs_count) + self.sum_writer.add_scalar('PerUpdate/TotalReturn', self.total_returns[-1], self.update_count) + + self.sum_writer.add_scalar('Training/PositiveTotalReturn', self.positive_total_returns[-1], self.episode_count) + self.sum_writer.add_scalar('PerObservation/PositiveTotalReturn', self.positive_total_returns[-1], self.obs_count) + self.sum_writer.add_scalar('PerUpdate/PositiveTotalReturn', self.positive_total_returns[-1], self.update_count) + + if actor_index == 0: + self.sample_episode_count += 1 + #sum_writer.add_scalar(f'data/reward_{actor_index}', total_returns[-1], sample_episode_count) + #sum_writer.add_scalar(f'PerObservation/Actor_{actor_index}_Reward', total_returns[-1], obs_count) + #sum_writer.add_scalar(f'PerObservation/Actor_{actor_index}_PositiveReward', positive_total_returns[-1], obs_count) + #sum_writer.add_scalar(f'PerUpdate/Actor_{actor_index}_Reward', total_returns[-1], self.update_count) + #sum_writer.add_scalar('Training/TotalIntReturn', total_int_returns[-1], episode_count) + self.sum_writer.flush() + + if len(self.trajectories) >= self.nbr_actors: + mean_total_return = sum( self.total_returns) / len(self.trajectories) + std_ext_return = math.sqrt( sum( [math.pow( r-mean_total_return ,2) for r in self.total_returns]) / len(self.total_returns) ) + mean_positive_total_return = sum( self.positive_total_returns) / len(self.trajectories) + std_ext_positive_return = math.sqrt( sum( [math.pow( r-mean_positive_total_return ,2) for r in self.positive_total_returns]) / len(self.positive_total_returns) ) + mean_total_int_return = sum( self.total_int_returns) / len(self.trajectories) + std_int_return = math.sqrt( sum( [math.pow( r-mean_total_int_return ,2) for r 
in self.total_int_returns]) / len(self.total_int_returns) ) + mean_episode_length = sum( self.episode_lengths) / len(self.trajectories) + std_episode_length = math.sqrt( sum( [math.pow( l-mean_episode_length ,2) for l in self.episode_lengths]) / len(self.episode_lengths) ) + + if self.sum_writer is not None: + self.sum_writer.add_scalar('Training/StdIntReturn', std_int_return, self.episode_count // self.nbr_actors) + self.sum_writer.add_scalar('Training/StdExtReturn', std_ext_return, self.episode_count // self.nbr_actors) + + self.sum_writer.add_scalar('Training/MeanTotalReturn', mean_total_return, self.episode_count // self.nbr_actors) + self.sum_writer.add_scalar('PerObservation/MeanTotalReturn', mean_total_return, self.obs_count) + self.sum_writer.add_scalar('PerUpdate/MeanTotalReturn', mean_total_return, self.update_count) + self.sum_writer.add_scalar('Training/MeanPositiveTotalReturn', mean_positive_total_return, self.episode_count // self.nbr_actors) + self.sum_writer.add_scalar('PerObservation/MeanPositiveTotalReturn', mean_positive_total_return, self.obs_count) + self.sum_writer.add_scalar('PerUpdate/MeanPositiveTotalReturn', mean_positive_total_return, self.update_count) + self.sum_writer.add_scalar('Training/MeanTotalIntReturn', mean_total_int_return, self.episode_count // self.nbr_actors) + + self.sum_writer.add_scalar('Training/MeanEpisodeLength', mean_episode_length, self.episode_count // self.nbr_actors) + self.sum_writer.add_scalar('PerObservation/MeanEpisodeLength', mean_episode_length, self.obs_count) + self.sum_writer.add_scalar('PerUpdate/MeanEpisodeLength', mean_episode_length, self.update_count) + self.sum_writer.add_scalar('Training/StdEpisodeLength', std_episode_length, self.episode_count // self.nbr_actors) + self.sum_writer.add_scalar('PerObservation/StdEpisodeLength', std_episode_length, self.obs_count) + self.sum_writer.add_scalar('PerUpdate/StdEpisodeLength', std_episode_length, self.update_count) + self.sum_writer.flush() + + # bookkeeping: + outputs_stream_dict["trajectories"] = copy.deepcopy(self.trajectories) + outputs_stream_dict["new_trajectories_published"] = True + self.epoch += 1 + + # reset : + self.trajectories = list() + self.total_returns = list() + self.positive_total_returns = list() + self.total_int_returns = list() + self.episode_lengths = list() + + self.per_actor_per_player_trajectories[actor_index] = [ + list() for p in range(self.nbr_players) + ] + + if self.vdn: + obs = self.nonvdn_observations + act = nonvdn_actions + succ_obs = nonvdn_succ_observations + rew = nonvdn_reward + d = nonvdn_done + info = self.nonvdn_info + else: + obs = self.observations + act = actions + succ_obs = succ_observations + rew = reward + d = done + info = self.info + + for player_index in range(self.nbr_players): + pa_obs = obs[player_index][actor_index:actor_index+1] + pa_a = act[player_index][actor_index:actor_index+1] + pa_r = rew[player_index][actor_index:actor_index+1] + pa_succ_obs = succ_obs[player_index][actor_index:actor_index+1] + pa_done = d[actor_index:actor_index+1] + pa_int_r = 0.0 + + """ + pa_info = _extract_from_rnn_states( + self.info[player_index], + actor_index, + post_process_fn=None + ) + """ + pa_info = info[player_index][actor_index] + + """ + if getattr(agent.algorithm, "use_rnd", False): + get_intrinsic_reward = getattr(agent, "get_intrinsic_reward", None) + if callable(get_intrinsic_reward): + pa_int_r = agent.get_intrinsic_reward(actor_index) + """ + self.per_actor_per_player_trajectories[actor_index][player_index].append( + (pa_obs, 
pa_a, pa_r, pa_int_r, pa_succ_obs, pa_done, pa_info) + ) + + + if self.config['test_nbr_episode'] != 0 \ + and self.obs_count % self.config['test_obs_interval'] == 0: + save_traj = False + if (self.config['benchmarking_record_episode_interval'] is not None \ + and self.config['benchmarking_record_episode_interval']>0): + #save_traj = (self.obs_count%benchmarking_record_episode_interval==0) + save_traj = (self.episode_count_record // self.nbr_actors > self.config['benchmarking_record_episode_interval']) + if save_traj: + self.episode_count_record = 0 + + # TECHNICAL DEBT: clone_agent.get_update_count is failing because the update count param is None + # haven't figured out why is the cloning function making it None... + test_agent( + env=self.test_env, + agents=[agent.clone(training=False) for agent in self.agents], + update_count=self.agents[0].get_update_count(), + nbr_episode=self.config['test_nbr_episode'], + sum_writer=self.sum_writer, + iteration=self.obs_count, + base_path=self.config['base_path'], + save_traj=save_traj, + render_mode=self.config['render_mode'], + save_traj_length_divider=self.config['save_traj_length_divider'], + obs_key=self.obs_key, + succ_obs_key=self.succ_obs_key, + reward_key=self.reward_key, + done_key=self.done_key, + info_key=self.info_key, + succ_info_key=self.succ_info_key, + ) + + if self.obs_count % 10000 == 0: + for agent in self.agents: + if not hasattr(agent, 'save'): continue + agent.save(minimal=True) + print(f"Agent {agent} saved at: {agent.save_path}") + + + outputs_stream_dict[self.obs_key] = copy.deepcopy(self.observations) + outputs_stream_dict[self.info_key] = copy.deepcopy(self.info) + outputs_stream_dict[self.action_key] = actions + outputs_stream_dict[self.reward_key] = reward + outputs_stream_dict[self.done_key] = done + outputs_stream_dict[self.succ_obs_key] = succ_observations + outputs_stream_dict[self.succ_info_key] = succ_info + + if self.vdn: + outputs_stream_dict["observations"] = copy.deepcopy(self.nonvdn_observations) + outputs_stream_dict["info"] = copy.deepcopy(self.nonvdn_info) + outputs_stream_dict["actions"] = nonvdn_actions + outputs_stream_dict["reward"] = nonvdn_reward + outputs_stream_dict["done"] = nonvdn_done + outputs_stream_dict["succ_observations"] = nonvdn_succ_observations + outputs_stream_dict["succ_info"] = nonvdn_succ_info + + + self.observations = copy.deepcopy(succ_observations) + self.info = copy.deepcopy(succ_info) + if self.vdn: + self.nonvdn_observations = copy.deepcopy(nonvdn_succ_observations) + self.nonvdn_info = copy.deepcopy(nonvdn_succ_info) + + + outputs_stream_dict["signals:mode"] = 'train' + outputs_stream_dict["signals:epoch"] = self.epoch + + if self.obs_count >= self.config["max_obs_count"]: + outputs_stream_dict["signals:done_training"] = True + outputs_stream_dict["signals:trained_agents"] = self.agents + + if self.sum_writer is not None: + self.sum_writer.flush() + + self.env.close() + self.test_env.close() + self.init = False + + return outputs_stream_dict + else: + outputs_stream_dict["signals:done_training"] = False + + return copy.deepcopy(outputs_stream_dict) + + + + diff --git a/regym/modules/marl_environment_module.py b/regym/modules/marl_environment_module.py new file mode 100644 index 00000000..bdd09c2e --- /dev/null +++ b/regym/modules/marl_environment_module.py @@ -0,0 +1,506 @@ +from typing import Dict, List + +import os +import math +import copy +import time +from tqdm import tqdm +import numpy as np + +import regym +from tensorboardX import SummaryWriter +from 
regym.util.wrappers import VDNVecEnvWrapper +from regym.util.wrappers import SADVecEnvWrapper + +from regym.rl_algorithms.utils import _extract_from_rnn_states + +from torch.multiprocessing import Process +import ray + +import sys +import gc +import pdb +class ForkedPdb(pdb.Pdb): + """A Pdb subclass that may be used + from a forked multiprocessing child + """ + def interaction(self, *args, **kwargs): + _stdin = sys.stdin + try: + sys.stdin = open('/dev/stdin') + pdb.Pdb.interaction(self, *args, **kwargs) + finally: + sys.stdin = _stdin +#forkedPdb = ForkedPdb() + +from regym.modules.module import Module +from regym.rl_loops.multiagent_loops.marl_loop import test_agent + + +def build_MARLEnvironmentModule( + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None) -> Module: + return MARLEnvironmentModule( + id=id, + config=config, + input_stream_ids=input_stream_ids + ) + + +class MARLEnvironmentModule(Module): + def __init__(self, + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None): + + default_input_stream_ids = { + #"logger":"modules:logger:ref", + #"logs_dict":"logs_dict", + + "iteration":"signals:iteration", + + "current_agents":"modules:current_agents:ref", + "player_0":"modules:rl_agent_0:ref", + } + + if input_stream_ids is None: + input_stream_ids = default_input_stream_ids + else: + for default_id, default_stream in default_input_stream_ids.items(): + if default_id not in input_stream_ids.keys(): + input_stream_ids[default_id] = default_stream + + super(MARLEnvironmentModule, self).__init__( + id=id, + type="MARLEnvironmentModule", + config=config, + input_stream_ids=input_stream_ids + ) + + self.init = False + + self.task = self.config['task'] + self.env = self.task.env + + if self.config.get('sad', False): + self.env = SADVecEnvWrapper(self.env, nbr_actions=self.task.action_dim, otherplay=self.config.get('otherplay', False)) + if self.config.get('vdn', False): + self.env = VDNVecEnvWrapper(self.env, nbr_players=self.config['nbr_players']) + + self.test_env = self.task.test_env + if self.config.get('sad', False): + self.test_env = SADVecEnvWrapper(self.test_env, nbr_actions=self.task.action_dim, otherplay=self.config.get('otherplay', False)) + if self.config.get('vdn', False): + self.test_env = VDNVecEnvWrapper(self.test_env, nbr_players=self.config['nbr_players']) + + # Create placeholders for players: + self.nbr_agents = self.config['nbr_players'] + if self.config.get('vdn', False): + self.nbr_agents = 1 + + for player_idx in range(self.nbr_agents): + setattr(self, f"player_{player_idx}", dict()) + + def initialisation(self, input_streams_dict: Dict[str,object]) -> None: + self.init = True + print("Initialization of MARL Environment Module: ...") + + self.observations = None + self.info = None + + self.agents = input_streams_dict["current_agents"].agents + self.sad = self.config.get('sad', False) + self.vdn = self.config.get('vdn', False) + + + self.obs_key = "observations" + self.info_key = "info" + self.action_key = "actions" + self.reward_key = "reward" + self.done_key = "done" + self.succ_obs_key = "succ_observations" + self.succ_info_key = "succ_info" + if self.vdn: + self.obs_key = "vdn_observations" + self.info_key = "vdn_info" + self.action_key = "vdn_actions" + self.reward_key = "vdn_reward" + self.done_key = "vdn_done" + self.succ_obs_key = "vdn_succ_observations" + self.succ_info_key = "vdn_succ_info" + + self.nbr_actors = self.env.get_nbr_envs() + self.nbr_players = self.config['nbr_players'] + + self.done = 
[False]*self.nbr_actors + + for agent in self.agents: + agent.set_nbr_actor(self.nbr_actors) + + self.per_actor_per_player_trajectories = [ + [ + list() for p in range(self.nbr_players) + ] + for a in range(self.nbr_actors) + ] + self.trajectories = list() + self.total_returns = list() + self.positive_total_returns = list() + self.total_int_returns = list() + self.episode_lengths = list() + + self.obs_count = self.agents[0].get_experience_count() if hasattr(self.agents[0], "get_experience_count") else 0 + self.episode_count = 0 + self.episode_count_record = 0 + self.sample_episode_count = 0 + + if isinstance(self.config['sum_writer'], str): + sum_writer_path = os.path.join(self.config['sum_writer'], 'actor.log') + self.sum_writer = SummaryWriter(sum_writer_path, flush_secs=1) + else: + self.sum_writer = self.config['sum_writer'] + + for agent in self.agents: + agent_algo = getattr(agent, "algorithm", None) + if agent_algo is None: continue + if agent.training: + agent_algo.summary_writer = self.sum_writer + else: + agent_algo.summary_writer = None + + self.epoch = 0 + + self.pbar = tqdm( + total=self.config['max_obs_count'], + position=0, + ) + self.pbar.update(self.obs_count) + + print("Initialization of MARL Environment Module: DONE") + + def compute(self, input_streams_dict:Dict[str,object]) -> Dict[str,object] : + """ + """ + outputs_stream_dict = {} + outputs_stream_dict["new_trajectories_published"] = False + outputs_stream_dict['reset_actors'] = [] + + if not self.init: + self.initialisation(input_streams_dict) + + if self.observations is None: + env_reset_output_dict = self.env.reset(env_configs=self.config.get('env_configs', None)) + self.observations = env_reset_output_dict[self.obs_key] + self.info = env_reset_output_dict[self.info_key] + if self.vdn: + self.nonvdn_observations = env_reset_output_dict["observations"] + self.nonvdn_info = env_reset_output_dict["info"] + + outputs_stream_dict[self.obs_key] = copy.deepcopy(self.observations) + outputs_stream_dict[self.info_key] = copy.deepcopy(self.info) + outputs_stream_dict[self.action_key] = None + outputs_stream_dict[self.reward_key] = None + outputs_stream_dict[self.done_key] = None + outputs_stream_dict[self.succ_obs_key] = None + outputs_stream_dict[self.succ_info_key] = None + + if self.vdn: + outputs_stream_dict["observations"] = copy.deepcopy(self.nonvdn_observations) + outputs_stream_dict["info"] = copy.deepcopy(self.nonvdn_info) + outputs_stream_dict["actions"] = None + outputs_stream_dict["reward"] = None + outputs_stream_dict["done"] = None + outputs_stream_dict["succ_observations"] = None + outputs_stream_dict["succ_info"] = None + + for pidx in range(self.nbr_agents): + pidx_d = getattr(self, f"player_{pidx}") + pidx_d['observations'] = None + pidx_d['infos'] = None + pidx_d['actions'] = None + pidx_d['succ_observations'] = self.observations[pidx] + pidx_d['succ_infos'] = self.info[pidx] + pidx_d['rewards'] = None + pidx_d['dones'] = None + + outputs_stream_dict["signals:mode"] = 'train' + outputs_stream_dict["signals:epoch"] = self.epoch + outputs_stream_dict["signals:done_training"] = False + + return copy.deepcopy(outputs_stream_dict) + + """ + actions = [ + getattr(f'player_{player_idx}').actions + for player_idx in range(self.nbr_agents) + ] + """ + actions = [ + input_streams_dict[f'player_{player_idx}'].actions + for player_idx in range(self.nbr_agents) + ] + + env_output_dict = self.env.step(actions) + succ_observations = env_output_dict[self.succ_obs_key] + reward = env_output_dict[self.reward_key] + done = 
env_output_dict[self.done_key] + succ_info = env_output_dict[self.succ_info_key] + + if self.vdn: + nonvdn_actions = env_output_dict['actions'] + nonvdn_succ_observations = env_output_dict['succ_observations'] + nonvdn_reward = env_output_dict['reward'] + nonvdn_done = env_output_dict['done'] + nonvdn_succ_info = env_output_dict['succ_info'] + + if self.sad and isinstance(actions[0], dict): + actions = [ + a["action"] + for a in actions + ] + + for actor_index in range(self.nbr_actors): + self.obs_count += 1 + self.pbar.update(1) + + for hook in self.config['step_hooks']: + for agent in self.agents: + hook(self.env, agent, self.obs_count) + + # Bookkeeping of the actors whose episode just ended: + done_condition = ('real_done' in succ_info[0][actor_index] \ + and succ_info[0][actor_index]['real_done']) \ + or ('real_done' not in succ_info[0][actor_index] \ + and done[actor_index]) + if done_condition: + self.update_count = self.agents[0].get_update_count() + self.episode_count += 1 + self.episode_count_record += 1 + env_reset_output_dict = self.env.reset(env_configs=self.config.get('env_configs', None), env_indices=[actor_index]) + succ_observations = env_reset_output_dict[self.obs_key] + succ_info = env_reset_output_dict[self.info_key] + if self.vdn: + nonvdn_succ_observations = env_reset_output_dict['observations'] + nonvdn_succ_info = env_reset_output_dict['info'] + + """ + for agent_idx, agent in enumerate(self.agents): + agent.reset_actors(indices=[actor_index]) + """ + outputs_stream_dict['reset_actors'].append(actor_index) + + # Logging: + self.trajectories.append(self.per_actor_per_player_trajectories[actor_index]) + + # Only care about logging player 0: + player_id = 0 + traj = self.trajectories[-1][player_id] + self.total_returns.append(sum([ exp[2] for exp in traj])) + self.positive_total_returns.append(sum([ exp[2] if exp[2]>0 else 0.0 for exp in traj])) + self.total_int_returns.append(sum([ exp[3] for exp in traj])) + self.episode_lengths.append(len(traj)) + + if self.sum_writer is not None: + self.sum_writer.add_scalar('Training/TotalReturn', self.total_returns[-1], self.episode_count) + self.sum_writer.add_scalar('PerObservation/TotalReturn', self.total_returns[-1], self.obs_count) + self.sum_writer.add_scalar('PerUpdate/TotalReturn', self.total_returns[-1], self.update_count) + + self.sum_writer.add_scalar('Training/PositiveTotalReturn', self.positive_total_returns[-1], self.episode_count) + self.sum_writer.add_scalar('PerObservation/PositiveTotalReturn', self.positive_total_returns[-1], self.obs_count) + self.sum_writer.add_scalar('PerUpdate/PositiveTotalReturn', self.positive_total_returns[-1], self.update_count) + + if actor_index == 0: + self.sample_episode_count += 1 + #sum_writer.add_scalar(f'data/reward_{actor_index}', total_returns[-1], sample_episode_count) + #sum_writer.add_scalar(f'PerObservation/Actor_{actor_index}_Reward', total_returns[-1], obs_count) + #sum_writer.add_scalar(f'PerObservation/Actor_{actor_index}_PositiveReward', positive_total_returns[-1], obs_count) + #sum_writer.add_scalar(f'PerUpdate/Actor_{actor_index}_Reward', total_returns[-1], self.update_count) + #sum_writer.add_scalar('Training/TotalIntReturn', total_int_returns[-1], episode_count) + self.sum_writer.flush() + + if len(self.trajectories) >= self.nbr_actors: + mean_total_return = sum( self.total_returns) / len(self.trajectories) + std_ext_return = math.sqrt( sum( [math.pow( r-mean_total_return ,2) for r in self.total_returns]) / len(self.total_returns) ) + mean_positive_total_return = 
sum( self.positive_total_returns) / len(self.trajectories) + std_ext_positive_return = math.sqrt( sum( [math.pow( r-mean_positive_total_return ,2) for r in self.positive_total_returns]) / len(self.positive_total_returns) ) + mean_total_int_return = sum( self.total_int_returns) / len(self.trajectories) + std_int_return = math.sqrt( sum( [math.pow( r-mean_total_int_return ,2) for r in self.total_int_returns]) / len(self.total_int_returns) ) + mean_episode_length = sum( self.episode_lengths) / len(self.trajectories) + std_episode_length = math.sqrt( sum( [math.pow( l-mean_episode_length ,2) for l in self.episode_lengths]) / len(self.episode_lengths) ) + + if self.sum_writer is not None: + self.sum_writer.add_scalar('Training/StdIntReturn', std_int_return, self.episode_count // self.nbr_actors) + self.sum_writer.add_scalar('Training/StdExtReturn', std_ext_return, self.episode_count // self.nbr_actors) + + self.sum_writer.add_scalar('Training/MeanTotalReturn', mean_total_return, self.episode_count // self.nbr_actors) + self.sum_writer.add_scalar('PerObservation/MeanTotalReturn', mean_total_return, self.obs_count) + self.sum_writer.add_scalar('PerUpdate/MeanTotalReturn', mean_total_return, self.update_count) + self.sum_writer.add_scalar('Training/MeanPositiveTotalReturn', mean_positive_total_return, self.episode_count // self.nbr_actors) + self.sum_writer.add_scalar('PerObservation/MeanPositiveTotalReturn', mean_positive_total_return, self.obs_count) + self.sum_writer.add_scalar('PerUpdate/MeanPositiveTotalReturn', mean_positive_total_return, self.update_count) + self.sum_writer.add_scalar('Training/MeanTotalIntReturn', mean_total_int_return, self.episode_count // self.nbr_actors) + + self.sum_writer.add_scalar('Training/MeanEpisodeLength', mean_episode_length, self.episode_count // self.nbr_actors) + self.sum_writer.add_scalar('PerObservation/MeanEpisodeLength', mean_episode_length, self.obs_count) + self.sum_writer.add_scalar('PerUpdate/MeanEpisodeLength', mean_episode_length, self.update_count) + self.sum_writer.add_scalar('Training/StdEpisodeLength', std_episode_length, self.episode_count // self.nbr_actors) + self.sum_writer.add_scalar('PerObservation/StdEpisodeLength', std_episode_length, self.obs_count) + self.sum_writer.add_scalar('PerUpdate/StdEpisodeLength', std_episode_length, self.update_count) + self.sum_writer.flush() + + # bookkeeping: + outputs_stream_dict["trajectories"] = copy.deepcopy(self.trajectories) + outputs_stream_dict["new_trajectories_published"] = True + self.epoch += 1 + + # reset : + self.trajectories = list() + self.total_returns = list() + self.positive_total_returns = list() + self.total_int_returns = list() + self.episode_lengths = list() + + self.per_actor_per_player_trajectories[actor_index] = [ + list() for p in range(self.nbr_players) + ] + + if self.vdn: + obs = self.nonvdn_observations + act = nonvdn_actions + succ_obs = nonvdn_succ_observations + rew = nonvdn_reward + d = nonvdn_done + info = self.nonvdn_info + else: + obs = self.observations + act = actions + succ_obs = succ_observations + rew = reward + d = done + info = self.info + + for player_index in range(self.nbr_players): + pa_obs = obs[player_index][actor_index:actor_index+1] + pa_a = act[player_index][actor_index:actor_index+1] + pa_r = rew[player_index][actor_index:actor_index+1] + pa_succ_obs = succ_obs[player_index][actor_index:actor_index+1] + pa_done = d[actor_index:actor_index+1] + pa_int_r = 0.0 + + """ + pa_info = _extract_from_rnn_states( + self.info[player_index], + actor_index, + 
post_process_fn=None + ) + """ + pa_info = info[player_index][actor_index] + + """ + if getattr(agent.algorithm, "use_rnd", False): + get_intrinsic_reward = getattr(agent, "get_intrinsic_reward", None) + if callable(get_intrinsic_reward): + pa_int_r = agent.get_intrinsic_reward(actor_index) + """ + self.per_actor_per_player_trajectories[actor_index][player_index].append( + (pa_obs, pa_a, pa_r, pa_int_r, pa_succ_obs, pa_done, pa_info) + ) + + + if self.config['test_nbr_episode'] != 0 \ + and self.obs_count % self.config['test_obs_interval'] == 0: + save_traj = False + if (self.config['benchmarking_record_episode_interval'] is not None \ + and self.config['benchmarking_record_episode_interval']>0): + #save_traj = (self.obs_count%benchmarking_record_episode_interval==0) + save_traj = (self.episode_count_record // self.nbr_actors > self.config['benchmarking_record_episode_interval']) + if save_traj: + self.episode_count_record = 0 + + # TECHNICAL DEBT: clone_agent.get_update_count is failing because the update count param is None + # haven't figured out why is the cloning function making it None... + test_agent( + env=self.test_env, + agents=[agent.clone(training=False) for agent in self.agents], + update_count=self.agents[0].get_update_count(), + nbr_episode=self.config['test_nbr_episode'], + sum_writer=self.sum_writer, + iteration=self.obs_count, + base_path=self.config['base_path'], + save_traj=save_traj, + render_mode=self.config['render_mode'], + save_traj_length_divider=self.config['save_traj_length_divider'], + obs_key=self.obs_key, + succ_obs_key=self.succ_obs_key, + reward_key=self.reward_key, + done_key=self.done_key, + info_key=self.info_key, + succ_info_key=self.succ_info_key, + ) + + if self.obs_count % 10000 == 0: + for agent in self.agents: + if not hasattr(agent, 'save'): continue + agent.save(minimal=True) + print(f"Agent {agent} saved at: {agent.save_path}") + + + outputs_stream_dict[self.obs_key] = copy.deepcopy(self.observations) + outputs_stream_dict[self.info_key] = copy.deepcopy(self.info) + outputs_stream_dict[self.action_key] = actions + outputs_stream_dict[self.reward_key] = reward + outputs_stream_dict[self.done_key] = done + outputs_stream_dict[self.succ_obs_key] = succ_observations + outputs_stream_dict[self.succ_info_key] = succ_info + + if self.vdn: + outputs_stream_dict["observations"] = copy.deepcopy(self.nonvdn_observations) + outputs_stream_dict["info"] = copy.deepcopy(self.nonvdn_info) + outputs_stream_dict["actions"] = nonvdn_actions + outputs_stream_dict["reward"] = nonvdn_reward + outputs_stream_dict["done"] = nonvdn_done + outputs_stream_dict["succ_observations"] = nonvdn_succ_observations + outputs_stream_dict["succ_info"] = nonvdn_succ_info + + # Prepare player dicts for RLAgent modules: + for pidx in range(self.nbr_agents): + pidx_d = getattr(self, f"player_{pidx}") + pidx_d['observations'] = self.observations[pidx] + pidx_d['infos'] = self.info[pidx] + pidx_d['actions'] = actions[pidx] + pidx_d['succ_observations'] = succ_observations[pidx] + pidx_d['succ_infos'] = succ_info[pidx] + pidx_d['rewards'] = reward[pidx] + pidx_d['dones'] = done + + self.observations = copy.deepcopy(succ_observations) + self.info = copy.deepcopy(succ_info) + if self.vdn: + self.nonvdn_observations = copy.deepcopy(nonvdn_succ_observations) + self.nonvdn_info = copy.deepcopy(nonvdn_succ_info) + + + outputs_stream_dict["signals:mode"] = 'train' + outputs_stream_dict["signals:epoch"] = self.epoch + + if self.obs_count >= self.config["max_obs_count"]: + 
outputs_stream_dict["signals:done_training"] = True + outputs_stream_dict["signals:trained_agents"] = self.agents + + if self.sum_writer is not None: + self.sum_writer.flush() + + self.env.close() + self.test_env.close() + self.init = False + + return outputs_stream_dict + else: + outputs_stream_dict["signals:done_training"] = False + + return copy.deepcopy(outputs_stream_dict) + + + + diff --git a/regym/modules/message_trajectory_mutual_information_metric_module.py b/regym/modules/message_trajectory_mutual_information_metric_module.py new file mode 100644 index 00000000..2042d363 --- /dev/null +++ b/regym/modules/message_trajectory_mutual_information_metric_module.py @@ -0,0 +1,186 @@ +from typing import Dict, List + +import torch +import torch.nn as nn +import torch.optim as optim + +import numpy as np +import copy + +from .module import Module + +from comaze_gym.metrics import MessageTrajectoryMutualInformationMetric, RuleBasedMessagePolicy + + +def build_MessageTrajectoryMutualInformationMetricModule( + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None + ) -> Module: + return MessageTrajectoryMutualInformationMetricModule( + id=id, + config=config, + input_stream_ids=input_stream_ids + ) + + +class MessageTrajectoryMutualInformationMetricModule(Module): + def __init__( + self, + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None + ): + """ + Computes multi-step CIC metric and maintains a few elements + necessary to the computation, for 2-players alternating (not simultaneous) games. + """ + default_input_stream_ids = { + "logs_dict":"logs_dict", + "losses_dict":"losses_dict", + "epoch":"signals:epoch", + "mode":"signals:mode", + + "vocab_size":"config:vocab_size", + "max_sentence_length":"config:max_sentence_length", + + "trajectories":"modules:environment_module:trajectories", + "filtering_signal":"modules:environment_module:new_trajectories_published", + + "current_agents":"modules:current_agents:ref", + + # "observations":"modules:environment_module:observations", + # "infos":"modules:environment_module:info", + # "actions":"modules:environment_module:actions", + # "dones":"modules:environment_module:done", + } + + if input_stream_ids is None: + input_stream_ids = default_input_stream_ids + else: + for default_id, default_stream in default_input_stream_ids.items(): + if default_id not in input_stream_ids: + input_stream_ids[default_id] = default_stream + + super(MessageTrajectoryMutualInformationMetricModule, self).__init__( + id=id, + type="MessageTrajectoryMutualInformationMetricModule", + config=config, + input_stream_ids=input_stream_ids + ) + + self.biasing = self.config.get('biasing', False) + self.nbr_players = self.config.get('nbr_players', 2) + self.player_id = self.config.get('player_id', 0) + self.metric = self.config.get('metric', None) + + # inputs to the agents at each timestep + self.observations = [] + self.infos = [] + # outputs/actions taken when info and obs were seen: + self.actions = [] + self.dones = [] + + self.iteration = 0 + self.sampling_fraction = 5 + self.sampling_period = 10.0 + + + def compute(self, input_streams_dict:Dict[str,object]) -> Dict[str,object] : + """ + """ + outputs_stream_dict = {} + + if self.metric is None: + self.agents = input_streams_dict["current_agents"].agents + self.metric = MultiStepCIC( + action_policy=RuleBasedMessagePolicy( + wrapped_rule_based_agent=self.agents[self.player_id], + combined_action_space=False, + ), + action_policy_bar=None, #deepcopy... 
+ ) + + + logs_dict = input_streams_dict["logs_dict"] + mode = input_streams_dict["mode"] + epoch = input_streams_dict["epoch"] + + filtering_signal = input_streams_dict["filtering_signal"] + trajectories = input_streams_dict["trajectories"] + compute = True + + self.iteration += 1 + #if (compute and np.random.random() < 1.0/self.sampling_period) or filtering_signal: + if (compute and (self.iteration % self.sampling_period) == 0) or filtering_signal: + if filtering_signal: + self.actions = [ + [ + exp[1] + for exp in traj[self.player_id] + ] + for traj in trajectories + ] + # (batch_size, timestep, (, keys if 'sad'==True) 1) + + # Formatting of kwargs: + # - 'state': observations + # - 'infos': infos + self.x = [ + [ + { + 'state':exp[0], # for _ in range(self.nbr_players)], # see environment_module for indices... + 'infos':[exp[6]], # for _ in range(self.nbr_players)], + 'as_logit':True, + } + for exp in traj[self.player_id] + ] + for traj in trajectories + ] + # (batch_size, timestep, keys:values) + + x = self.x + + else: + if not hasattr(self, 'x'): + return outputs_stream_dict + + #indices = np.random.choice(list(range(len(self.x))), size=len(self.x)//10, replace=False) + if filtering_signal: + indices = list(range(len(self.x))) + else: + indices = np.random.choice(list(range(len(self.x))), size=len(self.x)//self.sampling_fraction, replace=False) + + x = [traj for idx, traj in enumerate(self.x) if idx in indices] + + batch_size = len(x) + T = max([len(traj) for traj in x]) + #mask = torch.ones((batch_size, T)) + mask = torch.zeros((batch_size, T)) + for actor_id in range(batch_size): + for t in range(len(x[actor_id])): + mask[actor_id][t] = (x[actor_id][t]['infos'][0]["current_player"].item()==self.player_id) + + ## Measure: + L_ps, averaged_m_policy_entropy, \ + exp_ent_pi_m_x_it_over_x_it = self.metric.compute_pos_sign_loss( + x=x, + mask=mask, + biasing=self.biasing, + ) + # batch_size + + mutual_info_m_x_it = averaged_m_policy_entropy - exp_ent_pi_m_x_it_over_x_it + # (1 x 1) + + logs_dict[f"{mode}/{self.id}/AverageMessagePolicyEntropy/{'Eval' if filtering_signal else 'Sample'}"] = averaged_m_policy_entropy + logs_dict[f"{mode}/{self.id}/MutualInformationMessageTrajectory/{'Eval' if filtering_signal else 'Sample'}"] = mutual_info_m_x_it + + if self.biasing: + losses_dict = input_streams_dict["losses_dict"] + losses_dict[f"{mode}/{self.id}/PositiveSignallingLoss/{'Eval' if filtering_signal else 'Sample'}"] = [1.0, L_ps] + else: + logs_dict[f"{mode}/{self.id}/PositiveSignallingLoss/{'Eval' if filtering_signal else 'Sample'}"] = L_ps.cpu() + + return outputs_stream_dict + diff --git a/regym/modules/module.py b/regym/modules/module.py new file mode 100644 index 00000000..97dad077 --- /dev/null +++ b/regym/modules/module.py @@ -0,0 +1,41 @@ +from typing import Dict, List + +import torch +import torch.nn as nn + + +class Module(nn.Module): + def __init__(self, + id:str, + type:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]): + super(Module, self).__init__() + self.id = id + self.type = type + self.config = config + self.input_stream_ids = input_stream_ids + + def get_id(self) -> str: + return self.id + + def get_type(self) -> str: + return self.type + + def get_input_stream_ids(self) -> Dict[str,str]: + return self.input_stream_ids + + def compute(self, inputs_dict:Dict[str,object]) -> Dict[str,object] : + """ + Operates on inputs_dict that is made up of referents to the available stream. + Make sure that accesses to its element are non-destructive. 
+ + :param inputs_dict: dict of str and data elements that + follows `self.input_stream_ids`'s keywords + and are extracted from `self.input_stream_keys` + -named streams. + + :returns: + - outputs_sream_dict: + """ + raise NotImplementedError \ No newline at end of file diff --git a/regym/modules/multi_reconstruction_from_hidden_state_module.py b/regym/modules/multi_reconstruction_from_hidden_state_module.py new file mode 100644 index 00000000..876041bf --- /dev/null +++ b/regym/modules/multi_reconstruction_from_hidden_state_module.py @@ -0,0 +1,315 @@ +from typing import Dict, List, Any + +import torch +import torch.nn as nn +import torch.optim as optim + +import numpy as np +import copy + +from .module import Module + +def build_MultiReconstructionFromHiddenStateModule( + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None + ) -> Module: + return MultiReconstructionFromHiddenStateModule( + id=id, + config=config, + input_stream_ids=input_stream_ids + ) + + +class MultiReconstructionFromHiddenStateModule(Module): + def __init__( + self, + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None + ): + """ + "build_signal_to_reconstruct_from_trajectory_fn": + lambda traj, player_id: return List[torch.Tensor] + + "signal_to_reconstruct": "None", + """ + default_input_stream_ids = { + "logs_dict":"logs_dict", + "losses_dict":"losses_dict", + "epoch":"signals:epoch", + "mode":"signals:mode", + + "trajectories":"modules:marl_environment_module:trajectories", + "filtering_signal":"modules:marl_environment_module:new_trajectories_published", + + "current_agents":"modules:current_agents:ref", + } + + if input_stream_ids is None: + input_stream_ids = default_input_stream_ids + else: + for default_id, default_stream in default_input_stream_ids.items(): + if default_id not in input_stream_ids: + input_stream_ids[default_id] = default_stream + + super(MultiReconstructionFromHiddenStateModule, self).__init__( + id=id, + type="MultiReconstructionFromHiddenStateModule", + config=config, + input_stream_ids=input_stream_ids + ) + + self.biasing = self.config.get('biasing', False) + self.nbr_players = self.config.get('nbr_players', 2) + self.player_id = self.config.get('player_id', 0) + + self.iteration = 0 + self.sampling_fraction = 5 + self.sampling_period = 10.0 + + self.hiddenstate_policy = self.config["hiddenstate_policy"] + self.hidden_state_dim = self.hiddenstate_policy.get_hidden_state_dim() + + self.nbr_rec = len(self.config['rec_dicts']) + self.build_signal_to_reconstruct_from_trajectory_fn = {} + self.criterion = {} + self.signal_to_reconstruct_dim = {} + self.prediction_net = nn.ModuleList() + for rec_name, rec_dict in self.config['rec_dicts'].items(): + if "build_signal_to_reconstruct_from_trajectory_fn" in rec_dict: + self.build_signal_to_reconstruct_from_trajectory_fn[rec_name] = rec_dict["build_signal_to_reconstruct_from_trajectory_fn"] + self.criterion[rec_name] = torch.nn.BCEWithLogitsLoss(reduction='none') + self.signal_to_reconstruct_dim[rec_name] = rec_dict["signal_to_reconstruct_dim"] + + prediction_net = [ + nn.Linear(self.hidden_state_dim, 512), + nn.ReLU(), + nn.Linear(512, self.signal_to_reconstruct_dim[rec_name]), + ] + self.prediction_net.append(nn.Sequential(*prediction_net)) + + if self.config['use_cuda']: + self = self.cuda() + + print(self.prediction_net) + + def parameters(self): + return self.prediction_net.parameters() + + def compute_reconstruction_loss( + self, + x:List[List[Any]], + y:Dict[str, List[List[torch.Tensor]]], + 
mask:List[List[Any]]=None, + biasing:bool=False, + ) -> Dict[str,torch.Tensor]: + """ + WARNING: this function resets the :attr hiddenstate_policy:! + Beware of potentially erasing agent's current's internal states + + :param x: + List[List[object]] containing, for each actor, at each time step t an object + representing the observation of the current agent. + e.g.: the object can be a kwargs argument containing + expected argument to the :attr hiddenstate_policy:. + + :param y: + Dict[str, List[List[torch.Tensor]]] where each entry corresponds to a reconstruction entry name, + and the values are containing, for each actor, at each time step t an object + representing the signal to reconstruct. + Shape: signal_to_reconstruct_dim. + + :param mask: + List[List[object]] containing, for each actor, at each time step t an object + with batch_size dimensions and whose values are either + 1 or 0. For all actor b, mask[b]==1 if and only if + the experience in x[t] is valid (e.g. episode not ended). + """ + batch_size = len(x) + self.iteration += 1 + + nbr_actors = self.hiddenstate_policy.get_nbr_actor() + + if biasing: + hiddenstate_policy = self.hiddenstate_policy + self.hiddenstate_policy.save_inner_state() + else: + hiddenstate_policy = self.hiddenstate_policy.clone() + + L_rec = {rec_name:torch.zeros(batch_size) for rec_name in self.config['rec_dicts']} + L_mse = {rec_name:torch.zeros(batch_size) for rec_name in self.config['rec_dicts']} + per_actor_per_t_per_dim_acc = {rec_name:[[] for _ in range(batch_size)] for rec_name in self.config['rec_dicts']} + per_actor_rec_accuracy = {rec_name:torch.zeros(batch_size) for rec_name in self.config['rec_dicts']} + + for actor_id in range(batch_size): + hiddenstate_policy.reset(1) + T = len(x[actor_id]) + if mask is None: + eff_mask = torch.ones((batch_size, T)) + else: + eff_mask = mask + + for t in range(T): + m = eff_mask[actor_id][t] + + if biasing: + hs_t = hiddenstate_policy(x[actor_id][t]) + # 1 x hidden_state_dim + else: + with torch.no_grad(): + hs_t = hiddenstate_policy(x[actor_id][t]).detach() + # 1 x hidden_state_dim + + m = m.to(hs_t.device) + + for rec_idx, rec_name in enumerate(y): + labels = y[rec_name][actor_id][t] + # in range [0,1] + # 1xsignal_to_reconstruct_dim + + logit_pred = self.prediction_net[rec_idx](hs_t.reshape(1,-1)) + + if labels.device != logit_pred.device: labels = labels.to(logit_pred.device) + ### + pred = torch.sigmoid(logit_pred) + # 1x dim + if 'accuracy_pre_process_fn' not in self.config['rec_dicts'][rec_name]: + per_dim_acc_t = (((pred-5e-2<=labels).float()+(pred+5e-2>=labels)).float()>=2).float() + else: + per_dim_acc_t = self.config['rec_dicts'][rec_name]['accuracy_pre_process_fn'](pred=pred, target=labels) + # 1x dim + per_actor_per_t_per_dim_acc[rec_name][actor_id].append(per_dim_acc_t) + ### + + L_rec_t = self.criterion[rec_name]( + input=logit_pred, + target=labels.detach(), + ).mean() + # 1 + L_mse_t = 0.5*torch.pow(pred-labels, 2.0).mean() + # 1 + + if L_rec[rec_name].device != L_rec_t.device: L_rec[rec_name] = L_rec[rec_name].to(L_rec_t.device) + if L_mse[rec_name].device != L_mse_t.device: L_mse[rec_name] = L_mse[rec_name].to(L_mse_t.device) + + L_rec[rec_name][actor_id:actor_id+1] += m*L_rec_t.reshape(-1) + # batch_size + L_mse[rec_name][actor_id:actor_id+1] += m*L_mse_t.reshape(-1) + + for rec_name in self.config['rec_dicts']: + per_actor_per_t_per_dim_acc[rec_name][actor_id] = torch.cat(per_actor_per_t_per_dim_acc[rec_name][actor_id], dim=0) + # timesteps x nbr_goal + 
per_actor_rec_accuracy[rec_name][actor_id] = per_actor_per_t_per_dim_acc[rec_name][actor_id].mean()*100.0 + ### + + if biasing: + self.hiddenstate_policy.reset(nbr_actors, training=True) + self.hiddenstate_policy.restore_inner_state() + + output_dict = { + 'l_rec':L_rec, + 'l_mse':L_mse, + 'per_actor_rec_accuracy':per_actor_rec_accuracy, + } + + return output_dict + + def compute(self, input_streams_dict:Dict[str,object]) -> Dict[str,object] : + """ + """ + outputs_stream_dict = {} + + logs_dict = input_streams_dict["logs_dict"] + mode = input_streams_dict["mode"] + epoch = input_streams_dict["epoch"] + + filtering_signal = input_streams_dict["filtering_signal"] + trajectories = input_streams_dict["trajectories"] + compute = True + + self.iteration += 1 + #if (compute and np.random.random() < 1.0/self.sampling_period) or filtering_signal: + if (compute and (self.iteration % self.sampling_period) == 0) or filtering_signal: + if filtering_signal: + self.actions = [ + [ + exp[1] + for exp in traj[self.player_id] + ] + for traj in trajectories + ] + # (batch_size, timestep, (, keys if 'sad'==True) 1) + + # Formatting of kwargs: + # - 'state': observations + # - 'infos': infos + self.x = [ + [ + { + 'state':exp[0], # for _ in range(self.nbr_players)], # see environment_module for indices... + 'infos':[exp[6]], # for _ in range(self.nbr_players)], + 'as_logit':True, + } + for exp in traj[self.player_id] + ] + for traj in trajectories + ] + # (batch_size, timestep, keys:values) + + self.labels = {} + for rec_idx, rec_name in enumerate(self.config['rec_dicts']): + self.labels[rec_name] = [ + self.build_signal_to_reconstruct_from_trajectory_fn[rec_name]( + traj=traj, + player_id=self.player_id, + ) + for traj in trajectories + ] + + x = self.x + labels = self.labels + else: + if not hasattr(self, 'x'): + return outputs_stream_dict + + if filtering_signal: + indices = list(range(len(self.x))) + else: + indices = np.random.choice(list(range(len(self.x))), size=len(self.x)//self.sampling_fraction, replace=False) + + x = [traj for idx, traj in enumerate(self.x) if idx in indices] + labels_dict = {} + for rec_name in self.labels: + labels_dict[rec_name] = [labels for idx, labels in enumerate(self.labels[rec_name]) if idx in indices] + + mask = None + + ## Measure: + output_dict = self.compute_reconstruction_loss( + x=x, + y=labels_dict, + mask=mask, + biasing=self.biasing, + ) + + L_rec = output_dict['l_rec'] + # dict nbr_rec x batch_size + rec_accuracy = output_dict['per_actor_rec_accuracy'] + # dict nbr_rec x batch_size + + L_mse = output_dict['l_mse'] + + losses_dict = input_streams_dict["losses_dict"] + + for rec_name in self.config['rec_dicts']: + logs_dict[f"{mode}/{self.id}/{rec_name}/ReconstructionAccuracy/{'Eval' if filtering_signal else 'Sample'}"] = rec_accuracy[rec_name].mean() + #logs_dict[f"{mode}/{self.id}/{rec_name}/ReconstructionMSELoss/{'Eval' if filtering_signal else 'Sample'}"] = L_mse[rec_name].mean() + logs_dict[f"{mode}/{self.id}/{rec_name}/ReconstructionLoss/Log/BCE/{'Eval' if filtering_signal else 'Sample'}"] = L_rec[rec_name].mean() + + + #losses_dict[f"{mode}/{self.id}/{rec_name}/ReconstructionLoss/{'Eval' if filtering_signal else 'Sample'}"] = [1.0, L_rec[rec_name]] + losses_dict[f"{mode}/{self.id}/{rec_name}/ReconstructionLoss/MSE/{'Eval' if filtering_signal else 'Sample'}"] = [1.0, L_mse[rec_name]] + + return outputs_stream_dict + diff --git a/regym/modules/multi_step_cic_metric_module.py b/regym/modules/multi_step_cic_metric_module.py new file mode 100644 index 
00000000..16263d49 --- /dev/null +++ b/regym/modules/multi_step_cic_metric_module.py @@ -0,0 +1,238 @@ +from typing import Dict, List + +import torch +import torch.nn as nn +import torch.optim as optim + +import numpy as np +import copy + +from .module import Module + +from comaze_gym.metrics import MultiStepCIC, RuleBasedActionPolicy + + +def build_MultiStepCICMetricModule( + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None + ) -> Module: + return MultiStepCICMetricModule( + id=id, + config=config, + input_stream_ids=input_stream_ids + ) + + +class MultiStepCICMetricModule(Module): + def __init__( + self, + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None + ): + """ + Computes multi-step CIC metric and maintains a few elements + necessary to the computation, for 2-players alternating (not simultaneous) games. + """ + default_input_stream_ids = { + "logs_dict":"logs_dict", + "losses_dict":"losses_dict", + "epoch":"signals:epoch", + "mode":"signals:mode", + + "vocab_size":"config:vocab_size", + "max_sentence_length":"config:max_sentence_length", + + "trajectories":"modules:environment_module:trajectories", + "filtering_signal":"modules:environment_module:new_trajectories_published", + + "current_agents":"modules:current_agents:ref", + + # "observations":"modules:environment_module:observations", + # "infos":"modules:environment_module:info", + # "actions":"modules:environment_module:actions", + # "dones":"modules:environment_module:done", + } + + if input_stream_ids is None: + input_stream_ids = default_input_stream_ids + else: + for default_id, default_stream in default_input_stream_ids.items(): + if default_id not in input_stream_ids: + input_stream_ids[default_id] = default_stream + + super(MultiStepCICMetricModule, self).__init__( + id=id, + type="MultiStepCICMetricModule", + config=config, + input_stream_ids=input_stream_ids + ) + + self.biasing = self.config.get('biasing', False) + self.nbr_players = self.config.get('nbr_players', 2) + self.player_id = self.config.get('player_id', 0) + self.metric = self.config.get('metric', None) + + self.iteration = 0 + self.sampling_fraction = 5 + self.sampling_period = 10.0 + + def message_zeroing_out_fn( + x, + msg_key="communication_channel", + #paths_to_msg=[["infos",pidx] for pidx in range(self.nbr_players)], + paths_to_msg=[["infos",0]], + + ): + xp = copy.deepcopy(x) + for actor_id in range(len(xp)): + for t in range(len(xp[actor_id])): + pointer = xp[actor_id][t] + for path_to_msg in paths_to_msg: + for child_node in path_to_msg: + pointer = pointer[child_node] + + msg = pointer[msg_key] + if isinstance(msg, List): + zeroed_out_msg = [np.zeros_like(item) for item in msg] + else: + zeroed_out_msg = np.zeros_like(msg) + pointer[msg_key] = zeroed_out_msg + + pointer = xp[actor_id][t] + + return xp + + self.message_zeroing_out_fn = self.config.get('message_zeroing_out_fn', message_zeroing_out_fn) + + # inputs to the agents at each timestep + self.observations = [] + self.infos = [] + # outputs/actions taken when info and obs were seen: + self.actions = [] + self.dones = [] + + + + def compute(self, input_streams_dict:Dict[str,object]) -> Dict[str,object] : + """ + """ + outputs_stream_dict = {} + + if self.metric is None: + self.agents = input_streams_dict["current_agents"].agents + self.metric = MultiStepCIC( + action_policy=RuleBasedActionPolicy( + wrapped_rule_based_agent=self.agents[self.player_id], + combined_action_space=False, + ), + action_policy_bar=None, #deepcopy... 
+ ) + + + logs_dict = input_streams_dict["logs_dict"] + mode = input_streams_dict["mode"] + epoch = input_streams_dict["epoch"] + + filtering_signal = input_streams_dict["filtering_signal"] + trajectories = input_streams_dict["trajectories"] + compute = True + + self.iteration += 1 + #if (compute and np.random.random() < 1.0/self.sampling_period) or filtering_signal: + if (compute and (self.iteration % self.sampling_period) == 0) or filtering_signal: + if filtering_signal: + self.actions = [ + [ + exp[1] + for exp in traj[self.player_id] + ] + for traj in trajectories + ] + # (batch_size, timestep, (, keys if 'sad'==True) 1) + + # Formatting of kwargs: + # - 'state': observations + # - 'infos': infos + self.x = [ + [ + { + 'state':exp[0], # for _ in range(self.nbr_players)], # see environment_module for indices... + 'infos':[exp[6]], # for _ in range(self.nbr_players)], + 'as_logit':True, + } + for exp in traj[self.player_id] + ] + for traj in trajectories + ] + # (batch_size, timestep, keys:values) + + self.xp = self.message_zeroing_out_fn(self.x) + self.a = self.actions + + x = self.x + xp = self.xp + a = self.a + + else: + if not hasattr(self, 'xp'): + return outputs_stream_dict + + if filtering_signal: + indices = list(range(len(self.x))) + else: + indices = np.random.choice(list(range(len(self.x))), size=len(self.x)//self.sampling_fraction, replace=False) + + x = [traj for idx, traj in enumerate(self.x) if idx in indices] + xp = [traj for idx, traj in enumerate(self.xp) if idx in indices] + a = [traj for idx, traj in enumerate(self.a) if idx in indices] + + + batch_size = len(x) + T = max([len(traj) for traj in x]) + #mask = torch.ones((batch_size, T)) + mask = torch.zeros((batch_size, T)) + for actor_id in range(batch_size): + for t in range(len(x[actor_id])): + mask[actor_id][t] = (x[actor_id][t]['infos'][0]["current_player"].item()==self.player_id) + + ## Measure: + L_pl = self.metric.compute_pos_lis_loss( + x=x, + #xp=x, #debug + xp=xp, + mask=mask, + biasing=self.biasing, + ) + + ms_cic = self.metric.compute_multi_step_cic( + x=x, + #xp=x, #debug + xp=xp, + mask=mask + ) + + ## Training: + + L_ce, prediction_accuracy = self.metric.train_unconditioned_policy( + x=x, + #xp=x, #debug + xp=xp, + mask=mask, + #a=a, #using actual action is risky given the exploration policy, if on training trajectories... 
+ ) + # batch_size + + logs_dict[f"{mode}/{self.id}/multi_step_CIC/{'Eval' if filtering_signal else 'Sample'}"] = ms_cic.cpu() + logs_dict[f"{mode}/{self.id}/UnconditionedPolicyFitting/CrossEntropyLoss/{'Eval' if filtering_signal else 'Sample'}"] = L_ce.cpu() + logs_dict[f"{mode}/{self.id}/UnconditionedPolicyFitting/PredictionAccuracy/{'Eval' if filtering_signal else 'Sample'}"] = prediction_accuracy.cpu() + + if self.biasing: + losses_dict = input_streams_dict["losses_dict"] + losses_dict[f"{mode}/{self.id}/PositiveListeningLoss/{'Eval' if filtering_signal else 'Sample'}"] = [0.0001, L_pl] + else: + logs_dict[f"{mode}/{self.id}/PositiveListeningLoss/{'Eval' if filtering_signal else 'Sample'}"] = L_pl.cpu() + + return outputs_stream_dict + diff --git a/regym/modules/optimization_module.py b/regym/modules/optimization_module.py new file mode 100644 index 00000000..9c3771a7 --- /dev/null +++ b/regym/modules/optimization_module.py @@ -0,0 +1,171 @@ +from typing import Dict, List + +import os + +import torch +import torch.nn as nn +import torch.optim as optim + +from regym.modules import Module + +def hasnan(tensor): + if torch.isnan(tensor).max().item() == 1: + return True + return False + +def reg_nan(param, verbose=False): + if param is None or param.data is None: return param + nan_indices = torch.isnan(param.data) + if verbose and torch.any(nan_indices).item(): + print("WARNING: NaN found in {}.".format(param)) + param.data[nan_indices] = 0 + if param.grad is None: return param + nan_indices = torch.isnan(param.grad.data) + if verbose and torch.any(nan_indices).item(): + print("WARNING: NaN found in the GRADIENT of {}.".format(param)) + param.grad.data[nan_indices] = 0 + return param + +def handle_nan(layer, verbose=True): + for name, param in layer._parameters.items(): + if param is None or param.data is None: continue + nan_indices = torch.isnan(layer._parameters[name].data) + if verbose and torch.any(nan_indices).item(): + print("WARNING: NaN found in {} of {}.".format(name, layer)) + layer._parameters[name].data[nan_indices] = 0 + if param.grad is None: continue + nan_indices = torch.isnan(layer._parameters[name].grad.data) + if verbose and torch.any(nan_indices).item(): + print("WARNING: NaN found in the GRADIENT of {} of {}.".format(name, layer)) + layer._parameters[name].grad.data[nan_indices] = 0 + +#TODO: +""" +1) Maybe make it possible for this module to ignore some task loss: +--> implement a mask-based policy? +""" + +def build_OptimizationModule(id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None) -> Module: + return OptimizationModule(id=id, + config=config, + input_stream_ids=input_stream_ids) + + +class OptimizationModule(Module): + def __init__(self, + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None): + + if input_stream_ids is None: + input_stream_ids = { + "losses_dict":"losses_dict", + "logs_dict":"logs_dict", + "mode":"signals:mode", + "it_sample":"signals:it_sample", + # step in the sequence of repetitions of the current batch + "it_step":"signals:it_step", + # step in the communication round. + } + + assert "modules" in config,\ + "OptimizationModule relies on list of modules.\n\ + Not found in config." + + assert "optimizer_type" in config,\ + "OptimizationModule relies on 'optimizer_type'.\n\ + Not found in config." + + assert "mode" in input_stream_ids.keys(),\ + "OptimizationModule relies on 'mode' id.\n\ + Not found in input_stream_ids." 
+ + assert "losses_dict" in input_stream_ids.keys(),\ + "OptimizationModule relies on 'losses_dict' id.\n\ + Not found in input_stream_ids." + + assert "logs_dict" in input_stream_ids.keys(),\ + "OptimizationModule relies on 'logs_dict' id.\n\ + Not found in input_stream_ids." + + super(OptimizationModule, self).__init__(id=id, + type="OptimizationModule", + config=config, + input_stream_ids=input_stream_ids) + self.update_count = 0 + parameters = [] + for k,m in self.config["modules"].items(): + parameters += m.parameters() + print(k) + + if len(list(parameters)): + if "sgd" in self.config["optimizer_type"].lower(): + self.optimizer = optim.SGD(parameters, + lr=self.config["learning_rate"]) + else: + self.optimizer = optim.Adam(parameters, + lr=self.config["learning_rate"], + #betas=(0.9, 0.999), + eps=self.config["adam_eps"]) + else: + self.optimizer = None + + def save(self, path): + if self.optimizer is not None: + torch.save(self.optimizer.state_dict(), os.path.join(path, self.id+".module")) + + def load(self, path): + if self.optimizer is not None: + self.optimizer.load_state_dict(torch.load(os.path.join(path, self.id+".module"))) + + def compute(self, input_streams_dict:Dict[str,object]) -> Dict[str,object] : + """ + Operates on inputs_dict that is made up of referents to the available stream. + Make sure that accesses to its element are non-destructive. + + :param input_streams_dict: dict of str and data elements that + follows `self.input_stream_ids`'s keywords and are extracted + from `self.input_stream_keys`-named streams. + + :returns: + - outputs_stream_dict: + """ + outputs_stream_dict = {} + + losses_dict = input_streams_dict["losses_dict"] + logs_dict = input_streams_dict["logs_dict"] + mode = input_streams_dict["mode"] + + it_rep = input_streams_dict["it_sample"] + it_comm_round = input_streams_dict["it_step"] + + for l_name, l in losses_dict.items(): + logs_dict[f"{mode}/{l_name}"] = l[-1] + + for l_name, l in losses_dict.items(): + losses_dict[l_name] = l[0]*l[-1] + + loss = sum([l.mean() for l in losses_dict.values()]) + + if "train" in mode\ + and len(losses_dict)\ + and self.optimizer is not None: + self.optimizer.zero_grad() + loss.backward() + + for k,m in self.config["modules"].items(): + m.apply(handle_nan) + if self.config["with_gradient_clip"]: + nn.utils.clip_grad_value_(m.parameters(), self.config["gradient_clip"]) + + self.optimizer.step() + self.update_count += 1 + + logs_dict[f"{mode}/repetition{it_rep}/comm_round{it_comm_round}/Loss"] = loss + + outputs_stream_dict['signals:update_count'] = self.update_count + + return outputs_stream_dict + diff --git a/regym/modules/per_epoch_logger_module.py b/regym/modules/per_epoch_logger_module.py new file mode 100644 index 00000000..fb994e14 --- /dev/null +++ b/regym/modules/per_epoch_logger_module.py @@ -0,0 +1,187 @@ +from typing import Dict, List + +import torch +import torch.nn as nn +import torch.optim as optim + +import numpy as np + +from .module import Module + +def build_PerEpochLoggerModule(id:str, + config:Dict[str,object]=None, + input_stream_ids:Dict[str,str]=None) -> Module: + return PerEpochLoggerModule(id=id, + config=config, + input_stream_ids=input_stream_ids) + + +class PerEpochLoggerModule(Module): + def __init__(self, + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None): + + if input_stream_ids is None: + input_stream_ids = { + "logger":"modules:logger:ref", + "losses_dict":"losses_dict", + "logs_dict":"logs_dict", + "epoch":"signals:epoch", + 
"update_count":"signals:update_count", + "mode":"signals:mode", + "end_of_dataset":"signals:end_of_dataset", + # boolean: whether the current batch/datasample is the last of the current dataset/mode. + "global_it_datasample":"signals:it_datasample", + "it_datasample":"signals:it_datasample", + "end_of_repetition_sequence":"signals:end_of_repetition_sequence", + # boolean: whether the current sample(observation from the agent of the current batch/datasample) + # is the last of the current sequence of repetition. + "global_it_sample":"signals:global_it_sample", + "it_sample":"signals:it_sample", + # step in the sequence of repetitions of the current batch + "end_of_communication":"signals:end_of_communication", + # boolean: whether the current communication round is the last of + # the current dialog. + "global_it_step":"signals:global_it_step", + "it_step":"signals:it_step", + # step in the communication round. + } + + assert "logger" in input_stream_ids.keys(),\ + "PerEpochLoggerModule relies on 'logger' id.\n\ + Not found in input_stream_ids." + + assert "epoch" in input_stream_ids.keys(),\ + "PerEpochLoggerModule relies on 'epoch' id.\n\ + Not found in input_stream_ids." + + assert "mode" in input_stream_ids.keys(),\ + "PerEpochLoggerModule relies on 'mode' id.\n\ + Not found in input_stream_ids." + + assert "losses_dict" in input_stream_ids.keys(),\ + "PerEpochLoggerModule relies on 'losses_dict' id.\n\ + Not found in input_stream_ids." + + assert "logs_dict" in input_stream_ids.keys(),\ + "PerEpochLoggerModule relies on 'logs_dict' id.\n\ + Not found in input_stream_ids." + + super(PerEpochLoggerModule, self).__init__(id=id, + type="PerEpochLoggerModule", + config=config, + input_stream_ids=input_stream_ids) + + self.storages = {} + + self.epoch = 0 + self.end_of_ = [key for key,value in input_stream_ids.items() if "end_of_" in key] + + def compute(self, input_streams_dict:Dict[str,object]) -> Dict[str,object] : + """ + """ + outputs_stream_dict = {} + + losses_dict = input_streams_dict["losses_dict"] + logs_dict = input_streams_dict["logs_dict"] + + epoch = input_streams_dict["epoch"] + update_count = input_streams_dict["update_count"] + mode = input_streams_dict["mode"] + global_it_step = input_streams_dict["global_it_step"] + + logger = input_streams_dict["logger"] + + # Store new data: + for key,value in logs_dict.items(): + if key not in self.storages: + self.storages[key] = [] + if isinstance(value, torch.Tensor): + value = value.cpu().detach() + self.storages[key].append(value) + + # Is it the end of the epoch? 
+ end_of_epoch = all([ + input_streams_dict[key] + for key in self.end_of_] + ) + + # If so, let us average over every value and save it: + if end_of_epoch or self.epoch != epoch: + self.epoch = epoch + for key, valuelist in self.storages.items(): + need_stats = False + if isinstance(valuelist[0], torch.Tensor):# and len(valuelist[0].shape)>=1: + values = torch.cat([vl.cpu().detach().reshape(-1) for vl in valuelist], dim=0).numpy() + need_stats = True + elif isinstance(valuelist[0], float) or isinstance(valuelist[0], int): + values = np.asarray(valuelist).reshape(-1) + if len(valuelist)>1: + need_stats = True + else: + continue + + if need_stats: + averaged_value = values.mean() + std_value = values.std() + logger.add_scalar(f"PerEpoch/{key}/Mean", averaged_value, epoch) + logger.add_scalar(f"PerEpoch/{key}/Std", std_value, epoch) + + logger.add_scalar(f"PerUpdate/{key}/Mean", averaged_value, update_count) + logger.add_scalar(f"PerUpdate/{key}/Std", std_value, update_count) + + + median_value = np.nanpercentile( + values, + q=50, + axis=None, + interpolation="nearest" + ) + q1_value = np.nanpercentile( + values, + q=25, + axis=None, + interpolation="lower" + ) + q3_value = np.nanpercentile( + values, + q=75, + axis=None, + interpolation="higher" + ) + iqr = q3_value-q1_value + + logger.add_scalar(f"PerEpoch/{key}/Median", median_value, epoch) + logger.add_scalar(f"PerEpoch/{key}/Q1", q1_value, epoch) + logger.add_scalar(f"PerEpoch/{key}/Q3", q3_value, epoch) + logger.add_scalar(f"PerEpoch/{key}/IQR", iqr, epoch) + + logger.add_scalar(f"PerUpdate/{key}/Median", median_value, update_count) + logger.add_scalar(f"PerUpdate/{key}/Q1", q1_value, update_count) + logger.add_scalar(f"PerUpdate/{key}/Q3", q3_value, update_count) + logger.add_scalar(f"PerUpdate/{key}/IQR", iqr, update_count) + + #logger.add_histogram(f"PerEpoch/{key}", values, epoch) + else: + logger.add_scalar(f"PerEpoch/{key}", valuelist[-1], epoch) + logger.add_scalar(f"PerUpdate/{key}", valuelist[-1], update_count) + + # Remove the value form the logs_dict if it is present: + logs_dict.pop(key, None) + + # Reset epoch storages: + self.storages = {} + + # Flush data: + logger.flush() + + + # Log new (rectified) data: + for key,value in logs_dict.items(): + if isinstance(value, torch.Tensor): + value = value.mean().item() + logger.add_scalar(key, value, global_it_step) + + return {} + \ No newline at end of file diff --git a/regym/modules/reconstruction_from_hidden_state_module.py b/regym/modules/reconstruction_from_hidden_state_module.py new file mode 100644 index 00000000..9ac3e262 --- /dev/null +++ b/regym/modules/reconstruction_from_hidden_state_module.py @@ -0,0 +1,310 @@ +from typing import Dict, List, Any + +import torch +import torch.nn as nn +import torch.optim as optim + +import numpy as np +import copy + +from .module import Module + +def build_ReconstructionFromHiddenStateModule( + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None + ) -> Module: + return ReconstructionFromHiddenStateModule( + id=id, + config=config, + input_stream_ids=input_stream_ids + ) + + +class ReconstructionFromHiddenStateModule(Module): + def __init__( + self, + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None + ): + """ + "build_signal_to_reconstruct_from_trajectory_fn": + lambda traj, player_id: return List[torch.Tensor] + + "signal_to_reconstruct": "None", + """ + default_input_stream_ids = { + "logs_dict":"logs_dict", + "losses_dict":"losses_dict", + "epoch":"signals:epoch", + 
"mode":"signals:mode", + + "trajectories":"modules:marl_environment_module:trajectories", + "filtering_signal":"modules:marl_environment_module:new_trajectories_published", + + "current_agents":"modules:current_agents:ref", + } + + if input_stream_ids is None: + input_stream_ids = default_input_stream_ids + else: + for default_id, default_stream in default_input_stream_ids.items(): + if default_id not in input_stream_ids: + input_stream_ids[default_id] = default_stream + + super(ReconstructionFromHiddenStateModule, self).__init__( + id=id, + type="ReconstructionFromHiddenStateModule", + config=config, + input_stream_ids=input_stream_ids + ) + + self.biasing = self.config.get('biasing', False) + self.nbr_players = self.config.get('nbr_players', 2) + self.player_id = self.config.get('player_id', 0) + + self.iteration = 0 + self.sampling_fraction = 5 + self.sampling_period = 10.0 + + if "build_signal_to_reconstruct_from_trajectory_fn" in self.config: + self.build_signal_to_reconstruct_from_trajectory_fn = self.config["build_signal_to_reconstruct_from_trajectory_fn"] + self.criterion = torch.nn.BCEWithLogitsLoss(reduction='none') + self.signal_to_reconstruct_dim = self.config["signal_to_reconstruct_dim"] + self.hiddenstate_policy = self.config["hiddenstate_policy"] + self.hidden_state_dim = self.hiddenstate_policy.get_hidden_state_dim() + self.prediction_net = [ + nn.Linear(self.hidden_state_dim, 512), + nn.ReLU(), + nn.Linear(512, self.signal_to_reconstruct_dim), + ] + self.prediction_net = nn.Sequential(*self.prediction_net) + + if self.config['use_cuda']: + self = self.cuda() + + print(self.prediction_net) + + def parameters(self): + return self.prediction_net.parameters() + + def compute_reconstruction_loss( + self, + x:List[List[Any]], + y:List[List[torch.Tensor]], + mask:List[List[Any]]=None, + biasing:bool=False, + ) -> Dict[str,torch.Tensor]: + """ + WARNING: this function resets the :attr hiddenstate_policy:! + Beware of potentially erasing agent's current's internal states + + :param x: + List[List[object]] containing, for each actor, at each time step t an object + representing the observation of the current agent. + e.g.: the object can be a kwargs argument containing + expected argument to the :attr hiddenstate_policy:. + + :param y: + List[List[torch.Tensor]] containing, for each actor, at each time step t an object + representing the signal to reconstruct. + Shape: signal_to_reconstruct_dim. + + :param mask: + List[List[object]] containing, for each actor, at each time step t an object + with batch_size dimensions and whose values are either + 1 or 0. For all actor b, mask[b]==1 if and only if + the experience in x[t] is valid (e.g. episode not ended). 
+ """ + batch_size = len(x) + self.iteration += 1 + + nbr_actors = self.hiddenstate_policy.get_nbr_actor() + + if biasing: + hiddenstate_policy = self.hiddenstate_policy + self.hiddenstate_policy.save_inner_state() + else: + hiddenstate_policy = self.hiddenstate_policy.clone() + + L_rec = torch.zeros(batch_size) + L_mse = torch.zeros(batch_size) + per_actor_per_t_per_dim_acc = [[] for _ in range(batch_size)] + per_actor_rec_accuracy = torch.zeros(batch_size) + + dataframe_dict = { + 'actor_id': [], + 'timestep': [], + } + + for actor_id in range(batch_size): + hiddenstate_policy.reset(1) + labels_list = y[actor_id] + # in range [0,1] + # 1xsignal_to_reconstruct_dim + + T = len(x[actor_id]) + if mask is None: + eff_mask = torch.ones((batch_size, T)) + else: + eff_mask = mask + + for t in range(T): + m = eff_mask[actor_id][t] + labels = labels_list[t] + if biasing: + hs_t = hiddenstate_policy(x[actor_id][t]) + # 1 x hidden_state_dim + else: + with torch.no_grad(): + hs_t = hiddenstate_policy(x[actor_id][t]).detach() + # 1 x hidden_state_dim + + logit_pred = self.prediction_net(hs_t.reshape(1,-1)) + + m = m.to(hs_t.device) + if labels.device != logit_pred.device: labels = labels.to(logit_pred.device) + ### + pred = torch.sigmoid(logit_pred) + # 1x dim + if 'accuracy_pre_process_fn' not in self.config: + per_dim_acc_t = (((pred-5e-2<=labels).float()+(pred+5e-2>=labels)).float()>=2).float() + else: + per_dim_acc_t = self.config['accuracy_pre_process_fn'](pred=pred, target=labels) + # 1x dim + per_actor_per_t_per_dim_acc[actor_id].append(per_dim_acc_t) + ### + + L_rec_t = self.criterion( + input=logit_pred, + target=labels.detach(), + ).mean() + # 1 + L_mse_t = 0.5*torch.pow(pred-labels, 2.0).mean() + # 1 + + if L_rec.device != L_rec_t.device: L_rec = L_rec.to(L_rec_t.device) + if L_mse.device != L_mse_t.device: L_mse = L_mse.to(L_mse_t.device) + + L_rec[actor_id:actor_id+1] += m*L_rec_t.reshape(-1) + # batch_size + L_mse[actor_id:actor_id+1] += m*L_mse_t.reshape(-1) + + dataframe_dict['actor_id'].append(actor_id) + dataframe_dict['timestep'].append(t) + + per_actor_per_t_per_dim_acc[actor_id] = torch.cat(per_actor_per_t_per_dim_acc[actor_id], dim=0) + # timesteps x nbr_goal + #correct_pred_indices = torch.nonzero((per_actor_per_t_per_dim_acc[actor_id].sum(dim=-1)==self.signal_to_reconstruct_dim).float()) + #per_actor_rec_accuracy[actor_id] = correct_pred_indices.shape[0]/T*100.0 + per_actor_rec_accuracy[actor_id] = per_actor_per_t_per_dim_acc[actor_id].mean()*100.0 + ### + + if biasing: + self.hiddenstate_policy.reset(nbr_actors, training=True) + self.hiddenstate_policy.restore_inner_state() + + output_dict = { + 'l_rec':L_rec, + 'l_mse':L_mse, + 'per_actor_rec_accuracy':per_actor_rec_accuracy, + } + + return output_dict + + def compute(self, input_streams_dict:Dict[str,object]) -> Dict[str,object] : + """ + """ + outputs_stream_dict = {} + + logs_dict = input_streams_dict["logs_dict"] + mode = input_streams_dict["mode"] + epoch = input_streams_dict["epoch"] + + filtering_signal = input_streams_dict["filtering_signal"] + trajectories = input_streams_dict["trajectories"] + compute = True + + self.iteration += 1 + #if (compute and np.random.random() < 1.0/self.sampling_period) or filtering_signal: + if (compute and (self.iteration % self.sampling_period) == 0) or filtering_signal: + if filtering_signal: + self.actions = [ + [ + exp[1] + for exp in traj[self.player_id] + ] + for traj in trajectories + ] + # (batch_size, timestep, (, keys if 'sad'==True) 1) + + # Formatting of kwargs: + # - 
'state': observations + # - 'infos': infos + self.x = [ + [ + { + 'state':exp[0], # for _ in range(self.nbr_players)], # see environment_module for indices... + 'infos':[exp[6]], # for _ in range(self.nbr_players)], + 'as_logit':True, + } + for exp in traj[self.player_id] + ] + for traj in trajectories + ] + # (batch_size, timestep, keys:values) + + if hasattr(self,'build_signal_to_reconstruct_from_trajectory_fn'): + self.labels = [ + self.build_signal_to_reconstruct_from_trajectory_fn( + traj=traj, + player_id=self.player_id, + ) + for traj in trajectories + ] + else: + raise NotImplementedError + + x = self.x + labels = self.labels + else: + if not hasattr(self, 'x'): + return outputs_stream_dict + + if filtering_signal: + indices = list(range(len(self.x))) + else: + indices = np.random.choice(list(range(len(self.x))), size=len(self.x)//self.sampling_fraction, replace=False) + + x = [traj for idx, traj in enumerate(self.x) if idx in indices] + labels = [labels for idx, labels in enumerate(self.labels) if idx in indices] + + mask = None + + ## Measure: + output_dict = self.compute_reconstruction_loss( + x=x, + y=labels, + mask=mask, + biasing=self.biasing, + ) + + L_rec = output_dict['l_rec'] + # batch_size + rec_accuracy = output_dict['per_actor_rec_accuracy'] + # batch_size + + L_mse = output_dict['l_mse'] + + logs_dict[f"{mode}/{self.id}/ReconstructionAccuracy/{'Eval' if filtering_signal else 'Sample'}"] = rec_accuracy.mean() + #logs_dict[f"{mode}/{self.id}/ReconstructionMSELoss/{'Eval' if filtering_signal else 'Sample'}"] = L_mse.mean() + logs_dict[f"{mode}/{self.id}/ReconstructionLoss/Log/BCE/{'Eval' if filtering_signal else 'Sample'}"] = L_rec.mean() + + + losses_dict = input_streams_dict["losses_dict"] + #losses_dict[f"{mode}/{self.id}/ReconstructionLoss/{'Eval' if filtering_signal else 'Sample'}"] = [1.0, L_rec] + losses_dict[f"{mode}/{self.id}/ReconstructionLoss/MSE/{'Eval' if filtering_signal else 'Sample'}"] = [1.0, L_mse] + + return outputs_stream_dict + diff --git a/regym/modules/rl_agent_module.py b/regym/modules/rl_agent_module.py new file mode 100644 index 00000000..47e67f4a --- /dev/null +++ b/regym/modules/rl_agent_module.py @@ -0,0 +1,108 @@ +from typing import Dict, List, Optional + +import copy + +from regym.modules.module import Module + +def build_RLAgentModule( + id:str, + config:Dict[str,object], + input_stream_ids:Optional[Dict[str,str]]=None) -> Module: + return RLAgentModule( + id=id, + config=config, + input_stream_ids=input_stream_ids + ) + + +class RLAgentModule(Module): + def __init__( + self, + id:str, + config=Dict[str,object], + input_stream_ids:Optional[Dict[str,str]]=None + ): + """ + This is a placeholder for an RL agent. 
+ """ + + default_input_stream_ids = { + "logs_dict":"logs_dict", + "losses_dict":"losses_dict", + "epoch":"signals:epoch", + "mode":"signals:mode", + + "reset_actors":"modules:marl_environment_module:reset_actors", + + "observations":"modules:marl_environment_module:ref:player_0:observations", + "infos":"modules:marl_environment_module:ref:player_0:infos", + "actions":"modules:marl_environment_module:ref:player_0:actions", + "succ_observations":"modules:marl_environment_module:ref:player_0:succ_observations", + "succ_infos":"modules:marl_environment_module:ref:player_0:succ_infos", + "rewards":"modules:marl_environment_module:ref:player_0:rewards", + "dones":"modules:marl_environment_module:ref:player_0:dones", + } + + if input_stream_ids is None: + input_stream_ids = default_input_stream_ids + else: + for default_id, default_stream in default_input_stream_ids.items(): + if default_id not in input_stream_ids: + input_stream_ids[default_id] = default_stream + + super(RLAgentModule, self).__init__( + id=id, + type="RLAgentModule", + config=config, + input_stream_ids=input_stream_ids, + ) + + self.agent = self.config["agent"] + + + def set_agent(self, agent): + self.agent = agent + + def parameters(self): + return self.agent.parameters() + + def compute(self, input_streams_dict:Dict[str,object]) -> Dict[str,object] : + """ + """ + outputs_streams_dict = {} + + self.new_observations = input_streams_dict['succ_observations'] + self.new_infos = input_streams_dict['succ_infos'] + + if hasattr(self, 'observations')\ + and self.agent.training: + self.agent.handle_experience( + s=self.observations, + a=self.actions, + r=input_streams_dict['rewards'], + succ_s=self.new_observations, + done=input_streams_dict['dones'], + infos=self.infos, + ) + + # TODO: maybe reset everything if no attr observations: + for actor_index in input_streams_dict['reset_actors']: + self.agent.reset_actors(indices=[actor_index]) + + self.new_actions = self.agent.take_action( + state=self.new_observations, + infos=self.new_infos, + ) \ + if self.agent.training else \ + self.agent.query_action( + state=self.new_observations, + infos=self.new_infos, + ) + + self.observations = copy.deepcopy(self.new_observations) + self.infos = copy.deepcopy(self.new_infos) + self.actions = copy.deepcopy(self.new_actions) + + outputs_streams_dict[self.config['actions_stream_id']] = copy.deepcopy(self.new_actions) + + return outputs_streams_dict diff --git a/regym/modules/supervised_learning_loop_module.py b/regym/modules/supervised_learning_loop_module.py new file mode 100644 index 00000000..99a746bc --- /dev/null +++ b/regym/modules/supervised_learning_loop_module.py @@ -0,0 +1,349 @@ +from typing import Dict, List + +import os +import math +import copy +import time +from tqdm import tqdm +import numpy as np + +import regym +from tensorboardX import SummaryWriter + +from regym.thirdparty.ReferentialGym.datasets import collate_dict_wrapper, PrioritizedSampler +from regym.modules.module import Module + + +def build_SupervisedLearningLoopModule( + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None) -> Module: + return SupervisedLearningLoopModule( + id=id, + config=config, + input_stream_ids=input_stream_ids + ) + + +class SupervisedLearningLoopModule(Module): + def __init__(self, + id:str, + config:Dict[str,object], + input_stream_ids:Dict[str,str]=None): + + default_input_stream_ids = { + "logger":"modules:logger:ref", + "logs_dict":"logs_dict", + "stream_handler":"stream_handler", + + "iteration":"signals:iteration", + } 
+ + if input_stream_ids is None: + input_stream_ids = default_input_stream_ids + else: + for default_id, default_stream in default_input_stream_ids.items(): + if default_id not in input_stream_ids.keys(): + input_stream_ids[default_id] = default_stream + + super(SupervisedLearningLoopModule, self).__init__( + id=id, + type="SupervisedLearningLoopModule", + config=config, + input_stream_ids=input_stream_ids + ) + + # Dataset: + if 'batch_size' not in self.config: + self.config['batch_size'] = 32 + if 'dataloader_num_worker' not in self.config: + self.config['dataloader_num_worker'] = 1 + + print("Create dataloader: ...") + + self.datasets = self.config["datasets"] + self.use_priority = self.config["use_priority"] + self.logger = self.config["logger"] + + self.data_loaders = {} + self.pbss = {} + print("WARNING: 'dataloader_num_worker' hyperparameter has been de-activated.") + for mode, dataset in self.datasets.items(): + if 'train' in mode and self.use_priority: + capacity = len(dataset) + + pbs = PrioritizedSampler( + capacity=capacity, + batch_size=self.config['batch_size'], + logger=self.logger, + ) + self.pbss[mode] = pbs + self.data_loaders[mode] = torch.utils.data.DataLoader( + dataset, + batch_size=self.config['batch_size'], + collate_fn=collate_dict_wrapper, + pin_memory=True, + #num_workers=self.config['dataloader_num_worker'], + sampler=pbs, + ) + else: + self.data_loaders[mode] = torch.utils.data.DataLoader( + dataset, + batch_size=self.config['batch_size'], + shuffle=True, + collate_fn=collate_dict_wrapper, + pin_memory=True, + #num_workers=self.config['dataloader_num_worker'] + ) + + print("Create dataloader: OK.") + + + def compute(self, input_streams_dict:Dict[str,object]) -> Dict[str,object] : + """ + """ + + self.stream_handler = input_streams_dict["stream_handler"] + logger = input_streams_dict["logger"] + nbr_epoch = self.config["nbr_epoch"] + verbose_period = 1 + + print("Launching training: ...") + + it_datasamples = self.stream_handler['signals:it_datasamples'] + if it_datasamples is None: it_datasamples = {mode:0 for mode in self.datasets} # counting the number of data sampled from dataloaders + it_samples = self.stream_handler['signals:it_samples'] + if it_samples is None: it_samples = {mode:0 for mode in self.datasets} # counting the number of multi-round + it_steps = self.stream_handler['signals:it_steps'] + if it_steps is None: it_steps = {mode:0 for mode in self.datasets} # taking into account multi round... counting the number of sample shown to the agents. 
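The it_datasamples/it_samples/it_steps counters are read back from the stream handler and re-published after every update, so a later call to compute() (or a resumed run) picks up consistent global indices. A self-contained sketch of that read-or-initialise-then-republish pattern, using a dictionary-backed stand-in for the real stream handler (DictStreamHandler below is illustrative; only item access and update() are assumed, as used in this loop):

class DictStreamHandler:
    # Stand-in exposing the two operations the loop relies on.
    def __init__(self):
        self._streams = {}

    def __getitem__(self, key):
        return self._streams.get(key, None)

    def update(self, key, value):
        self._streams[key] = value


stream_handler = DictStreamHandler()
modes = ["train", "test"]

# Read previously published counters, or initialise one counter per mode.
it_steps = stream_handler["signals:it_steps"]
if it_steps is None:
    it_steps = {mode: 0 for mode in modes}

it_steps["train"] += 1  # one more sample shown to the agents in train mode
stream_handler.update("signals:it_steps", it_steps)
stream_handler.update("signals:global_it_step", it_steps["train"])
assert stream_handler["signals:global_it_step"] == 1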
+ + init_curriculum_nbr_distractors = self.stream_handler["signals:curriculum_nbr_distractors"] + if init_curriculum_nbr_distractors is None: + init_curriculum_nbr_distractors = 1 + if self.config.get('use_curriculum_nbr_distractors', False): + windowed_accuracy = 0.0 + window_count = 0 + for mode in self.datasets: + self.datasets[mode].setNbrDistractors(init_curriculum_nbr_distractors,mode=mode) + + pbar = tqdm(total=nbr_epoch) + if logger is not None: + self.stream_handler.update("modules:logger:ref", logger) + + self.stream_handler.update("signals:use_cuda", self.config['use_cuda']) + self.stream_handler.update("signals:update_count", 0) + + init_epoch = self.stream_handler["signals:epoch"] + if init_epoch is None: + init_epoch = 0 + else: + pbar.update(init_epoch) + + + outputs_stream_dict = {} + + self.stream_handler.update("signals:done_supervised_learning_training", False) + epoch = init_epoch-1 + + while self.stream_handler["signals:done_supervised_learning_training"]: + #for epoch in range(init_epoch,nbr_epoch): + epoch += 1 + if epoch > nbr_epoch: + break + + self.stream_handler.update("signals:epoch", epoch) + pbar.update(1) + for it_dataset, (mode, data_loader) in enumerate(data_loaders.items()): + self.stream_handler.update("current_dataset:ref", self.datasets[mode]) + self.stream_handler.update("signals:mode", mode) + + end_of_epoch_dataset = (it_dataset==len(data_loaders)-1) + self.stream_handler.update("signals:end_of_epoch_dataset", end_of_epoch_dataset) + + nbr_experience_repetition = 1 + if 'nbr_experience_repetition' in self.config\ + and 'train' in mode: + nbr_experience_repetition = self.config['nbr_experience_repetition'] + + for idx_stimulus, sample in enumerate(data_loader): + end_of_dataset = (idx_stimulus==len(data_loader)-1) + self.stream_handler.update("signals:end_of_dataset", end_of_dataset) + it_datasamples[mode] += 1 + it_datasample = it_datasamples[mode] + self.stream_handler.update("signals:it_datasamples", it_datasamples) + self.stream_handler.update("signals:global_it_datasample", it_datasample) + self.stream_handler.update("signals:it_datasample", idx_stimulus) + it = it_datasample + + + if self.config['use_cuda']: + sample = sample.cuda() + + # //------------------------------------------------------------// + # //------------------------------------------------------------// + # //------------------------------------------------------------// + + for it_rep in range(nbr_experience_repetition): + it_samples[mode] += 1 + it_sample = it_samples[mode] + self.stream_handler.update("signals:it_samples", it_samples) + self.stream_handler.update("signals:global_it_sample", it_sample) + self.stream_handler.update("signals:it_sample", it_rep) + end_of_repetition_sequence = (it_rep==nbr_experience_repetition-1) + self.stream_handler.update("signals:end_of_repetition_sequence", end_of_repetition_sequence) + + # TODO: implement a multi_round_communicatioin module ? 
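Because the communication-round loop that follows yields outputs_stream_dict once per round, calling compute() on this module produces a generator rather than a plain dict; the dict on its final return line is only delivered through StopIteration.value (PEP 380). A minimal sketch of how a pipeline driver could consume such a generator module; the drive and toy_compute names are illustrative, and how the actual PubSubManager iterates generator modules is assumed rather than shown here:

def toy_compute():
    # Stand-in for a generator-style compute(): one intermediate dict per round.
    for step in range(2):
        yield {"signals:it_step": step}
    return {"signals:done_supervised_learning_training": True}


def drive(gen):
    # Pull intermediate output streams until the generator is exhausted; the
    # value passed to `return` inside the generator arrives in StopIteration.value.
    try:
        while True:
            intermediate = next(gen)
            # ... a real driver would serve `intermediate` to downstream pipelines ...
    except StopIteration as stop:
        return stop.value


final = drive(toy_compute())
assert final["signals:done_supervised_learning_training"] is True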
+ for idx_round in range(self.config['nbr_communication_round']): + it_steps[mode] += 1 + it_step = it_steps[mode] + + self.stream_handler.update("signals:it_steps", it_steps) + self.stream_handler.update("signals:global_it_step", it_step) + self.stream_handler.update("signals:it_step", idx_round) + + end_of_communication = (idx_round==self.config['nbr_communication_round']-1) + self.stream_handler.update("signals:end_of_communication", end_of_communication) + + multi_round = True + if end_of_communication: + multi_round = False + self.stream_handler.update("signals:multi_round", multi_round) + + #self.stream_handler.update('current_dataloader:sample', sample) + outputs_stream_dict["current_dataloader:sample"] = sample + + yield outputs_stream_dict + + """ + for pipe_id, pipeline in self.pipelines.items(): + if "referential_game" in pipe_id: + self.stream_handler.serve(pipeline) + """ + # //------------------------------------------------------------// + # //------------------------------------------------------------// + # //------------------------------------------------------------// + + """ + for pipe_id, pipeline in self.pipelines.items(): + if "referential_game" not in pipe_id: + self.stream_handler.serve(pipeline) + """ + + losses = self.stream_handler["losses_dict"] + + if self.use_priority and mode in self.pbss: + batched_loss = sum([l for l in losses.values()]).detach().cpu().numpy() + if len(batched_loss): + self.pbss[mode].update_batch(batched_loss) + + loss = sum( [l.mean() for l in losses.values()]) + logs_dict = self.stream_handler["logs_dict"] + acc_keys = [k for k in logs_dict.keys() if '/referential_game_accuracy' in k] + if len(acc_keys): + acc = logs_dict[acc_keys[-1]].mean() + + if verbose_period is not None and idx_stimulus % verbose_period == 0: + descr = f"Epoch {epoch+1} :: {mode} Iteration {idx_stimulus+1}/{len(data_loader)}" + if isinstance(loss, torch.Tensor): loss = loss.item() + descr += f" (Rep:{it_rep+1}/{nbr_experience_repetition}):: Loss {it+1} = {loss}" + pbar.set_description_str(descr) + + self.stream_handler.reset("losses_dict") + self.stream_handler.reset("logs_dict") + + ''' + if logger is not None: + if self.config['with_utterance_penalization'] or self.config['with_utterance_promotion']: + import ipdb; ipdb.set_trace() + for widx in range(self.config['vocab_size']+1): + logger.add_scalar("{}/Word{}Counts".format(mode,widx), speaker_outputs['speaker_utterances_count'][widx], it_step) + logger.add_scalar("{}/OOVLoss".format(mode), speaker_losses['oov_loss'][-1].mean().item(), it_step) + + if 'with_mdl_principle' in self.config and self.config['with_mdl_principle']: + logger.add_scalar("{}/MDLLoss".format(mode), speaker_losses['mdl_loss'][-1].mean().item(), it_step) + ''' + # //------------------------------------------------------------// + # //------------------------------------------------------------// + # //------------------------------------------------------------// + + # TODO: CURRICULUM ON DISTRATORS as a module that handles the current dataloader reference....!! 
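Stepping back from the loop body: compute() is written as a generator, so it yields the current sample to the rest of the pipeline and is resumed once downstream modules have filled losses_dict and logs_dict on the stream handler, at which point the priority update and logging above run. A minimal sketch of that control flow, with illustrative names rather than the project's API:

def compute_like(samples):
    for sample in samples:
        outputs = {"current_dataloader:sample": sample}
        # Hand control back to the pipeline; downstream modules run now.
        yield outputs
        # Resumed here: losses/logs for `sample` are assumed to be available.

pipeline_driver = compute_like(range(3))
for outputs in pipeline_driver:
    # A real PubSubManager/StreamHandler would serve the other modules here.
    print("served sample:", outputs["current_dataloader:sample"])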
+ if 'use_curriculum_nbr_distractors' in self.config\ + and self.config['use_curriculum_nbr_distractors']: + nbr_distractors = self.datasets[mode].getNbrDistractors(mode=mode) + self.stream_handler.update("signals:curriculum_nbr_distractors", nbr_distractors) + logger.add_scalar( "{}/CurriculumNbrDistractors".format(mode), nbr_distractors, it_step) + logger.add_scalar( "{}/CurriculumWindowedAcc".format(mode), windowed_accuracy, it_step) + + + # TODO: make this a logger module: + """ + if 'current_speaker' in self.modules and 'current_listener' in self.modules: + prototype_speaker = self.stream_handler["modules:current_speaker:ref_agent"] + prototype_listener = self.stream_handler["modules:current_listener:ref_agent"] + image_save_path = logger.path + if prototype_speaker is not None and hasattr(prototype_speaker,'VAE') and idx_stimulus % 4 == 0: + query_vae_latent_space(prototype_speaker.VAE, + sample=sample['speaker_experiences'], + path=image_save_path, + test=('test' in mode), + full=('test' in mode), + idxoffset=it_rep+idx_stimulus*self.config['nbr_experience_repetition'], + suffix='speaker', + use_cuda=True) + + if prototype_listener is not None and hasattr(prototype_listener,'VAE') and idx_stimulus % 4 == 0: + query_vae_latent_space(prototype_listener.VAE, + sample=sample['listener_experiences'], + path=image_save_path, + test=('test' in mode), + full=('test' in mode), + idxoffset=idx_stimulus, + suffix='listener') + """ + # //------------------------------------------------------------// + # //------------------------------------------------------------// + # //------------------------------------------------------------// + + # TODO: many parts everywhere, do not forget them all : CURRICULUM ON DISTRACTORS...!!! + if 'train' in mode\ + and 'use_curriculum_nbr_distractors' in self.config\ + and self.config['use_curriculum_nbr_distractors']: + nbr_distractors = self.datasets[mode].getNbrDistractors(mode=mode) + windowed_accuracy = (windowed_accuracy*window_count+acc.item()) + window_count += 1 + windowed_accuracy /= window_count + if windowed_accuracy > 75 and window_count > self.config['curriculum_distractors_window_size'] and nbr_distractors < self.config['nbr_distractors'][mode]: + windowed_accuracy = 0 + window_count = 0 + for mode in self.datasets: + self.datasets[mode].setNbrDistractors(self.datasets[mode].getNbrDistractors(mode=mode)+1, mode=mode) + # //------------------------------------------------------------// + + if logger is not None: + logger.switch_epoch() + + # //------------------------------------------------------------// + # //------------------------------------------------------------// + # //------------------------------------------------------------// + + """ + if self.save_epoch_interval is not None\ + and epoch % self.save_epoch_interval == 0: + self.save(path=self.save_path) + """ + + # //------------------------------------------------------------// + # //------------------------------------------------------------// + # //------------------------------------------------------------// + + # //------------------------------------------------------------// + # //------------------------------------------------------------// + # //------------------------------------------------------------// + + outputs_stream_dict["signals:done_supervised_learning_training"] = True + + return outputs_stream_dict + + + + diff --git a/regym/pubsub_manager/__init__.py b/regym/pubsub_manager/__init__.py new file mode 100644 index 00000000..9b489da0 --- /dev/null +++ 
b/regym/pubsub_manager/__init__.py @@ -0,0 +1 @@ +from .pubsub_manager import PubSubManager \ No newline at end of file diff --git a/regym/pubsub_manager/pubsub_manager.py b/regym/pubsub_manager/pubsub_manager.py new file mode 100644 index 00000000..be509fd3 --- /dev/null +++ b/regym/pubsub_manager/pubsub_manager.py @@ -0,0 +1,188 @@ +from typing import Dict, List, Tuple +import os +import pickle +import glob + +import torch + +from tensorboardX import SummaryWriter +from tqdm import tqdm + +from regym.thirdparty.ReferentialGym.ReferentialGym.utils import StreamHandler + + +VERBOSE = False + + +class PubSubManager(object): + def __init__(self, + config={}, + modules={}, + pipelines={}, + load_path=None, + save_path=None, + verbose=False, + logger=None, + save_epoch_interval=None): + self.verbose = verbose + self.save_epoch_interval = save_epoch_interval + + self.load_path= load_path + self.save_path = save_path + + self.config = config + if load_path is not None: + self.load_config(load_path) + + self.stream_handler = StreamHandler() + self.stream_handler.register("losses_dict") + self.stream_handler.register("logs_dict") + self.stream_handler.register("signals") + if load_path is not None: + self.load_signals(load_path) + + # Register hyperparameters: + for k,v in self.config.items(): + self.stream_handler.update(f"config:{k}", v) + # Register modules: + self.modules = modules + if load_path is not None: + self.load_modules(load_path) + for k,m in self.modules.items(): + self.stream_handler.update(f"modules:{m.get_id()}:ref", m) + + if logger is not None: + self.stream_handler.update("modules:logger:ref", logger) + + # Register pipelines: + self.pipelines = pipelines + if load_path is not None: + self.load_pipelines(load_path) + + def save(self, path=None): + if path is None: + print("WARNING: no path provided for save. 
Saving in './temp_save/'.") + path = './temp_save/' + + os.makedirs(path, exist_ok=True) + + self.save_config(path) + self.save_modules(path) + self.save_pipelines(path) + self.save_signals(path) + + if self.verbose: + print(f"Saving at {path}: OK.") + + def save_config(self, path): + try: + with open(os.path.join(path, "config.conf"), 'wb') as f: + pickle.dump(self.config, f, protocol=pickle.HIGHEST_PROTOCOL) + except Exception as e: + print(f"Exception caught while trying to save config: {e}") + + def save_modules(self, path): + for module_id, module in self.modules.items(): + #try: + if hasattr(module, "save"): + module.save(path=path) + else: + torch.save(module, os.path.join(path,module_id+".pth")) + #except Exception as e: + # print(f"Exception caught will trying to save module {module_id}: {e}") + + + def save_pipelines(self, path): + try: + with open(os.path.join(path, "pipelines.pipe"), 'wb') as f: + pickle.dump(self.pipelines, f, protocol=pickle.HIGHEST_PROTOCOL) + except Exception as e: + print(f"Exception caught while trying to save pipelines: {e}") + + def save_signals(self, path): + try: + with open(os.path.join(path, "signals.conf"), 'wb') as f: + pickle.dump(self.stream_handler["signals"], f, protocol=pickle.HIGHEST_PROTOCOL) + except Exception as e: + print(f"Exception caught while trying to save signals: {e}") + + def load(self, path): + self.load_config(path) + self.load_modules(path) + self.load_pipelines(path) + self.load_signals(path) + + if self.verbose: + print(f"Loading from {path}: OK.") + + + def load_config(self, path): + try: + with open(os.path.join(path, "config.conf"), 'rb') as f: + self.config = pickle.load(f) + except Exception as e: + print(f"Exception caught while trying to load config: {e}") + + if self.verbose: + print(f"Loading config: OK.") + + def load_modules(self, path): + modules_paths = glob.glob(os.path.join(path, "*.pth")) + + for module_path in modules_paths: + module_id = module_path.split("/")[-1].split(".")[0] + try: + self.modules[module_id] = torch.load(module_path) + except Exception as e: + print(f"Exception caught will trying to load module {module_path}: {e}") + + if self.verbose: + print(f"Loading modules: OK.") + + def load_pipelines(self, path): + try: + with open(os.path.join(path, "pipelines.pipe"), 'rb') as f: + self.pipelines.update(pickle.load(f)) + except Exception as e: + print(f"Exception caught while trying to load pipelines: {e}") + + if self.verbose: + print(f"Loading pipelines: OK.") + + def load_signals(self, path): + try: + with open(os.path.join(path, "signals.conf"), 'rb') as f: + self.stream_handler.update("signals", pickle.load(f)) + except Exception as e: + print(f"Exception caught while trying to load signals: {e}") + + if self.verbose: + print(f"Loading signals: OK.") + + def train(self): + iteration = -1 + + while True: + iteration += 1 + #pbar.update(1) + + self.stream_handler.update("signals:iteration", iteration) + self.stream_handler.update("signals:global_it_step", iteration) + + for pipe_id, pipeline in self.pipelines.items(): + self.stream_handler.serve(pipeline) + + self.stream_handler.reset("losses_dict") + self.stream_handler.reset("logs_dict") + + # TODO: define how to stop the loop? 
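The train() loop above serves every pipeline once per iteration and only stops once some module publishes a stop flag on the shared stream handler, which is what the check just below implements. A toy version of that control flow, with names and structure that are illustrative rather than the project's API:

def toy_train(stream, pipelines, max_iterations=1000):
    for iteration in range(max_iterations):
        stream["signals:iteration"] = iteration
        for pipe_id, modules in pipelines.items():
            for module in modules:
                # Each module reads from and writes back to the shared stream.
                module(stream)
        if stream.get("signals:done_training", False):
            break

# Minimal usage: one module that flags completion after 3 iterations.
stream = {}
def stopper(s):
    if s["signals:iteration"] >= 2:
        s["signals:done_training"] = True

toy_train(stream, {"rl_loop_0": [stopper]})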
+ if self.stream_handler["signals:done_training"]: + break + # TODO: define how to make checkpoints: + """ + if self.save_epoch_interval is not None\ + and epoch % self.save_epoch_interval == 0: + self.save(path=self.save_path) + """ + + return diff --git a/regym/rl_algorithms/agents/__init__.py b/regym/rl_algorithms/agents/__init__.py index 673b914e..4a28e1a1 100755 --- a/regym/rl_algorithms/agents/__init__.py +++ b/regym/rl_algorithms/agents/__init__.py @@ -1,9 +1,11 @@ from .gym_rock_paper_scissors_agent import MixedStrategyAgent from .tabular_q_learning_agent import build_TabularQ_Agent, TabularQLearningAgent from .dqn_agent import build_DQN_Agent, DQNAgent +from .dqn_her_agent import build_DQN_HER_Agent, DQNHERAgent from .r2d2_agent import build_R2D2_Agent, R2D2Agent from .r2d3_agent import build_R2D3_Agent, R2D3Agent from .ther_agent import build_THER_Agent, THERAgent +from .ther2_agent import build_THER2_Agent, THER2Agent from .ppo_agent import build_PPO_Agent, PPOAgent from .reinforce_agent import build_Reinforce_Agent, ReinforceAgent from .a2c_agent import build_A2C_Agent, A2CAgent @@ -12,6 +14,9 @@ from .sac_agent import build_SAC_Agent, SACAgent from .i2a_agent import build_I2A_Agent, I2AAgent from .deterministic_agent import build_Deterministic_Agent, DeterministicAgent +from .random_agent import build_Random_Agent, RandomAgent + +from .utils import generate_model rockAgent = MixedStrategyAgent(support_vector=[1, 0, 0], name='RockAgent') paperAgent = MixedStrategyAgent(support_vector=[0, 1, 0], name='PaperAgent') diff --git a/regym/rl_algorithms/agents/agent.py b/regym/rl_algorithms/agents/agent.py index 36235ac8..94a0aa45 100644 --- a/regym/rl_algorithms/agents/agent.py +++ b/regym/rl_algorithms/agents/agent.py @@ -90,6 +90,10 @@ def __init__(self, name, algorithm): if len(self.rnn_keys): self.recurrent = True """ + + def parameters(self): + return self.algorithm.parameters() + @property def handled_experiences(self): if isinstance(self._handled_experiences, ray.actor.ActorHandle): @@ -110,19 +114,36 @@ def get_experience_count(self): def get_update_count(self): raise NotImplementedError - def set_nbr_actor(self, nbr_actor:int): + def get_nbr_actor(self): + return self.nbr_actor + + def set_nbr_actor(self, nbr_actor:int, vdn:Optional[bool]=None, training:Optional[bool]=None): if nbr_actor != self.nbr_actor: self.nbr_actor = nbr_actor - self.reset_actors(init=True) self.algorithm.set_nbr_actor(nbr_actor=self.nbr_actor) - self.algorithm.reset_storages(nbr_actor=self.nbr_actor) + if training is None: + self.algorithm.reset_storages(nbr_actor=self.nbr_actor) + else: + self.training = training + self.reset_actors(init=True, vdn=vdn) + + def get_rnn_states(self): + return self.rnn_states - def reset_actors(self, indices:Optional[List]=[], init:Optional[bool]=False): + def set_rnn_states(self, rnn_states): + self.rnn_states = rnn_states + + def reset_actors(self, indices:Optional[List]=[], init:Optional[bool]=False, vdn=None): ''' In case of a multi-actor process, this function is called to reset the actors' internal values. ''' - self.current_prediction: Dict[str, Any] = None + # the following is interfering with rl_agent_module + # that operates on a delay with MARLEnvironmentModule + # when it comes to the time prediction is made + # and then the time when an experience is handled. + # TODO: make sure that disabling it is not affecting other behaviours... 
+ #self.current_prediction: Dict[str, Any] = None if init: self.previously_done_actors = [False]*self.nbr_actor @@ -130,7 +151,7 @@ def reset_actors(self, indices:Optional[List]=[], init:Optional[bool]=False): for idx in indices: self.previously_done_actors[idx] = False if self.recurrent: - _, self.rnn_states = self._reset_rnn_states(self.algorithm, self.nbr_actor, actor_indices=indices) + _, self.rnn_states = self._reset_rnn_states(self.algorithm, self.nbr_actor, actor_indices=indices, vdn=vdn) def update_actors(self, batch_idx:int): """ @@ -181,9 +202,10 @@ def remove_from_goals(self, batch_idx:int): self.goals[batch_idx+1:,...]], axis=0) - def _reset_rnn_states(self, algorithm: object, nbr_actor: int, actor_indices: Optional[List[int]]=[]): + def _reset_rnn_states(self, algorithm: object, nbr_actor: int, actor_indices: Optional[List[int]]=[], vdn:Optional[bool]=None): # TODO: account for the indices in rnn states: - if "vdn" in self.algorithm.kwargs \ + if ((vdn is not None and vdn) or (vdn is None))\ + and "vdn" in self.algorithm.kwargs \ and self.algorithm.kwargs["vdn"]: nbr_players = self.algorithm.kwargs["vdn_nbr_players"] nbr_envs = nbr_actor @@ -244,13 +266,13 @@ def remove_from_rnn_states(self, batch_idx:int, rnn_states_dict:Optional[Dict]=N dim=0 ) - def _pre_process_rnn_states(self, rnn_states_dict: Optional[Dict]=None): + def _pre_process_rnn_states(self, rnn_states_dict: Optional[Dict]=None, vdn:Optional[bool]=None): ''' :param map_keys: List of keys we map the operation to. ''' if rnn_states_dict is None: if self.rnn_states is None: - _, self.rnn_states = self._reset_rnn_states(self.algorithm, self.nbr_actor) + _, self.rnn_states = self._reset_rnn_states(self.algorithm, self.nbr_actor, vdn=vdn) rnn_states_dict = self.rnn_states @staticmethod @@ -287,6 +309,42 @@ def _post_process_and_update_rnn_states(next_rnn_states_dict: Dict, rnn_states_d # only post-process: rnn_states_dict[recurrent_submodule_name][key][idx] = rnn_states_dict[recurrent_submodule_name][key][idx].detach().cpu() + @staticmethod + def _keep_grad_update_rnn_states(next_rnn_states_dict: Dict, rnn_states_dict: Dict, map_keys: Optional[List]=None): + ''' + Update the rnn_state to the values of next_rnn_states, when present in both. + Otherwise, simply detach+cpu the values. + + :param next_rnn_states_dict: Dict with a hierarchical structure. + :param rnn_states_dict: Dict with a hierarchical structure, ends up being update when possible. + :param map_keys: List of keys we map the operation to. 
+ ''' + for recurrent_submodule_name in rnn_states_dict: + if not is_leaf(rnn_states_dict[recurrent_submodule_name]): + Agent._keep_grad_update_rnn_states( + next_rnn_states_dict=next_rnn_states_dict[recurrent_submodule_name], + rnn_states_dict=rnn_states_dict[recurrent_submodule_name] + ) + else: + eff_map_keys = map_keys if map_keys is not None else rnn_states_dict[recurrent_submodule_name].keys() + for key in eff_map_keys: + updateable = False + """ + if key in next_rnn_states_dict[recurrent_submodule_name]: + updateable = True + for idx in range(len(next_rnn_states_dict[recurrent_submodule_name][key])): + # Post-process: + next_rnn_states_dict[recurrent_submodule_name][key][idx] = next_rnn_states_dict[recurrent_submodule_name][key][idx].detach().cpu() + """ + if key in rnn_states_dict[recurrent_submodule_name]: + for idx in range(len(rnn_states_dict[recurrent_submodule_name][key])): + if updateable: + # Updating rnn_states: + rnn_states_dict[recurrent_submodule_name][key][idx] = next_rnn_states_dict[recurrent_submodule_name][key][idx]#.detach().cpu() + else: + # only post-process: + rnn_states_dict[recurrent_submodule_name][key][idx] = rnn_states_dict[recurrent_submodule_name][key][idx]#.detach().cpu() + def _post_process(self, prediction: Dict[str, Any]): """ Post-process a prediction by detaching-cpuing the tensors. @@ -363,7 +421,7 @@ def handle_experience(self, s, a, r, succ_s, done, goals=None, infos=None): def train(self): raise NotImplementedError - def take_action(self, state): + def take_action(self, state, as_logit=False): raise NotImplementedError def clone(self, training=None, with_replay_buffer=False, clone_proxies=False, minimal=False): @@ -408,11 +466,12 @@ def __init__(self, name, algorithm, extra_inputs_infos): algorithm=algorithm ) - def _reset_rnn_states(self, algorithm: object, nbr_actor: int, actor_indices: Optional[List[int]]=None): + def _reset_rnn_states(self, algorithm: object, nbr_actor: int, actor_indices: Optional[List[int]]=None, vdn:Optional[bool]=None): self.rnn_keys, self.rnn_states = super()._reset_rnn_states( algorithm=algorithm, nbr_actor=nbr_actor, - actor_indices=actor_indices + actor_indices=actor_indices, + vdn=vdn ) @@ -430,12 +489,15 @@ def _init_hdict(self, init:Optional[Dict]={}): hdict = {} for key in self.extra_inputs_infos: value = init.get(key, torch.cat([self.dummies[key]]*self.nbr_actor, dim=0)) - pointer = hdict - for child_node in self.extra_inputs_infos[key]['target_location']: - if child_node not in pointer: - pointer[child_node] = {} - pointer = pointer[child_node] - pointer[key] = [value] + if not isinstance(self.extra_inputs_infos[key]['target_location'][0], list): + self.extra_inputs_infos[key]['target_location'] = [self.extra_inputs_infos[key]['target_location']] + for tl in self.extra_inputs_infos[key]['target_location']: + pointer = hdict + for child_node in tl: + if child_node not in pointer: + pointer[child_node] = {} + pointer = pointer[child_node] + pointer[key] = [value] return hdict def _build_dict_from(self, lhdict: Dict): @@ -449,17 +511,28 @@ def _build_dict_from(self, lhdict: Dict): return out_hdict - def take_action(self, state, infos=None): + def take_action(self, state, infos=None, as_logit=False): hdict = None if infos:# and not self.training: agent_infos = [info for info in infos if info is not None] hdict = self._build_dict_from(lhdict=agent_infos) recursive_inplace_update(self.rnn_states, hdict) - return self._take_action(state, infos=hdict) + return self._take_action(state, infos=hdict, as_logit=as_logit) + + 
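The rnn_states bookkeeping above, per the docstring of _keep_grad_update_rnn_states, is meant to walk two nested dictionaries with the same layout and overwrite the leaf tensors of one with those of the other without detaching them, so gradients can still flow through the kept states. A minimal sketch of that recursion, using a simplified is_leaf test that is assumed, not taken from the project's helpers:

import torch

def is_leaf(node):
    # Simplified: a leaf is a dict whose values are lists (of tensors).
    return isinstance(node, dict) and all(isinstance(v, list) for v in node.values())

def keep_grad_update(next_states, states):
    for name in states:
        if not is_leaf(states[name]):
            keep_grad_update(next_states[name], states[name])
        else:
            for key in states[name]:
                if key in next_states.get(name, {}):
                    for idx in range(len(states[name][key])):
                        # Keep the graph: no .detach().cpu() on the kept states.
                        states[name][key][idx] = next_states[name][key][idx]

base = torch.ones(1, 4, requires_grad=True)
old = {"lstm": {"hidden": [torch.zeros(1, 4)]}}
new = {"lstm": {"hidden": [base * 2.0]}}
keep_grad_update(new, old)
print(old["lstm"]["hidden"][0].grad_fn)  # non-None: the kept state is still differentiable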
def query_action(self, state, infos=None, as_logit=False): + hdict = None + if infos:# and not self.training: + agent_infos = [info for info in infos if info is not None] + hdict = self._build_dict_from(lhdict=agent_infos) + recursive_inplace_update(self.rnn_states, hdict) + return self._query_action(state, infos=hdict, as_logit=as_logit) def _take_action(self, state, infos=None): raise NotImplementedError + def _query_action(self, state, infos=None): + raise NotImplementedError + def handle_experience(self, s, a, r, succ_s, done, goals=None, infos=None): ''' Wrapper around the actual function now living in _handle_experience. diff --git a/regym/rl_algorithms/agents/dqn_agent.py b/regym/rl_algorithms/agents/dqn_agent.py index 9dadc5fc..dfb38e05 100644 --- a/regym/rl_algorithms/agents/dqn_agent.py +++ b/regym/rl_algorithms/agents/dqn_agent.py @@ -6,12 +6,8 @@ from collections.abc import Iterable from ..algorithms.DQN import DQNAlgorithm, dqn_loss, ddqn_loss -from ..networks import CategoricalQNet -from ..networks import FCBody, FCBody2, LSTMBody, GRUBody, ConvolutionalBody, BetaVAEBody, resnet18Input64 -from ..networks import ConvolutionalGruBody, ConvolutionalLstmBody -from ..networks import LinearLinearBody, LinearLstmBody, LinearLstmBody2 -from ..networks import NoisyLinear from ..networks import PreprocessFunction, ResizeCNNPreprocessFunction, ResizeCNNInterpolationFunction +from regym.rl_algorithms.agents.utils import generate_model import ray import torch @@ -22,7 +18,7 @@ from .agent import Agent from .wrappers import DictHandlingAgentWrapper -from gym.spaces import Dict +from gym.spaces import Dict as gymDict from ..algorithms.wrappers import HERAlgorithmWrapper from regym.rl_algorithms.utils import _extract_from_rnn_states, copy_hdict from regym.rl_algorithms.utils import apply_on_hdict, _concatenate_list_hdict @@ -56,7 +52,7 @@ def __init__(self, name, algorithm): self.previous_save_quotient = -1 def get_update_count(self): - return self.algorithm.get_update_count() + return self.algorithm.unwrapped.get_update_count() def handle_experience(self, s, a, r, succ_s, done, goals=None, infos=None, prediction=None): ''' @@ -250,13 +246,15 @@ def reshape_fn(x): post_process_fn=(lambda x: x.detach().cpu()) ) + """ + # depr : goal update if self.goal_oriented: - raise NotImplementedError exp_dict['goals'] = Agent._extract_from_hdict( goals, batch_index, goal_preprocessing_fn=self.goal_preprocessing ) + """ self.algorithm.store(exp_dict, actor_index=actor_index) self.previously_done_actors[actor_index] = done[actor_index] @@ -281,7 +279,7 @@ def train(self): if self.training \ and self.handled_experiences > self.kwargs['min_capacity'] \ - and self.algorithm.stored_experiences() > self.kwargs['min_capacity'] \ + and self.algorithm.unwrapped.stored_experiences() > self.kwargs['min_capacity'] \ and (period_count_check % period_check == 0 or not(self.async_actor)): minibatch_size = self.kwargs['batch_size'] if self.nbr_episode_per_cycle is None: @@ -294,23 +292,23 @@ def train(self): nbr_updates = self.nbr_training_iteration_per_cycle - if self.algorithm.summary_writer is not None: + if self.algorithm.unwrapped.summary_writer is not None: if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): actor_learner_shared_dict = ray.get(self.actor_learner_shared_dict.get.remote()) else: actor_learner_shared_dict = self.actor_learner_shared_dict.get() nbr_update_remaining = sum(actor_learner_shared_dict["models_update_required"]) - self.algorithm.summary_writer.add_scalar( + 
self.algorithm.unwrapped.summary_writer.add_scalar( f'PerUpdate/ActorLearnerSynchroRemainingUpdates', nbr_update_remaining, - self.algorithm.get_update_count() + self.algorithm.unwrapped.get_update_count() ) # Update actor's models: if self.async_learner\ and (self.handled_experiences // self.actor_models_update_steps_interval) != self.previous_actor_models_update_quotient: self.previous_actor_models_update_quotient = self.handled_experiences // self.actor_models_update_steps_interval - new_models_cpu = {k:deepcopy(m).cpu() for k,m in self.algorithm.get_models().items()} + new_models_cpu = {k:deepcopy(m).cpu() for k,m in self.algorithm.unwrapped.get_models().items()} if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): actor_learner_shared_dict = ray.get(self.actor_learner_shared_dict.get.remote()) @@ -327,13 +325,13 @@ def train(self): if self.async_learner\ and self.save_path is not None \ - and (self.algorithm.get_update_count() // self.saving_interval) != self.previous_save_quotient: - self.previous_save_quotient = self.algorithm.get_update_count() // self.saving_interval + and (self.algorithm.unwrapped.get_update_count() // self.saving_interval) != self.previous_save_quotient: + self.previous_save_quotient = self.algorithm.unwrapped.get_update_count() // self.saving_interval self.save() return nbr_updates - def take_action(self, state, infos=None): + def take_action(self, state, infos=None, as_logit=False): if self.async_actor: # Update the algorithm's model if needs be: if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): @@ -350,13 +348,13 @@ def take_action(self, state, infos=None): if "models" in actor_learner_shared_dict.keys(): new_models = actor_learner_shared_dict["models"] - self.algorithm.set_models(new_models) + self.algorithm.unwrapped.set_models(new_models) else: raise NotImplementedError if self.training: self.nbr_steps += state.shape[0] - self.eps = self.algorithm.get_epsilon(nbr_steps=self.nbr_steps, strategy=self.epsdecay_strategy) + self.eps = self.algorithm.unwrapped.get_epsilon(nbr_steps=self.nbr_steps, strategy=self.epsdecay_strategy) if "vdn" in self.kwargs \ and self.kwargs["vdn"]: # The following will not make same values contiguous: @@ -365,18 +363,28 @@ def take_action(self, state, infos=None): self.eps = np.stack([self.eps]*self.kwargs["vdn_nbr_players"], axis=-1).reshape(-1) - state = self.state_preprocessing(state, use_cuda=self.algorithm.kwargs['use_cuda']) + state = self.state_preprocessing(state, use_cuda=self.algorithm.unwrapped.kwargs['use_cuda']) + + """ + # depr : goal update goal = None if self.goal_oriented: - goal = self.goal_preprocessing(self.goals, use_cuda=self.algorithm.kwargs['use_cuda']) + goal = self.goal_preprocessing(self.goals, use_cuda=self.algorithm.unwrapped.kwargs['use_cuda']) + """ - model = self.algorithm.get_models()['model'] + model = self.algorithm.unwrapped.get_models()['model'] if 'use_target_to_gather_data' in self.kwargs and self.kwargs['use_target_to_gather_data']: - model = self.algorithm.get_models()['target_model'] + model = self.algorithm.unwrapped.get_models()['target_model'] model = model.train(mode=self.training) - self.current_prediction = self.query_model(model, state, goal) + # depr : goal update + #self.current_prediction = self.query_model(model, state, goal) + self.current_prediction = self.query_model(model, state) + + if as_logit: + return self.current_prediction['log_a'] + # Post-process and update the rnn_states from the current prediction: # self.rnn_states <-- 
self.current_prediction['next_rnn_states'] # WARNING: _post_process affects self.rnn_states. It is imperative to @@ -385,6 +393,7 @@ def take_action(self, state, infos=None): self.current_prediction = self._post_process(self.current_prediction) greedy_action = self.current_prediction['a'].reshape((-1,1)).numpy() + if self.noisy or not(self.training): return greedy_action @@ -421,7 +430,114 @@ def take_action(self, state, infos=None): return actions - def query_model(self, model, state, goal): + def query_action(self, state, infos=None, as_logit=False): + """ + Query's the model in training mode... + """ + if self.async_actor: + # Update the algorithm's model if needs be: + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + actor_learner_shared_dict = ray.get(self.actor_learner_shared_dict.get.remote()) + else: + actor_learner_shared_dict = self.actor_learner_shared_dict.get() + if actor_learner_shared_dict["models_update_required"][self.async_actor_idx]: + actor_learner_shared_dict["models_update_required"][self.async_actor_idx] = False + + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + self.actor_learner_shared_dict.set.remote(actor_learner_shared_dict) + else: + self.actor_learner_shared_dict.set(actor_learner_shared_dict) + + if "models" in actor_learner_shared_dict.keys(): + new_models = actor_learner_shared_dict["models"] + self.algorithm.unwrapped.set_models(new_models) + else: + raise NotImplementedError + + self.eps = self.algorithm.unwrapped.get_epsilon(nbr_steps=self.nbr_steps, strategy=self.epsdecay_strategy) + if "vdn" in self.kwargs \ + and self.kwargs["vdn"]: + # The following will not make same values contiguous: + #self.eps = np.concatenate([self.eps]*self.kwargs["vdn_nbr_players"], axis=0) + # whereas the following will, and thus players in the same environment will explore similarly: + self.eps = np.stack([self.eps]*self.kwargs["vdn_nbr_players"], axis=-1).reshape(-1) + + + state = self.state_preprocessing(state, use_cuda=self.algorithm.unwrapped.kwargs['use_cuda']) + + """ + # depr : goal update + goal = None + if self.goal_oriented: + goal = self.goal_preprocessing(self.goals, use_cuda=self.algorithm.unwrapped.kwargs['use_cuda']) + """ + + model = self.algorithm.unwrapped.get_models()['model'] + if 'use_target_to_gather_data' in self.kwargs and self.kwargs['use_target_to_gather_data']: + model = self.algorithm.unwrapped.get_models()['target_model'] + if not(model.training): model = model.train(mode=True) + + # depr : goal update + #current_prediction = self.query_model(model, state, goal) + current_prediction = self.query_model(model, state) + + # 1) Post-process and update the rnn_states from the current prediction: + # self.rnn_states <-- self.current_prediction['next_rnn_states'] + # WARNING: _post_process affects self.rnn_states. It is imperative to + # manipulate a copy of it outside of the agent's manipulation, e.g. + # when feeding it to the models. + # 2) All the elements from the prediction dictionnary are being detached+cpued from the graph. 
+ # Thus, here, we only want to update the rnn state: + + if as_logit: + self._keep_grad_update_rnn_states( + next_rnn_states_dict=current_prediction['next_rnn_states'], + rnn_states_dict=self.rnn_states + ) + return current_prediction + #return current_prediction['log_a'] + else: + current_prediction = self._post_process(current_prediction) + + greedy_action = current_prediction['a'].reshape((-1,1)).numpy() + + if self.noisy: + return greedy_action + + legal_actions = torch.ones_like(current_prediction['qa']) + if infos is not None\ + and 'head' in infos\ + and 'extra_inputs' in infos['head']\ + and 'legal_actions' in infos['head']['extra_inputs']: + legal_actions = infos['head']['extra_inputs']['legal_actions'][0] + # in case there are no legal actions for this agent in this current turn: + for actor_idx in range(legal_actions.shape[0]): + if legal_actions[actor_idx].sum() == 0: + legal_actions[actor_idx, ...] = 1 + sample = np.random.random(size=self.eps.shape) + greedy = (sample > self.eps) + greedy = np.reshape(greedy[:state.shape[0]], (state.shape[0],1)) + + #random_actions = [random.randrange(model.action_dim) for _ in range(state.shape[0])] + random_actions = [ + legal_actions[actor_idx].multinomial(num_samples=1).item() + for actor_idx in range(legal_actions.shape[0]) + ] + random_actions = np.reshape(np.array(random_actions), (state.shape[0],1)) + + actions = greedy*greedy_action + (1-greedy)*random_actions + + if "sad" in self.kwargs \ + and self.kwargs["sad"]: + action_dict = { + 'action': actions, + 'greedy_action': greedy_action, + } + return action_dict + + return actions + + def query_model(self, model, state, goal=None): if self.recurrent: self._pre_process_rnn_states() # WARNING: it is imperative to make a copy @@ -444,7 +560,8 @@ def clone(self, training=None, with_replay_buffer=False, clone_proxies=False, mi minimal=minimal ) clone = DQNAgent(name=self.name, algorithm=cloned_algo) - + clone.save_path = self.save_path + clone.actor_learner_shared_dict = self.actor_learner_shared_dict clone._handled_experiences = self._handled_experiences clone.episode_count = self.episode_count @@ -506,454 +623,6 @@ def get_async_actor(self, training=None, with_replay_buffer=False): return clone -def generate_model(task: 'regym.environments.Task', kwargs: Dict) -> nn.Module: - phi_body = None - input_dim = list(task.observation_shape) - if 'goal_oriented' in kwargs and kwargs['goal_oriented']: - goal_input_shape = list(task.goal_shape) - if 'goal_state_flattening' in kwargs and kwargs['goal_state_flattening']: - if isinstance(input_dim, int): - input_dim = input_dim+goal_input_shape - else: - input_dim[-1] = input_dim[-1]+goal_input_shape[-1] - - if kwargs['phi_arch'] != 'None': - output_dim = kwargs['phi_arch_feature_dim'] - if kwargs['phi_arch'] == 'LSTM-RNN': - phi_body = LSTMBody(input_dim, hidden_units=(output_dim,), gate=F.leaky_relu) - elif kwargs['phi_arch'] == 'GRU-RNN': - phi_body = GRUBody(input_dim, hidden_units=(output_dim,), gate=F.leaky_relu) - elif kwargs['phi_arch'] == 'MLP': - hidden_units=kwargs['phi_arch_hidden_units'] - hidden_units += [output_dim] - - extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) - extra_inputs_infos_phi_body = {} - if extra_inputs_infos != {}: - for key in extra_inputs_infos: - shape = extra_inputs_infos[key]['shape'] - tl = extra_inputs_infos[key]['target_location'] - if 'phi_body' in tl: - extra_inputs_infos_phi_body[key] = { - 'shape':shape, - 'target_location':tl - } - if extra_inputs_infos_phi_body == {}: - phi_body = FCBody( - 
input_dim, - hidden_units=hidden_units, - ) - else: - phi_body = FCBody2( - input_dim, - hidden_units=hidden_units, - extra_inputs_infos=extra_inputs_infos_phi_body - ) - - elif kwargs['phi_arch'] == 'CNN': - # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: - if isinstance(kwargs['observation_resize_dim'], int): - input_height, input_width = kwargs['observation_resize_dim'], kwargs['observation_resize_dim'] - else: - input_height, input_width = kwargs['observation_resize_dim'] - - kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=input_height, normalize_rgb_values=True) - kwargs['preprocessed_observation_shape'] = [input_dim[-1], input_height, input_width] - if 'nbr_frame_stacking' in kwargs: - kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] - input_shape = kwargs['preprocessed_observation_shape'] - channels = [input_shape[0]] + kwargs['phi_arch_channels'] - kernels = kwargs['phi_arch_kernels'] - strides = kwargs['phi_arch_strides'] - paddings = kwargs['phi_arch_paddings'] - output_dim = kwargs['phi_arch_feature_dim'] - phi_body = ConvolutionalBody(input_shape=input_shape, - feature_dim=output_dim, - channels=channels, - kernel_sizes=kernels, - strides=strides, - paddings=paddings) - elif kwargs['phi_arch'] == 'ResNet18': - # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: - #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) - kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['observation_resize_dim'], normalize_rgb_values=True) - kwargs['preprocessed_observation_shape'] = [input_dim[-1], kwargs['observation_resize_dim'], kwargs['observation_resize_dim']] - if 'nbr_frame_stacking' in kwargs: - kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] - input_shape = kwargs['preprocessed_observation_shape'] - output_dim = kwargs['phi_arch_feature_dim'] - phi_body = resnet18Input64(input_shape=input_shape, output_dim=output_dim) - elif kwargs['phi_arch'] == 'CNN-GRU-RNN': - # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: - #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) - kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['observation_resize_dim'], normalize_rgb_values=True) - kwargs['preprocessed_observation_shape'] = [input_dim[-1], kwargs['observation_resize_dim'], kwargs['observation_resize_dim']] - if 'nbr_frame_stacking' in kwargs: - kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] - input_shape = kwargs['preprocessed_observation_shape'] - channels = [input_shape[0]] + kwargs['phi_arch_channels'] - kernels = kwargs['phi_arch_kernels'] - strides = kwargs['phi_arch_strides'] - paddings = kwargs['phi_arch_paddings'] - output_dim = kwargs['phi_arch_hidden_units'][-1] - phi_body = ConvolutionalGruBody(input_shape=input_shape, - feature_dim=output_dim, - channels=channels, - kernel_sizes=kernels, - strides=strides, - paddings=paddings, - hidden_units=kwargs['phi_arch_hidden_units']) - elif kwargs['phi_arch'] == 'CNN-LSTM-RNN': - # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: - #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) - 
kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['observation_resize_dim'], normalize_rgb_values=True) - kwargs['preprocessed_observation_shape'] = [input_dim[-1], kwargs['observation_resize_dim'], kwargs['observation_resize_dim']] - if 'nbr_frame_stacking' in kwargs: - kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] - input_shape = kwargs['preprocessed_observation_shape'] - channels = [input_shape[0]] + kwargs['phi_arch_channels'] - kernels = kwargs['phi_arch_kernels'] - strides = kwargs['phi_arch_strides'] - paddings = kwargs['phi_arch_paddings'] - output_dim = kwargs['phi_arch_feature_dim'] # TODO: figure out if this breaks anything else - - # Selecting Extra Inputs Infos relevant to phi_body: - extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) - extra_inputs_infos_phi_body = {} - if extra_inputs_infos != {}: - for key in extra_inputs_infos: - shape = extra_inputs_infos[key]['shape'] - tl = extra_inputs_infos[key]['target_location'] - if 'phi_body' in tl: - extra_inputs_infos_phi_body[key] = { - 'shape':shape, - 'target_location':tl - } - - phi_body = ConvolutionalLstmBody(input_shape=input_shape, - feature_dim=output_dim, - channels=channels, - kernel_sizes=kernels, - strides=strides, - paddings=paddings, - extra_inputs_infos=extra_inputs_infos_phi_body, - hidden_units=kwargs['phi_arch_hidden_units']) - input_dim = output_dim - - - goal_phi_body = None - if 'goal_oriented' in kwargs and kwargs['goal_oriented']: - goal_input_shape = task.goal_shape - if 'goal_state_flattening' in kwargs and kwargs['goal_state_flattening']: - kwargs['goal_preprocess'] = kwargs['state_preprocess'] - - if 'goal_state_shared_arch' in kwargs and kwargs['goal_state_shared_arch']: - kwargs['goal_preprocess'] = kwargs['state_preprocess'] - if 'preprocessed_observation_shape' in kwargs: - kwargs['preprocessed_goal_shape'] = kwargs['preprocessed_observation_shape'] - goal_input_shape = kwargs['preprocessed_goal_shape'] - goal_phi_body = None - - elif kwargs['goal_phi_arch'] != 'None': - output_dim = 256 - if kwargs['goal_phi_arch'] == 'LSTM-RNN': - phi_body = LSTMBody(goal_input_shape, hidden_units=(output_dim,), gate=F.leaky_relu) - elif kwargs['goal_phi_arch'] == 'GRU-RNN': - phi_body = GRUBody(goal_input_shape, hidden_units=(output_dim,), gate=F.leaky_relu) - elif kwargs['goal_phi_arch'] == 'MLP': - phi_body = FCBody(goal_input_shape, hidden_units=(output_dim, ), gate=F.leaky_relu) - elif kwargs['goal_phi_arch'] == 'CNN': - # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: - kwargs['goal_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['goal_resize_dim'], normalize_rgb_values=True) - kwargs['preprocessed_goal_shape'] = [task.goal_shape[-1], kwargs['goal_resize_dim'], kwargs['goal_resize_dim']] - if 'nbr_frame_stacking' in kwargs: - kwargs['preprocessed_goal_shape'][0] *= kwargs['nbr_frame_stacking'] - input_shape = kwargs['preprocessed_goal_shape'] - channels = [goal_shape[0]] + kwargs['goal_phi_arch_channels'] - kernels = kwargs['goal_phi_arch_kernels'] - strides = kwargs['goal_phi_arch_strides'] - paddings = kwargs['goal_phi_arch_paddings'] - output_dim = kwargs['goal_phi_arch_feature_dim'] - phi_body = ConvolutionalBody(input_shape=input_shape, - feature_dim=output_dim, - channels=channels, - kernel_sizes=kernels, - strides=strides, - paddings=paddings) - elif kwargs['goal_phi_arch'] == 'ResNet18': - # Assuming raw pixels input, the shape is dependant on the 
observation_resize_dim specified by the user: - kwargs['goal_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['goal_resize_dim'], normalize_rgb_values=True) - kwargs['preprocessed_goal_shape'] = [task.goal_shape[-1], kwargs['goal_resize_dim'], kwargs['goal_resize_dim']] - if 'nbr_frame_stacking' in kwargs: - kwargs['preprocessed_goal_shape'][0] *= kwargs['nbr_frame_stacking'] - input_shape = kwargs['preprocessed_goal_shape'] - output_dim = kwargs['goal_phi_arch_feature_dim'] - phi_body = resnet18Input64(input_shape=input_shape, output_dim=output_dim) - elif kwargs['goal_phi_arch'] == 'CNN-GRU-RNN': - # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: - kwargs['goal_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['goal_resize_dim'], normalize_rgb_values=True) - kwargs['preprocessed_goal_shape'] = [task.goal_shape[-1], kwargs['goal_resize_dim'], kwargs['goal_resize_dim']] - if 'nbr_frame_stacking' in kwargs: - kwargs['preprocessed_goal_shape'][0] *= kwargs['nbr_frame_stacking'] - input_shape = kwargs['preprocessed_goal_shape'] - channels = [input_shape[0]] + kwargs['goal_phi_arch_channels'] - kernels = kwargs['goal_phi_arch_kernels'] - strides = kwargs['goal_phi_arch_strides'] - paddings = kwargs['goal_phi_arch_paddings'] - output_dim = kwargs['goal_phi_arch_hidden_units'][-1] - goal_phi_body = ConvolutionalGruBody(input_shape=input_shape, - feature_dim=output_dim, - channels=channels, - kernel_sizes=kernels, - strides=strides, - paddings=paddings, - hidden_units=kwargs['phi_arch_hidden_units']) - input_dim += output_dim - - - critic_body = None - layer_fn = nn.Linear - if kwargs['noisy']: layer_fn = NoisyLinear - if kwargs['critic_arch'] != 'None': - output_dim = 256 - if kwargs['critic_arch'] == 'LSTM-RNN': - #critic_body = LSTMBody(input_dim, hidden_units=(output_dim,), gate=F.leaky_relu) - state_dim = input_dim - critic_arch_hidden_units = kwargs['critic_arch_hidden_units'] - - # Selecting Extra Inputs Infos relevant to phi_body: - extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) - extra_inputs_infos_critic_body = {} - if extra_inputs_infos != {}: - for key in extra_inputs_infos: - shape = extra_inputs_infos[key]['shape'] - tl = extra_inputs_infos[key]['target_location'] - if 'critic_body' in tl: - extra_inputs_infos_critic_body[key] = { - 'shape':shape, - 'target_location':tl - } - - gate = None - if 'use_relu_after_rnn' in kwargs \ - and kwargs['use_relu_after_rnn']: - import ipdb; ipdb.set_trace() - gate = F.relu - - critic_body = LSTMBody( - state_dim=state_dim, - hidden_units=critic_arch_hidden_units, - gate=gate, - extra_inputs_infos=extra_inputs_infos_critic_body, - ) - elif kwargs['critic_arch'] == 'GRU-RNN': - state_dim = input_dim - critic_arch_hidden_units = kwargs['critic_arch_hidden_units'] - - # Selecting Extra Inputs Infos relevant to phi_body: - extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) - extra_inputs_infos_critic_body = {} - if extra_inputs_infos != {}: - for key in extra_inputs_infos: - shape = extra_inputs_infos[key]['shape'] - tl = extra_inputs_infos[key]['target_location'] - if 'critic_body' in tl: - extra_inputs_infos_critic_body[key] = { - 'shape':shape, - 'target_location':tl - } - - gate = None - if 'use_relu_after_rnn' in kwargs \ - and kwargs['use_relu_after_rnn']: - import ipdb; ipdb.set_trace() - gate = F.relu - - critic_body = GRUBody( - state_dim=state_dim, - hidden_units=critic_arch_hidden_units, - gate=gate, - 
extra_inputs_infos=extra_inputs_infos_critic_body, - ) - elif kwargs['critic_arch'] == 'MLP': - hidden_units=(output_dim,) - if 'critic_arch_hidden_units' in kwargs: - hidden_units = list(kwargs['critic_arch_hidden_units']) - critic_body = FCBody(input_dim, hidden_units=hidden_units, gate=F.leaky_relu) - elif kwargs['critic_arch'] == 'CNN': - # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: - #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) - kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['observation_resize_dim'], normalize_rgb_values=True) - kwargs['preprocessed_observation_shape'] = [input_dim[-1], kwargs['observation_resize_dim'], kwargs['observation_resize_dim']] - if 'nbr_frame_stacking' in kwargs: - kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] - input_shape = kwargs['preprocessed_observation_shape'] - channels = [input_shape[0]] + kwargs['critic_arch_channels'] - kernels = kwargs['critic_arch_kernels'] - strides = kwargs['critic_arch_strides'] - paddings = kwargs['critic_arch_paddings'] - output_dim = kwargs['critic_arch_feature_dim'] - critic_body = ConvolutionalBody(input_shape=input_shape, - feature_dim=output_dim, - channels=channels, - kernel_sizes=kernels, - strides=strides, - paddings=paddings) - elif kwargs['critic_arch'] == 'MLP-LSTM-RNN': - # Assuming flatten input: - #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) - state_dim = input_dim - critic_arch_feature_dim = kwargs['critic_arch_feature_dim'] - critic_arch_hidden_units = kwargs['critic_arch_hidden_units'] - - # Selecting Extra Inputs Infos relevant to phi_body: - extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) - extra_inputs_infos_critic_body = {} - if extra_inputs_infos != {}: - for key in extra_inputs_infos: - shape = extra_inputs_infos[key]['shape'] - tl = extra_inputs_infos[key]['target_location'] - if 'critic_body' in tl: - extra_inputs_infos_critic_body[key] = { - 'shape':shape, - 'target_location':tl - } - - gate = None - if 'use_relu_after_rnn' in kwargs \ - and kwargs['use_relu_after_rnn']: - import ipdb; ipdb.set_trace() - gate = F.relu - - critic_body = LinearLstmBody( - state_dim=state_dim, - feature_dim=critic_arch_feature_dim, - hidden_units=critic_arch_hidden_units, - non_linearities=[nn.ReLU], - gate=gate, - dropout=0.0, - add_non_lin_final_layer=True, - layer_init_fn=None, - extra_inputs_infos=extra_inputs_infos_critic_body, - ) - - elif kwargs['critic_arch'] == 'MLP-MLP-RNN': - # Assuming flatten input: - #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) - state_dim = input_dim - critic_arch_feature_dim = kwargs['critic_arch_feature_dim'] - critic_arch_hidden_units = kwargs['critic_arch_hidden_units'] - - # Selecting Extra Inputs Infos relevant to phi_body: - extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) - extra_inputs_infos_critic_body = {} - if extra_inputs_infos != {}: - for key in extra_inputs_infos: - shape = extra_inputs_infos[key]['shape'] - tl = extra_inputs_infos[key]['target_location'] - if 'critic_body' in tl: - extra_inputs_infos_critic_body[key] = { - 'shape':shape, - 'target_location':tl - } - - critic_body = LinearLinearBody( - state_dim=state_dim, - feature_dim=critic_arch_feature_dim, - hidden_units=critic_arch_hidden_units, - non_linearities=[nn.ReLU], - gate=F.relu, - 
dropout=0.0, - add_non_lin_final_layer=True, - layer_init_fn=None, - extra_inputs_infos=extra_inputs_infos_critic_body, - ) - elif kwargs['critic_arch'] == 'MLP-LSTM-RNN2': - # Assuming flatten input: - #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) - state_dim = input_dim - critic_arch_feature_dim = kwargs['critic_arch_feature_dim'] - critic_arch_linear_hidden_units = kwargs['critic_arch_linear_hidden_units'] - critic_arch_linear_post_hidden_units = None - if 'critic_arch_linear_post_hidden_units' in kwargs: - critic_arch_linear_post_hidden_units = kwargs['critic_arch_linear_post_hidden_units'] - critic_arch_hidden_units = kwargs['critic_arch_hidden_units'] - - # Selecting Extra Inputs Infos relevant to phi_body: - extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) - extra_inputs_infos_critic_body = {} - if extra_inputs_infos != {}: - for key in extra_inputs_infos: - shape = extra_inputs_infos[key]['shape'] - tl = extra_inputs_infos[key]['target_location'] - if 'critic_body' in tl: - extra_inputs_infos_critic_body[key] = { - 'shape':shape, - 'target_location':tl - } - - gate = None - if 'use_relu_after_rnn' in kwargs \ - and kwargs['use_relu_after_rnn']: - import ipdb; ipdb.set_trace() - gate = F.relu - - critic_body = LinearLstmBody2( - state_dim=state_dim, - feature_dim=critic_arch_feature_dim, - linear_hidden_units=critic_arch_linear_hidden_units, - linear_post_hidden_units=critic_arch_linear_post_hidden_units, - hidden_units=critic_arch_hidden_units, - non_linearities=[nn.ReLU], - gate=gate, - dropout=0.0, - add_non_lin_final_layer=True, - layer_init_fn=None, - extra_inputs_infos=extra_inputs_infos_critic_body, - ) - - - - - # TODO: remove this! We needed to relax this condition for MineRL - # assert(task.action_type == 'Discrete') - obs_shape = list(task.observation_shape) - if 'preprocessed_observation_shape' in kwargs: obs_shape = kwargs['preprocessed_observation_shape'] - goal_shape = list(task.goal_shape) - if 'preprocessed_goal_shape' in kwargs: goal_shape = kwargs['preprocessed_goal_shape'] - if 'goal_state_flattening' in kwargs and kwargs['goal_state_flattening']: - obs_shape[-1] = obs_shape[-1] + goal_shape[-1] - - # Selecting Extra Inputs Infos relevant to final_critic_layer: - extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) - extra_inputs_infos_final_critic_layer = {} - if extra_inputs_infos != {}: - for key in extra_inputs_infos: - shape = extra_inputs_infos[key]['shape'] - tl = extra_inputs_infos[key]['target_location'] - if 'final_critic_layer' in tl: - extra_inputs_infos_final_critic_layer[key] = { - 'shape':shape, - 'target_location':tl - } - - model = CategoricalQNet( - state_dim=obs_shape, - action_dim=task.action_dim, - phi_body=phi_body, - critic_body=critic_body, - dueling=kwargs['dueling'], - noisy=kwargs['noisy'], - goal_oriented=kwargs['goal_oriented'] if 'goal_oriented' in kwargs else False, - goal_shape=goal_shape, - goal_phi_body=goal_phi_body, - extra_inputs_infos=extra_inputs_infos_final_critic_layer - ) - - model.share_memory() - return model - - def build_DQN_Agent(task, config, agent_name): ''' :param task: Environment specific configuration @@ -980,7 +649,7 @@ def build_DQN_Agent(task, config, agent_name): loss_fn = ddqn_loss.compute_loss dqn_algorithm = DQNAlgorithm(kwargs, model, loss_fn=loss_fn) - + if 'use_HER' in kwargs and kwargs['use_HER']: from ..algorithms.wrappers import latent_based_goal_predicated_reward_fn goal_predicated_reward_fn = None @@ -993,7 +662,7 @@ def 
build_DQN_Agent(task, config, agent_name): agent = DQNAgent(name=agent_name, algorithm=dqn_algorithm) - if isinstance(getattr(task.env, 'observation_space', None), Dict) or ('use_HER' in kwargs and kwargs['use_HER']): + if isinstance(getattr(task.env, 'observation_space', None), gymDict) or ('use_HER' in kwargs and kwargs['use_HER']): agent = DictHandlingAgentWrapper(agent=agent, use_achieved_goal=kwargs['use_HER']) print(dqn_algorithm.get_models()) diff --git a/regym/rl_algorithms/agents/dqn_her_agent.py b/regym/rl_algorithms/agents/dqn_her_agent.py new file mode 100644 index 00000000..08521c5a --- /dev/null +++ b/regym/rl_algorithms/agents/dqn_her_agent.py @@ -0,0 +1,708 @@ +from typing import Dict, Any +import torch +import numpy as np +from copy import deepcopy +import random +from collections.abc import Iterable + +from ..algorithms.DQN import DQNAlgorithm, dqn_loss, ddqn_loss +from ..networks import PreprocessFunction, ResizeCNNPreprocessFunction, ResizeCNNInterpolationFunction +from regym.rl_algorithms.agents.utils import generate_model, parse_and_check + +import ray +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from functools import partial + +from .agent import Agent +#from .wrappers import DictHandlingAgentWrapper +from regym.rl_algorithms.agents.agent import ExtraInputsHandlingAgent +from gym.spaces import Dict as gymDict +#from ..algorithms.wrappers import HERAlgorithmWrapper +from regym.rl_algorithms.algorithms.wrappers import HERAlgorithmWrapper2 + +from regym.rl_algorithms.utils import _extract_from_rnn_states, copy_hdict +from regym.rl_algorithms.utils import apply_on_hdict, _concatenate_list_hdict + + +#class DQNHERAgent(Agent): +class DQNHERAgent(ExtraInputsHandlingAgent): + def __init__(self, name, algorithm, extra_inputs_infos): + super(DQNHERAgent, self).__init__( + name=name, + algorithm=algorithm, + extra_inputs_infos=extra_inputs_infos + ) + + self.kwargs = algorithm.kwargs + self.epsend = float(self.kwargs['epsend']) + self.epsstart = float(self.kwargs['epsstart']) + self.epsdecay = float(self.kwargs['epsdecay']) + self.epsdecay_strategy = self.kwargs['epsdecay_strategy'] if 'epsdecay_strategy' in self.kwargs else 'exponential' + self.eps = None + + self.replay_period = int(self.kwargs['replay_period']) if 'replay_period' in self.kwargs else 1 + self.replay_period_count = 0 + + self.nbr_episode_per_cycle = int(self.kwargs['nbr_episode_per_cycle']) if 'nbr_episode_per_cycle' in self.kwargs else None + self.nbr_episode_per_cycle_count = 0 + + self.nbr_training_iteration_per_cycle = int(self.kwargs['nbr_training_iteration_per_cycle']) if 'nbr_training_iteration_per_cycle' in self.kwargs else 1 + + self.noisy = self.kwargs['noisy'] if 'noisy' in self.kwargs else False + + # Number of interaction/step with/in the environment: + self.nbr_steps = 0 + + self.saving_interval = float(self.kwargs['saving_interval']) if 'saving_interval' in self.kwargs else 1e5 + + self.previous_save_quotient = -1 + + def get_update_count(self): + return self.algorithm.unwrapped.get_update_count() + + #def handle_experience(self, s, a, r, succ_s, done, goals=None, infos=None, prediction=None): + def _handle_experience(self, s, a, r, succ_s, done, goals=None, infos=None, prediction=None): + ''' + Note: the batch size may differ from the nbr_actor as soon as some + actors' episodes end before the others... + + :param s: numpy tensor of states of shape batch x state_shape. + :param a: numpy tensor of actions of shape batch x action_shape. 
+ :param r: numpy tensor of rewards of shape batch x reward_shape. + :param succ_s: numpy tensor of successive states of shape batch x state_shape. + :param done: list of boolean (batch=nbr_actor) x state_shape. + :param goals: Dictionnary of goals 'achieved_goal' and 'desired_goal' for each state 's' and 'succ_s'. + :param infos: Dictionnary of information from the environment. + :param prediction: Dictionnary of tensors containing the model's output at the current state. + ''' + if "sad" in self.kwargs \ + and self.kwargs["sad"]: + a = a["action"] + + if prediction is None: prediction = deepcopy(self.current_prediction) + + state, r, succ_state, non_terminal = self.preprocess_environment_signals(s, r, succ_s, done) + a = torch.from_numpy(a) + # batch x ... + + batch_size = a.shape[0] + + if "vdn" in self.kwargs \ + and self.kwargs["vdn"]: + # Add a player dimension to each element: + # Assume inputs have shape : [batch_size*nbr_players, ...], + # i.e. [batch_for_p0; batch_for_p1, ...] + nbr_players = self.kwargs["vdn_nbr_players"] + batch_size = state.shape[0] // nbr_players + + new_state = [] + for bidx in range(batch_size): + bidx_states = torch.stack( + [ + state[pidx*batch_size+bidx].unsqueeze(0) + for pidx in range(nbr_players) + ], + dim=1 + ) + new_state.append(bidx_states) + state = torch.cat(new_state, dim=0) + + new_a = [] + for bidx in range(batch_size): + bidx_as = torch.stack( + [ + a[pidx*batch_size+bidx].unsqueeze(0) + for pidx in range(nbr_players) + ], + dim=1 + ) + new_a.append(bidx_as) + a = torch.cat(new_a, dim=0) + + new_r = [] + for bidx in range(batch_size): + bidx_rs = torch.stack( + [ + r[pidx*batch_size+bidx].unsqueeze(0) + for pidx in range(nbr_players) + ], + dim=1 + ) + new_r.append(bidx_rs) + r = torch.cat(new_r, dim=0) + + ''' + non_terminal = torch.cat([non_terminal]*2, dim=0) + new_nt = [] + for bidx in range(batch_size): + bidx_nts = torch.stack([non_terminal[pidx*batch_size+bidx].unsqueeze(0) for pidx in range(nbr_players)], dim=1) + new_nt.append(bidx_nts) + non_terminal = torch.cat(new_nt, dim=0) + ''' + + new_succ_state = [] + for bidx in range(batch_size): + bidx_succ_states = torch.stack( + [ + succ_state[pidx*batch_size+bidx].unsqueeze(0) + for pidx in range(nbr_players) + ], + dim=1 + ) + new_succ_state.append(bidx_succ_states) + succ_state = torch.cat(new_succ_state, dim=0) + + # BEWARE: reshaping might not give the expected ordering due to the dimensions' ordering... + #hdict_reshape_fn = lambda x: x.reshape(batch_size, nbr_players, *x.shape[1:]) + # The above fails to capture the correct ordering: + # [ batch0=[p0_exp1, p0_exp2 ; .. ]] instead of + # [ batch0=[p0_exp1, p1_exp1 ; .. ]], if only two players are considered... + def reshape_fn(x): + new_x = [] + for bidx in range(batch_size): + bidx_x = torch.stack( + [ + x[pidx*batch_size+bidx].unsqueeze(0) + for pidx in range(nbr_players) + ], + dim=1 + ) + new_x.append(bidx_x) + return torch.cat(new_x, dim=0) + + for k, t in prediction.items(): + if isinstance(t, torch.Tensor): + #prediction[k] = t.reshape(batch_size, nbr_players, *t.shape[1:]) + prediction[k] = reshape_fn(prediction[k]) + elif isinstance(t, dict): + prediction[k] = apply_on_hdict( + hdict=t, + fn=reshape_fn, #hdict_reshape_fn, + ) + else: + raise NotImplementedError + + """ + # not used... 
+ # Infos: list of batch_size * nbr_players dictionnaries: + new_infos = [] + for bidx in range(batch_size): + bidx_infos = [infos[pidx*batch_size+bidx] for pidx in range(nbr_players)] + bidx_info = _concatenate_list_hdict( + lhds=bidx_infos, + concat_fn=partial(np.stack, axis=1), #new player dimension + preprocess_fn=(lambda x: x), + ) + new_infos.append(bidx_info) + infos = new_infos + + # Goals: + if self.goal_oriented: + raise NotImplementedError + """ + + # We assume that this function has been called directly after take_action: + # therefore the current prediction correspond to this experience. + + batch_index = -1 + done_actors_among_notdone = [] + #for actor_index in range(self.nbr_actor): + for actor_index in range(batch_size): + # If this actor is already done with its episode: + if self.previously_done_actors[actor_index]: + continue + # Otherwise, there is bookkeeping to do: + batch_index +=1 + + # Bookkeeping of the actors whose episode just ended: + if done[actor_index] and not(self.previously_done_actors[actor_index]): + done_actors_among_notdone.append(batch_index) + + exp_dict = {} + exp_dict['s'] = state[batch_index,...].unsqueeze(0) + exp_dict['a'] = a[batch_index,...].unsqueeze(0) + exp_dict['r'] = r[batch_index,...].unsqueeze(0) + exp_dict['succ_s'] = succ_state[batch_index,...].unsqueeze(0) + # Watch out for the miss-match: + # done is a list of nbr_actor booleans, + # which is not sync with batch_index, purposefully... + exp_dict['non_terminal'] = non_terminal[actor_index,...].unsqueeze(0) + # Watch out for the miss-match: + # Similarly, infos is not sync with batch_index, purposefully... + if infos is not None: + exp_dict['info'] = infos[actor_index] + + ######################################################################### + ######################################################################### + # Exctracts tensors at root level: + exp_dict.update(Agent._extract_from_prediction(prediction, batch_index)) + ######################################################################### + ######################################################################### + + + # Extracts remaining info: + if self.recurrent: + exp_dict['rnn_states'] = _extract_from_rnn_states( + prediction['rnn_states'], + batch_index, + post_process_fn=(lambda x: x.detach().cpu()) + ) + exp_dict['next_rnn_states'] = _extract_from_rnn_states( + prediction['next_rnn_states'], + batch_index, + post_process_fn=(lambda x: x.detach().cpu()) + ) + + """ + # depr : goal update + if self.goal_oriented: + exp_dict['goals'] = Agent._extract_from_hdict( + goals, + batch_index, + goal_preprocessing_fn=self.goal_preprocessing + ) + """ + + self.algorithm.store(exp_dict, actor_index=actor_index) + self.previously_done_actors[actor_index] = done[actor_index] + self.handled_experiences +=1 + + self.replay_period_count += 1 + if self.nbr_episode_per_cycle is not None: + if len(done_actors_among_notdone): + self.nbr_episode_per_cycle_count += len(done_actors_among_notdone) + + if not(self.async_actor): + self.train() + + def train(self): + nbr_updates = 0 + + period_check = self.replay_period + period_count_check = self.replay_period_count + if self.nbr_episode_per_cycle is not None: + period_check = self.nbr_episode_per_cycle + period_count_check = self.nbr_episode_per_cycle_count + + if self.training \ + and self.handled_experiences > self.kwargs['min_capacity'] \ + and self.algorithm.unwrapped.stored_experiences() > self.kwargs['min_capacity'] \ + and (period_count_check % period_check == 0 or 
not(self.async_actor)): + minibatch_size = self.kwargs['batch_size'] + if self.nbr_episode_per_cycle is None: + minibatch_size *= self.replay_period + else: + self.nbr_episode_per_cycle_count = 1 + + for train_it in range(self.nbr_training_iteration_per_cycle): + self.algorithm.train(minibatch_size=minibatch_size) + + nbr_updates = self.nbr_training_iteration_per_cycle + + if self.algorithm.unwrapped.summary_writer is not None: + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + actor_learner_shared_dict = ray.get(self.actor_learner_shared_dict.get.remote()) + else: + actor_learner_shared_dict = self.actor_learner_shared_dict.get() + nbr_update_remaining = sum(actor_learner_shared_dict["models_update_required"]) + self.algorithm.unwrapped.summary_writer.add_scalar( + f'PerUpdate/ActorLearnerSynchroRemainingUpdates', + nbr_update_remaining, + self.algorithm.unwrapped.get_update_count() + ) + + # Update actor's models: + if self.async_learner\ + and (self.handled_experiences // self.actor_models_update_steps_interval) != self.previous_actor_models_update_quotient: + self.previous_actor_models_update_quotient = self.handled_experiences // self.actor_models_update_steps_interval + new_models_cpu = {k:deepcopy(m).cpu() for k,m in self.algorithm.unwrapped.get_models().items()} + + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + actor_learner_shared_dict = ray.get(self.actor_learner_shared_dict.get.remote()) + else: + actor_learner_shared_dict = self.actor_learner_shared_dict.get() + + actor_learner_shared_dict["models"] = new_models_cpu + actor_learner_shared_dict["models_update_required"] = [True]*len(actor_learner_shared_dict["models_update_required"]) + + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + self.actor_learner_shared_dict.set.remote(actor_learner_shared_dict) + else: + self.actor_learner_shared_dict.set(actor_learner_shared_dict) + + if self.async_learner\ + and self.save_path is not None \ + and (self.algorithm.unwrapped.get_update_count() // self.saving_interval) != self.previous_save_quotient: + self.previous_save_quotient = self.algorithm.unwrapped.get_update_count() // self.saving_interval + self.save() + + return nbr_updates + + #def take_action(self, state, infos=None, as_logit=False): + def _take_action(self, state, infos=None, as_logit=False): + if self.async_actor: + # Update the algorithm's model if needs be: + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + actor_learner_shared_dict = ray.get(self.actor_learner_shared_dict.get.remote()) + else: + actor_learner_shared_dict = self.actor_learner_shared_dict.get() + if actor_learner_shared_dict["models_update_required"][self.async_actor_idx]: + actor_learner_shared_dict["models_update_required"][self.async_actor_idx] = False + + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + self.actor_learner_shared_dict.set.remote(actor_learner_shared_dict) + else: + self.actor_learner_shared_dict.set(actor_learner_shared_dict) + + if "models" in actor_learner_shared_dict.keys(): + new_models = actor_learner_shared_dict["models"] + self.algorithm.unwrapped.set_models(new_models) + else: + raise NotImplementedError + + if self.training: + self.nbr_steps += state.shape[0] + self.eps = self.algorithm.unwrapped.get_epsilon(nbr_steps=self.nbr_steps, strategy=self.epsdecay_strategy) + if "vdn" in self.kwargs \ + and self.kwargs["vdn"]: + # The following will not make same values contiguous: + #self.eps = 
np.concatenate([self.eps]*self.kwargs["vdn_nbr_players"], axis=0) + # whereas the following will, and thus players in the same environment will explore similarly: + self.eps = np.stack([self.eps]*self.kwargs["vdn_nbr_players"], axis=-1).reshape(-1) + + + state = self.state_preprocessing(state, use_cuda=self.algorithm.unwrapped.kwargs['use_cuda']) + + """ + # depr : goal update + goal = None + if self.goal_oriented: + goal = self.goal_preprocessing(self.goals, use_cuda=self.algorithm.unwrapped.kwargs['use_cuda']) + """ + + model = self.algorithm.unwrapped.get_models()['model'] + if 'use_target_to_gather_data' in self.kwargs and self.kwargs['use_target_to_gather_data']: + model = self.algorithm.unwrapped.get_models()['target_model'] + model = model.train(mode=self.training) + + + # depr : goal update + #self.current_prediction = self.query_model(model, state, goal) + self.current_prediction = self.query_model(model, state) + + if as_logit: + return self.current_prediction['log_a'] + + # Post-process and update the rnn_states from the current prediction: + # self.rnn_states <-- self.current_prediction['next_rnn_states'] + # WARNING: _post_process affects self.rnn_states. It is imperative to + # manipulate a copy of it outside of the agent's manipulation, e.g. + # when feeding it to the models. + self.current_prediction = self._post_process(self.current_prediction) + + greedy_action = self.current_prediction['a'].reshape((-1,1)).numpy() + + if self.noisy or not(self.training): + return greedy_action + + legal_actions = torch.ones_like(self.current_prediction['qa']) + if infos is not None\ + and 'head' in infos\ + and 'extra_inputs' in infos['head']\ + and 'legal_actions' in infos['head']['extra_inputs']: + legal_actions = infos['head']['extra_inputs']['legal_actions'][0] + # in case there are no legal actions for this agent in this current turn: + for actor_idx in range(legal_actions.shape[0]): + if legal_actions[actor_idx].sum() == 0: + legal_actions[actor_idx, ...] = 1 + sample = np.random.random(size=self.eps.shape) + greedy = (sample > self.eps) + greedy = np.reshape(greedy[:state.shape[0]], (state.shape[0],1)) + + #random_actions = [random.randrange(model.action_dim) for _ in range(state.shape[0])] + random_actions = [ + legal_actions[actor_idx].multinomial(num_samples=1).item() + for actor_idx in range(legal_actions.shape[0]) + ] + random_actions = np.reshape(np.array(random_actions), (state.shape[0],1)) + + actions = greedy*greedy_action + (1-greedy)*random_actions + + if "sad" in self.kwargs \ + and self.kwargs["sad"]: + action_dict = { + 'action': actions, + 'greedy_action': greedy_action, + } + return action_dict + + return actions + + #def query_action(self, state, infos=None, as_logit=False): + def _query_action(self, state, infos=None, as_logit=False): + """ + Query's the model in training mode... 
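+        Contrary to _take_action, the model is forced into train mode and
+        epsilon-greedy exploration is applied regardless of self.training.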
+ """ + if self.async_actor: + # Update the algorithm's model if needs be: + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + actor_learner_shared_dict = ray.get(self.actor_learner_shared_dict.get.remote()) + else: + actor_learner_shared_dict = self.actor_learner_shared_dict.get() + if actor_learner_shared_dict["models_update_required"][self.async_actor_idx]: + actor_learner_shared_dict["models_update_required"][self.async_actor_idx] = False + + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + self.actor_learner_shared_dict.set.remote(actor_learner_shared_dict) + else: + self.actor_learner_shared_dict.set(actor_learner_shared_dict) + + if "models" in actor_learner_shared_dict.keys(): + new_models = actor_learner_shared_dict["models"] + self.algorithm.unwrapped.set_models(new_models) + else: + raise NotImplementedError + + self.eps = self.algorithm.unwrapped.get_epsilon(nbr_steps=self.nbr_steps, strategy=self.epsdecay_strategy) + if "vdn" in self.kwargs \ + and self.kwargs["vdn"]: + # The following will not make same values contiguous: + #self.eps = np.concatenate([self.eps]*self.kwargs["vdn_nbr_players"], axis=0) + # whereas the following will, and thus players in the same environment will explore similarly: + self.eps = np.stack([self.eps]*self.kwargs["vdn_nbr_players"], axis=-1).reshape(-1) + + + state = self.state_preprocessing(state, use_cuda=self.algorithm.unwrapped.kwargs['use_cuda']) + + """ + # depr : goal update + goal = None + if self.goal_oriented: + goal = self.goal_preprocessing(self.goals, use_cuda=self.algorithm.unwrapped.kwargs['use_cuda']) + """ + + model = self.algorithm.unwrapped.get_models()['model'] + if 'use_target_to_gather_data' in self.kwargs and self.kwargs['use_target_to_gather_data']: + model = self.algorithm.unwrapped.get_models()['target_model'] + if not(model.training): model = model.train(mode=True) + + # depr : goal update + #current_prediction = self.query_model(model, state, goal) + current_prediction = self.query_model(model, state) + + if as_logit: + return current_prediction['log_a'] + + # Post-process and update the rnn_states from the current prediction: + # self.rnn_states <-- self.current_prediction['next_rnn_states'] + # WARNING: _post_process affects self.rnn_states. It is imperative to + # manipulate a copy of it outside of the agent's manipulation, e.g. + # when feeding it to the models. + current_prediction = self._post_process(current_prediction) + + greedy_action = current_prediction['a'].reshape((-1,1)).numpy() + + if self.noisy: + return greedy_action + + legal_actions = torch.ones_like(current_prediction['qa']) + if infos is not None\ + and 'head' in infos\ + and 'extra_inputs' in infos['head']\ + and 'legal_actions' in infos['head']['extra_inputs']: + legal_actions = infos['head']['extra_inputs']['legal_actions'][0] + # in case there are no legal actions for this agent in this current turn: + for actor_idx in range(legal_actions.shape[0]): + if legal_actions[actor_idx].sum() == 0: + legal_actions[actor_idx, ...] 
= 1 + sample = np.random.random(size=self.eps.shape) + greedy = (sample > self.eps) + greedy = np.reshape(greedy[:state.shape[0]], (state.shape[0],1)) + + #random_actions = [random.randrange(model.action_dim) for _ in range(state.shape[0])] + random_actions = [ + legal_actions[actor_idx].multinomial(num_samples=1).item() + for actor_idx in range(legal_actions.shape[0]) + ] + random_actions = np.reshape(np.array(random_actions), (state.shape[0],1)) + + actions = greedy*greedy_action + (1-greedy)*random_actions + + if "sad" in self.kwargs \ + and self.kwargs["sad"]: + action_dict = { + 'action': actions, + 'greedy_action': greedy_action, + } + return action_dict + + return actions + + def query_model(self, model, state, goal=None): + if self.recurrent: + self._pre_process_rnn_states() + # WARNING: it is imperative to make a copy + # of the self.rnn_states, otherwise it will be + # referenced in the (self.)current_prediction + # and any subsequent update of rnn_states will + # also update the current_prediction, e.g. the call + # to _post_process in line 163 affects self.rnn_states + # and therefore might affect current_prediction's rnn_states... + rnn_states_input = copy_hdict(self.rnn_states) + current_prediction = model(state, rnn_states=rnn_states_input, goal=goal) + else: + current_prediction = model(state, goal=goal) + return current_prediction + + def clone(self, training=None, with_replay_buffer=False, clone_proxies=False, minimal=False): + cloned_algo = self.algorithm.clone( + with_replay_buffer=with_replay_buffer, + clone_proxies=clone_proxies, + minimal=minimal + ) + clone = DQNHERAgent( + name=self.name, + algorithm=cloned_algo, + extra_inputs_infos=self.extra_inputs_infos + ) + clone.save_path = self.save_path + + clone.actor_learner_shared_dict = self.actor_learner_shared_dict + clone._handled_experiences = self._handled_experiences + clone.episode_count = self.episode_count + if training is not None: clone.training = training + clone.nbr_steps = self.nbr_steps + + # Goes through all variables 'Proxy' (dealing with multiprocessing) + # contained in this class and removes them from clone + if not(clone_proxies): + proxy_key_values = [ + (key, value) + for key, value in clone.__dict__.items() + if ('Proxy' in str(type(value))) + ] + for key, value in proxy_key_values: + setattr(clone, key, None) + + return clone + + def get_async_actor(self, training=None, with_replay_buffer=False): + self.async_learner = True + self.async_actor = False + + cloned_algo = self.algorithm.async_actor() + clone = DQNHERAgent( + name=self.name, + algorithm=cloned_algo, + extra_inputs_infos=self.extra_inputs_infos + ) + + clone.async_learner = False + clone.async_actor = True + + ###################################### + ###################################### + # Update actor_learner_shared_dict: + ###################################### + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + actor_learner_shared_dict = ray.get(self.actor_learner_shared_dict.get.remote()) + else: + actor_learner_shared_dict = self.actor_learner_shared_dict.get() + # Increase the size of the list of toggle booleans: + actor_learner_shared_dict["models_update_required"] += [False] + + # Update the (Ray)SharedVariable + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + self.actor_learner_shared_dict.set.remote(actor_learner_shared_dict) + else: + self.actor_learner_shared_dict.set(actor_learner_shared_dict) + + ###################################### + # Update the async_actor index: + 
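+        # The toggle boolean appended above is the last entry of the list,
+        # so its index uniquely identifies this new async actor: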
clone.async_actor_idx = len(actor_learner_shared_dict["models_update_required"])-1 + + ###################################### + ###################################### + + clone.actor_learner_shared_dict = self.actor_learner_shared_dict + clone._handled_experiences = self._handled_experiences + clone.episode_count = self.episode_count + if training is not None: clone.training = training + clone.nbr_steps = self.nbr_steps + return clone + + +def build_DQN_HER_Agent(task, config, agent_name): + ''' + :param task: Environment specific configuration + :param config: Dict containing configuration for ppo agent + :param agent_name: name of the agent + :returns: DeepQNetworkAgent adapted to be trained on :param: task under :param: config + ''' + kwargs = config.copy() + kwargs['discount'] = float(kwargs['discount']) + kwargs['replay_capacity'] = int(float(kwargs['replay_capacity'])) + kwargs['min_capacity'] = int(float(kwargs['min_capacity'])) + + # Default preprocess function: + kwargs['state_preprocess'] = partial(PreprocessFunction, normalization=False) + kwargs['goal_preprocess'] = partial(PreprocessFunction, normalization=False) + + #if not isinstance(kwargs['observation_resize_dim'], int): kwargs['observation_resize_dim'] = task.observation_shape[0] if isinstance(task.observation_shape, tuple) else task.observation_shape + if 'observation_resize_dim' in kwargs\ + and not isinstance(kwargs['observation_resize_dim'], int): + kwargs['observation_resize_dim'] = task.observation_shape[0] if isinstance(task.observation_shape, tuple) else task.observation_shape + #if 'None' in kwargs['goal_resize_dim']: kwargs['goal_resize_dim'] = task.goal_shape[0] if isinstance(task.goal_shape, tuple) else task.goal_shape + + kwargs = parse_and_check(kwargs, task) + + model = generate_model(task, kwargs) + + loss_fn = dqn_loss.compute_loss + if kwargs['double'] or kwargs['dueling']: + loss_fn = ddqn_loss.compute_loss + + dqn_algorithm = DQNAlgorithm(kwargs, model, loss_fn=loss_fn) + + """ + if 'use_HER' in kwargs and kwargs['use_HER']: + from ..algorithms.wrappers import latent_based_goal_predicated_reward_fn + goal_predicated_reward_fn = None + if 'HER_use_latent' in kwargs and kwargs['HER_use_latent']: + goal_predicated_reward_fn = latent_based_goal_predicated_reward_fn + + dqn_algorithm = HERAlgorithmWrapper(algorithm=dqn_algorithm, + strategy=kwargs['HER_strategy'], + goal_predicated_reward_fn=goal_predicated_reward_fn) + """ + + if kwargs.get('use_HER', False): + from regym.rl_algorithms.algorithms.wrappers import latent_based_goal_predicated_reward_fn2 + goal_predicated_reward_fn = None + if kwargs.get('HER_use_latent', False): + goal_predicated_reward_fn = latent_based_goal_predicated_reward_fn2 + + dqn_algorithm = HERAlgorithmWrapper2( + algorithm=dqn_algorithm, + strategy=kwargs['HER_strategy'], + goal_predicated_reward_fn=goal_predicated_reward_fn, + extra_inputs_infos=kwargs['extra_inputs_infos'], + ) + + agent = DQNHERAgent( + name=agent_name, + algorithm=dqn_algorithm, + extra_inputs_infos=kwargs['extra_inputs_infos'], + ) + + """ + if isinstance(getattr(task.env, 'observation_space', None), gymDict) or ('use_HER' in kwargs and kwargs['use_HER']): + agent = DictHandlingAgentWrapper(agent=agent, use_achieved_goal=kwargs['use_HER']) + """ + + print(dqn_algorithm.get_models()) + + return agent diff --git a/regym/rl_algorithms/agents/ppo_agent.py b/regym/rl_algorithms/agents/ppo_agent.py index b85b25f9..c77b66bb 100755 --- a/regym/rl_algorithms/agents/ppo_agent.py +++ 
b/regym/rl_algorithms/agents/ppo_agent.py @@ -1,24 +1,51 @@ +from typing import Dict, Any + +import ray import torch +import torch.nn.functional as F import numpy as np +from functools import partial import copy -from .agent import Agent -from ..networks import CategoricalActorCriticNet, CategoricalActorCriticVAENet, GaussianActorCriticNet -from ..networks import FCBody, LSTMBody, GRUBody, ConvolutionalBody, BetaVAEBody, resnet18Input64, ConvolutionalGruBody -from ..networks import PreprocessFunction, ResizeCNNPreprocessFunction, ResizeCNNInterpolationFunction from ..algorithms.PPO import PPOAlgorithm +from ..networks import PreprocessFunction, ResizeCNNPreprocessFunction, ResizeCNNInterpolationFunction -import torch.nn.functional as F -import numpy as np -from functools import partial -from regym.rl_algorithms.utils import _extract_from_rnn_states +from regym.rl_algorithms.agents.agent import Agent, ExtraInputsHandlingAgent +from .wrappers import DictHandlingAgentWrapper +from gym.spaces import Dict as gymDict + +from regym.rl_algorithms.utils import _extract_from_rnn_states, copy_hdict +from regym.rl_algorithms.utils import apply_on_hdict, _concatenate_list_hdict +from regym.rl_algorithms.agents.utils import generate_model, parse_and_check + + +class PPOAgent(ExtraInputsHandlingAgent, Agent): + def __init__(self, name, algorithm, extra_inputs_infos): + ExtraInputsHandlingAgent.__init__( + self, + name=name, + algorithm=algorithm, + extra_inputs_infos=extra_inputs_infos + ) + + Agent.__init__( + self, + name=name, + algorithm=algorithm + ) + + self.kwargs = algorithm.kwargs -class PPOAgent(Agent): - def __init__(self, name, algorithm): - super(PPOAgent, self).__init__(name=name, algorithm=algorithm) self.use_rnd = self.algorithm.use_rnd + # Number of interaction/step with/in the environment: + self.nbr_steps = 0 + + self.saving_interval = float(self.kwargs['saving_interval']) if 'saving_interval' in self.kwargs else 1e5 + + self.previous_save_quotient = -1 + def get_experience_count(self): return self.handled_experiences @@ -32,7 +59,7 @@ def get_intrinsic_reward(self, actor_idx): else: return 0.0 - def handle_experience(self, s, a, r, succ_s, done): + def _handle_experience(self, s, a, r, succ_s, done, goals=None, infos=None, prediction=None): ''' Note: the batch size may differ from the nbr_actor as soon as some actors' episodes end before the others... @@ -42,11 +69,136 @@ def handle_experience(self, s, a, r, succ_s, done): :param r: numpy tensor of rewards of shape batch x reward_shape. :param succ_s: numpy tensor of successive states of shape batch x state_shape. :param done: list of boolean (batch=nbr_actor) x state_shape. + :param infos: Dictionnary of information from the environment. + :param prediction: Dictionnary of tensors containing the model's output at the current state. ''' + + if "sad" in self.kwargs \ + and self.kwargs["sad"]: + a = a["action"] + + if prediction is None: prediction = copy.deepcopy(self.current_prediction) + state, r, succ_state, non_terminal = self.preprocess_environment_signals(s, r, succ_s, done) a = torch.from_numpy(a) # batch x ... + batch_size = a.shape[0] + + if "vdn" in self.kwargs \ + and self.kwargs["vdn"]: + # Add a player dimension to each element: + # Assume inputs have shape : [batch_size*nbr_players, ...], + # i.e. [batch_for_p0; batch_for_p1, ...] 
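+            # For illustration, with nbr_players=2 and 3 actors, rows [0,1,2] hold
+            # player 0's experiences and rows [3,4,5] player 1's; the stacking below
+            # regroups them so that element b of the new batch is [p0_exp_b, p1_exp_b].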
+ nbr_players = self.kwargs["vdn_nbr_players"] + batch_size = state.shape[0] // nbr_players + + new_state = [] + for bidx in range(batch_size): + bidx_states = torch.stack( + [ + state[pidx*batch_size+bidx].unsqueeze(0) + for pidx in range(nbr_players) + ], + dim=1 + ) + new_state.append(bidx_states) + state = torch.cat(new_state, dim=0) + + new_a = [] + for bidx in range(batch_size): + bidx_as = torch.stack( + [ + a[pidx*batch_size+bidx].unsqueeze(0) + for pidx in range(nbr_players) + ], + dim=1 + ) + new_a.append(bidx_as) + a = torch.cat(new_a, dim=0) + + new_r = [] + for bidx in range(batch_size): + bidx_rs = torch.stack( + [ + r[pidx*batch_size+bidx].unsqueeze(0) + for pidx in range(nbr_players) + ], + dim=1 + ) + new_r.append(bidx_rs) + r = torch.cat(new_r, dim=0) + + ''' + non_terminal = torch.cat([non_terminal]*2, dim=0) + new_nt = [] + for bidx in range(batch_size): + bidx_nts = torch.stack([non_terminal[pidx*batch_size+bidx].unsqueeze(0) for pidx in range(nbr_players)], dim=1) + new_nt.append(bidx_nts) + non_terminal = torch.cat(new_nt, dim=0) + ''' + + new_succ_state = [] + for bidx in range(batch_size): + bidx_succ_states = torch.stack( + [ + succ_state[pidx*batch_size+bidx].unsqueeze(0) + for pidx in range(nbr_players) + ], + dim=1 + ) + new_succ_state.append(bidx_succ_states) + succ_state = torch.cat(new_succ_state, dim=0) + + # BEWARE: reshaping might not give the expected ordering due to the dimensions' ordering... + #hdict_reshape_fn = lambda x: x.reshape(batch_size, nbr_players, *x.shape[1:]) + # The above fails to capture the correct ordering: + # [ batch0=[p0_exp1, p0_exp2 ; .. ]] instead of + # [ batch0=[p0_exp1, p1_exp1 ; .. ]], if only two players are considered... + def reshape_fn(x): + new_x = [] + for bidx in range(batch_size): + bidx_x = torch.stack( + [ + x[pidx*batch_size+bidx].unsqueeze(0) + for pidx in range(nbr_players) + ], + dim=1 + ) + new_x.append(bidx_x) + return torch.cat(new_x, dim=0) + + for k, t in prediction.items(): + if isinstance(t, torch.Tensor): + #prediction[k] = t.reshape(batch_size, nbr_players, *t.shape[1:]) + prediction[k] = reshape_fn(prediction[k]) + elif isinstance(t, dict): + prediction[k] = apply_on_hdict( + hdict=t, + fn=reshape_fn, #hdict_reshape_fn, + ) + else: + raise NotImplementedError + + """ + # not used... + # Infos: list of batch_size * nbr_players dictionnaries: + new_infos = [] + for bidx in range(batch_size): + bidx_infos = [infos[pidx*batch_size+bidx] for pidx in range(nbr_players)] + bidx_info = _concatenate_list_hdict( + lhds=bidx_infos, + concat_fn=partial(np.stack, axis=1), #new player dimension + preprocess_fn=(lambda x: x), + ) + new_infos.append(bidx_info) + infos = new_infos + + # Goals: + if self.goal_oriented: + raise NotImplementedError + """ + # We assume that this function has been called directly after take_action: # therefore the current prediction correspond to this experience. @@ -71,18 +223,36 @@ def handle_experience(self, s, a, r, succ_s, done): # Watch out for the miss-match: done is a list of nbr_actor booleans, # which is not sync with batch_index, purposefully... exp_dict['non_terminal'] = non_terminal[actor_index,...].unsqueeze(0) + # Watch out for the miss-match: + # Similarly, infos is not sync with batch_index, purposefully... 
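+            # done and infos are indexed over all actors (actor_index), whereas the
+            # tensors above are indexed over the actors still alive (batch_index):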
+ if infos is not None: + exp_dict['info'] = infos[actor_index] - exp_dict.update(Agent._extract_from_prediction(self.current_prediction, batch_index)) + ######################################################################### + ######################################################################### + # Exctracts tensors at root level: + exp_dict.update(Agent._extract_from_prediction(prediction, batch_index)) + ######################################################################### + ######################################################################### if self.use_rnd: int_reward, target_int_f = self.algorithm.compute_intrinsic_reward(exp_dict['succ_s']) rnd_dict = {'int_r':int_reward, 'target_int_f':target_int_f} exp_dict.update(rnd_dict) + # Extracts remaining info: if self.recurrent: - exp_dict['rnn_states'] = _extract_from_rnn_states(self.current_prediction['rnn_states'],batch_index) - exp_dict['next_rnn_states'] = _extract_from_rnn_states(self.current_prediction['next_rnn_states'],batch_index) - + exp_dict['rnn_states'] = _extract_from_rnn_states( + prediction['rnn_states'], + batch_index, + post_process_fn=(lambda x: x.detach().cpu()) + ) + exp_dict['next_rnn_states'] = _extract_from_rnn_states( + prediction['next_rnn_states'], + batch_index, + post_process_fn=(lambda x: x.detach().cpu()) + ) + self.algorithm.storages[actor_index].add(exp_dict) self.previously_done_actors[actor_index] = done[actor_index] self.handled_experiences +=1 @@ -90,35 +260,212 @@ def handle_experience(self, s, a, r, succ_s, done): if len(done_actors_among_notdone): # Regularization of the agents' actors: done_actors_among_notdone.sort(reverse=True) - for batch_idx in done_actors_among_notdone: - self.update_actors(batch_idx=batch_idx) + #for batch_idx in done_actors_among_notdone: + # self.update_actors(batch_idx=batch_idx) - if self.training and self.handled_experiences % self.algorithm.kwargs['horizon']*self.nbr_actor == 0: + if not(self.async_actor): + self.train() + + #if self.training \ + #and self.handled_experiences % self.algorithm.kwargs['horizon']*self.nbr_actor == 0: + #self.algorithm.train() + #if self.save_path is not None: torch.save(self, self.save_path) + + def train(self): + nbr_updates = 0 + + if self.training \ + and self.algorithm.stored_experiences() > self.algorithm.kwargs['horizon']*self.nbr_actor: self.algorithm.train() - if self.save_path is not None: torch.save(self, self.save_path) + + if self.algorithm.summary_writer is not None: + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + actor_learner_shared_dict = ray.get(self.actor_learner_shared_dict.get.remote()) + else: + actor_learner_shared_dict = self.actor_learner_shared_dict.get() + nbr_update_remaining = sum(actor_learner_shared_dict["models_update_required"]) + self.algorithm.summary_writer.add_scalar( + f'PerUpdate/ActorLearnerSynchroRemainingUpdates', + nbr_update_remaining, + self.algorithm.get_update_count() + ) + + # Update actor's models: + if self.async_learner\ + and (self.handled_experiences // self.actor_models_update_steps_interval) != self.previous_actor_models_update_quotient: + self.previous_actor_models_update_quotient = self.handled_experiences // self.actor_models_update_steps_interval + new_models_cpu = {k:copy.deepcopy(m).cpu() for k,m in self.algorithm.get_models().items()} + + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + actor_learner_shared_dict = ray.get(self.actor_learner_shared_dict.get.remote()) + else: + actor_learner_shared_dict = 
self.actor_learner_shared_dict.get() + + actor_learner_shared_dict["models"] = new_models_cpu + actor_learner_shared_dict["models_update_required"] = [True]*len(actor_learner_shared_dict["models_update_required"]) + + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + self.actor_learner_shared_dict.set.remote(actor_learner_shared_dict) + else: + self.actor_learner_shared_dict.set(actor_learner_shared_dict) + + if self.async_learner\ + and self.save_path is not None \ + and (self.algorithm.get_update_count() // self.saving_interval) != self.previous_save_quotient: + self.previous_save_quotient = self.algorithm.get_update_count() // self.saving_interval + self.save() + + return + + def _take_action(self, state, infos=None): + if self.async_actor: + # Update the algorithm's model if needs be: + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + actor_learner_shared_dict = ray.get(self.actor_learner_shared_dict.get.remote()) + else: + actor_learner_shared_dict = self.actor_learner_shared_dict.get() + if actor_learner_shared_dict["models_update_required"][self.async_actor_idx]: + actor_learner_shared_dict["models_update_required"][self.async_actor_idx] = False + + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + self.actor_learner_shared_dict.set.remote(actor_learner_shared_dict) + else: + self.actor_learner_shared_dict.set(actor_learner_shared_dict) + + if "models" in actor_learner_shared_dict.keys(): + new_models = actor_learner_shared_dict["models"] + self.algorithm.set_models(new_models) + else: + raise NotImplementedError - def take_action(self, state): state = self.state_preprocessing(state, use_cuda=self.algorithm.kwargs['use_cuda']) + model = self.algorithm.get_models()['model'] + + if self.training: + self.nbr_steps += state.shape[0] + + self.current_prediction = self.query_model(model, state, goal=None) + # Post-process and update the rnn_states from the current prediction: + # self.rnn_states <-- self.current_prediction['next_rnn_states'] + # WARNING: _post_process affects self.rnn_states. It is imperative to + # manipulate a copy of it outside of the agent's manipulation, e.g. + # when feeding it to the models. + self.current_prediction = self._post_process(self.current_prediction) + + #action = self.current_prediction['a'].numpy() + actions = self.current_prediction['a'].reshape((-1,1)).numpy() + greedy_action = self.current_prediction['greedy_action'].reshape((-1,1)).numpy() + + if not(self.training): + return greedy_action + if "sad" in self.kwargs \ + and self.kwargs["sad"]: + action_dict = { + 'action': actions, + 'greedy_action': greedy_action, + } + return action_dict + + return actions + + def query_model(self, model, state, goal): + if goal is not None: + raise NotImplementedError if self.recurrent: self._pre_process_rnn_states() - self.current_prediction = self.algorithm.model(state, rnn_states=self.rnn_states) + # WARNING: it is imperative to make a copy + # of the self.rnn_states, otherwise it will be + # referenced in the (self.)current_prediction + # and any subsequent update of rnn_states will + # also update the current_prediction, e.g. the call + # to _post_process in line 163 affects self.rnn_states + # and therefore might affect current_prediction's rnn_states... 
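+            # Hence the model is fed a copy here, so that self.rnn_states stays
+            # decoupled from the prediction returned below: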
+ rnn_states_input = copy_hdict(self.rnn_states) + current_prediction = model(state, rnn_states=rnn_states_input) else: - self.current_prediction = self.algorithm.model(state) - self.current_prediction = self._post_process(self.current_prediction) + current_prediction = model(state, goal=goal) + return current_prediction + + def clone(self, training=None, with_replay_buffer=False, clone_proxies=False, minimal=False): + cloned_algo = self.algorithm.clone( + with_replay_buffer=with_replay_buffer, + clone_proxies=clone_proxies, + minimal=minimal + ) + + clone = PPOAgent( + name=self.name, + algorithm=cloned_algo, + extra_inputs_infos=copy.deepcopy(self.extra_inputs_infos) + ) + clone.actor_learner_shared_dict = self.actor_learner_shared_dict + clone._handled_experiences = self._handled_experiences + clone.episode_count = self.episode_count + if training is not None: clone.training = training + clone.nbr_steps = self.nbr_steps + + # Goes through all variables 'Proxy' (dealing with multiprocessing) + # contained in this class and removes them from clone + if not(clone_proxies): + proxy_key_values = [ + (key, value) + for key, value in clone.__dict__.items() + if ('Proxy' in str(type(value))) + ] + for key, value in proxy_key_values: + setattr(clone, key, None) - return self.current_prediction['a'].numpy() + return clone + + def get_async_actor(self, training=None, with_replay_buffer=False): + self.async_learner = True + self.async_actor = False + + cloned_algo = self.algorithm.async_actor() + clone = PPOAgent( + name=self.name, + algorithm=cloned_algo, + extra_inputs_infos=copy.deepcopy(self.extra_inputs_infos) + ) + + clone.async_learner = False + clone.async_actor = True - def clone(self, training=None): - clone = PPOAgent(name=self.name, algorithm=copy.deepcopy(self.algorithm)) - clone.handled_experiences = self.handled_experiences + ###################################### + ###################################### + # Update actor_learner_shared_dict: + ###################################### + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + actor_learner_shared_dict = ray.get(self.actor_learner_shared_dict.get.remote()) + else: + actor_learner_shared_dict = self.actor_learner_shared_dict.get() + # Increase the size of the list of toggle booleans: + actor_learner_shared_dict["models_update_required"] += [False] + + # Update the (Ray)SharedVariable + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + self.actor_learner_shared_dict.set.remote(actor_learner_shared_dict) + else: + self.actor_learner_shared_dict.set(actor_learner_shared_dict) + + ###################################### + # Update the async_actor index: + clone.async_actor_idx = len(actor_learner_shared_dict["models_update_required"])-1 + + ###################################### + ###################################### + + clone.actor_learner_shared_dict = self.actor_learner_shared_dict + clone._handled_experiences = self._handled_experiences clone.episode_count = self.episode_count if training is not None: clone.training = training + clone.nbr_steps = self.nbr_steps return clone -def build_PPO_Agent(task, config, agent_name): +def build_PPO_Agent_depr(task, config, agent_name): ''' :param task: Environment specific configuration :param config: Dict containing configuration for ppo agent @@ -344,3 +691,91 @@ def build_PPO_Agent(task, config, agent_name): ppo_algorithm = PPOAlgorithm(kwargs, model, target_intr_model=target_intr_model, predict_intr_model=predict_intr_model) return 
PPOAgent(name=agent_name, algorithm=ppo_algorithm) + + +def build_PPO_Agent(task, config, agent_name): + ''' + :param task: Environment specific configuration + :param config: Dict containing configuration for ppo agent + :param agent_name: name of the agent + :returns: PPOAgent adapted to be trained on :param: task under :param: config + ''' + kwargs = config.copy() + kwargs['discount'] = float(kwargs['discount']) + + # Default preprocess function: + kwargs['state_preprocess'] = PreprocessFunction + + + if task.action_type == 'Discrete': + if task.observation_type == 'Discrete': + head_type = "CategoricalActorCriticNet" + elif task.observation_type == 'Continuous': + if 'use_vae' in kwargs and kwargs['use_vae']: + head_type = "CategoricalActorCriticVAENet" + raise NotImplementedError + else: + head_type = "CategoricalActorCriticNet" + + if task.action_type is 'Continuous' and task.observation_type is 'Continuous': + head_type = "GaussianActorCriticNet" + + kwargs = parse_and_check(kwargs, task) + model = generate_model(task, kwargs, head_type=head_type) + + use_rnd = False + if 'use_random_network_distillation' in kwargs and kwargs['use_random_network_distillation']: + use_rnd = True + + target_intr_model = None + predict_intr_model = None + if use_rnd: + if kwargs['rnd_arch'] == 'MLP': + target_intr_model = FCBody(task.observation_shape, hidden_units=kwargs['rnd_feature_net_fc_arch_hidden_units'], gate=F.leaky_relu) + predict_intr_model = FCBody(task.observation_shape, hidden_units=kwargs['rnd_feature_net_fc_arch_hidden_units'], gate=F.leaky_relu) + elif 'CNN' in kwargs['rnd_arch']: + input_shape = kwargs['preprocessed_observation_shape'] + channels = [input_shape[0]] + kwargs['rnd_arch_channels'] + kernels = kwargs['rnd_arch_kernels'] + strides = kwargs['rnd_arch_strides'] + paddings = kwargs['rnd_arch_paddings'] + output_dim = kwargs['rnd_arch_feature_dim'] + target_intr_model = ConvolutionalBody(input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings) + output_dim = (256,256,)+(output_dim,) + predict_intr_model = ConvolutionalBody(input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings) + target_intr_model.share_memory() + predict_intr_model.share_memory() + + + ppo_algorithm = PPOAlgorithm( + kwargs, + model, + name=f"{agent_name}_algo", + target_intr_model=target_intr_model, + predict_intr_model=predict_intr_model + ) + + agent = PPOAgent( + name=agent_name, + algorithm=ppo_algorithm, + extra_inputs_infos=kwargs['extra_inputs_infos'], + ) + + """ + if isinstance(getattr(task.env, 'observation_space', None), gymDict): + agent = DictHandlingAgentWrapper(agent=agent, use_achieved_goal=False) + """ + + print(agent) + + return agent \ No newline at end of file diff --git a/regym/rl_algorithms/agents/r2d2_agent.py b/regym/rl_algorithms/agents/r2d2_agent.py index bb01a7c7..0ba184f4 100644 --- a/regym/rl_algorithms/agents/r2d2_agent.py +++ b/regym/rl_algorithms/agents/r2d2_agent.py @@ -6,10 +6,13 @@ import ray from regym.rl_algorithms.agents.agent import ExtraInputsHandlingAgent -from regym.rl_algorithms.agents.dqn_agent import DQNAgent, generate_model +from regym.rl_algorithms.agents.dqn_agent import DQNAgent +from regym.rl_algorithms.agents.utils import generate_model, parse_and_check from regym.rl_algorithms.algorithms.R2D2 import R2D2Algorithm from regym.rl_algorithms.networks import PreprocessFunction, ResizeCNNPreprocessFunction, 
ResizeCNNInterpolationFunction +from regym.rl_algorithms.algorithms.wrappers import HERAlgorithmWrapper2 + class R2D2Agent(ExtraInputsHandlingAgent, DQNAgent): def __init__(self, name, algorithm, extra_inputs_infos): @@ -26,8 +29,11 @@ def __init__(self, name, algorithm, extra_inputs_infos): algorithm=algorithm ) - def _take_action(self, state, infos=None): - return DQNAgent.take_action(self, state=state, infos=infos) + def _take_action(self, state, infos=None, as_logit=False): + return DQNAgent.take_action(self, state=state, infos=infos, as_logit=as_logit) + + def _query_action(self, state, infos=None, as_logit=False): + return DQNAgent.query_action(self, state=state, infos=infos, as_logit=as_logit) def _handle_experience(self, s, a, r, succ_s, done, goals=None, infos=None): ''' @@ -102,6 +108,7 @@ def get_async_actor(self, training=None, with_replay_buffer=False): algorithm=cloned_algo, extra_inputs_infos=copy.deepcopy(self.extra_inputs_infos) ) + clone.save_path = self.save_path clone.async_learner = False clone.async_actor = True @@ -140,44 +147,6 @@ def get_async_actor(self, training=None, with_replay_buffer=False): return clone -def parse_and_check(kwargs: Dict, - task: 'regym.environments.Task'): - - # Extra Inputs: - kwargs['task'] = task - - extra_inputs = kwargs['extra_inputs_infos'] - for key in extra_inputs: - shape = extra_inputs[key]['shape'] - for idxdim, dimvalue in enumerate(shape): - if isinstance(dimvalue, str): - path = dimvalue.split('.') - if len(path) > 1: - pointer = kwargs - for el in path: - try: - if hasattr(pointer, el): - pointer = getattr(pointer, el) - elif el in pointer: - pointer = pointer[el] - else: - raise RuntimeError - except: - raise RuntimeError - else: - pointer = path - - try: - pointer = int(pointer) - except Exception as e: - print('Exception during parsing and checking:', e) - raise e - shape[idxdim] = pointer - - kwargs['task'] = None - - return kwargs - def build_R2D2_Agent(task: 'regym.environments.Task', config: Dict, agent_name: str): @@ -218,6 +187,20 @@ def build_R2D2_Agent(task: 'regym.environments.Task', name=f"{agent_name}_algo", ) + if kwargs.get('use_HER', False): + from regym.rl_algorithms.algorithms.wrappers import latent_based_goal_predicated_reward_fn2 + goal_predicated_reward_fn = None + if kwargs.get('HER_use_latent', False): + goal_predicated_reward_fn = latent_based_goal_predicated_reward_fn2 + + algorithm = HERAlgorithmWrapper2( + algorithm=algorithm, + strategy=kwargs['HER_strategy'], + goal_predicated_reward_fn=goal_predicated_reward_fn, + extra_inputs_infos=kwargs['extra_inputs_infos'], + ) + + agent = R2D2Agent( name=agent_name, algorithm=algorithm, diff --git a/regym/rl_algorithms/agents/r2d3_agent.py b/regym/rl_algorithms/agents/r2d3_agent.py index c51831ae..a5fa130b 100644 --- a/regym/rl_algorithms/agents/r2d3_agent.py +++ b/regym/rl_algorithms/agents/r2d3_agent.py @@ -6,7 +6,7 @@ import copy import torch -import minerl +#import minerl import regym from .dqn_agent import generate_model @@ -14,10 +14,9 @@ from ..algorithms.R2D3 import R2D3Algorithm from regym.rl_algorithms.networks import PreprocessFunction -from regym.util.minerl import get_action_set, generate_action_parser, MineRLTrajectoryBasedEnv, trajectory_based_rl_loop, get_good_demo_names +#from regym.util.minerl import get_action_set, generate_action_parser, MineRLTrajectoryBasedEnv, trajectory_based_rl_loop, get_good_demo_names from regym.util.wrappers import minerl2020_wrap_env from regym.environments.vec_env import VecEnv -from sklearn.metrics import 
pairwise_distances import numpy as np class R2D3Agent(R2D2Agent): @@ -71,6 +70,7 @@ def __init__(self, task_name, trajectory_names: List[str], wrapping_fn=None, act self.next_env_pointer = 0 # Next environment index to create self.envs = [] + import minerl for trajectory_name in self.trajectory_names: data_pipeline = minerl.data.make(task_name) data_iterator = data_pipeline.load_data(trajectory_name) @@ -85,11 +85,12 @@ def __call__(self, worker_id=None, seed=0): return env def action_parser(action, action_set): - true_action = action['vector'] if isinstance(action, dict) else action - dis = pairwise_distances(action_set, true_action.reshape(1, -1)) - discrete_action = np.argmin(dis, axis=0) - # (1,) - return discrete_action + from sklearn.metrics import pairwise_distances + true_action = action['vector'] if isinstance(action, dict) else action + dis = pairwise_distances(action_set, true_action.reshape(1, -1)) + discrete_action = np.argmin(dis, axis=0) + # (1,) + return discrete_action def load_demonstrations_into_replay_buffer( agent, diff --git a/regym/rl_algorithms/agents/random_agent.py b/regym/rl_algorithms/agents/random_agent.py new file mode 100644 index 00000000..83d1f5d9 --- /dev/null +++ b/regym/rl_algorithms/agents/random_agent.py @@ -0,0 +1,67 @@ +from typing import Dict, Optional, List +import random + +import gym +import numpy as np + +from regym.environments import EnvType +from .agent import Agent + + +class RandomAgent(Agent): + def __init__(self, name: str, action_space: gym.spaces.Space, action_space_dim: int): + self.name = name + self.action_space = action_space + self.action_space_dim = action_space_dim + self.recurrent = False + + def set_nbr_actor(self, n): + pass + + def reset_actors(self, indices:Optional[List]=[], init:Optional[bool]=False): + pass + + def preprocess_environment_signals(self, state, reward, succ_state, done): + pass + + def take_action(self, state, infos: List[Dict]): + legal_actions = [info['legal_actions'] for info in infos] + if legal_actions: + # Hope that legal actions is defined as a list of lists! + actions = [ + random.choice( + np.argwhere(legal_actions[i].squeeze() == 1) + ) + for i in range(len(legal_actions)) + ] + else: + actions = [self.action_space.sample() + for _ in range(len(observations))] + return actions + + def handle_experience(self, s, a, r, succ_s, done, goals=None, infos=None): + pass + + def get_async_actor(self, training=None, with_replay_buffer=False): + pass + + def clone(self): + return RandomAgent(name=self.name, action_space=self.action_space, action_space_dim=self.action_space_dim) + + def __repr__(self): + return f'{self.name}. 
Action space: {self.action_space}' + + +def build_Random_Agent(task, config, agent_name: str) -> RandomAgent: + ''' + Builds an agent that is able to randomly act in a task + + :param task: Task in which the agent will be able to act + :param config: Ignored, left here to keep `build_X_Agent` interface consistent + :param name: String identifier + ''' + # TODO: + if task.env_type == EnvType.SINGLE_AGENT: action_space = task.env.action_space + # Assumes all agents share same action space + else: action_space = task.env.action_space + return RandomAgent(name=agent_name, action_space=action_space, action_space_dim=task.action_dim) diff --git a/regym/rl_algorithms/agents/ther2_agent.py b/regym/rl_algorithms/agents/ther2_agent.py new file mode 100644 index 00000000..e919ebf2 --- /dev/null +++ b/regym/rl_algorithms/agents/ther2_agent.py @@ -0,0 +1,417 @@ +import torch +import numpy as np +import copy +import random + +from ..algorithms.DQN import DQNAlgorithm +from ..algorithms.THER import ther_predictor_loss +from ..algorithms.wrappers import THERAlgorithmWrapper2, predictor_based_goal_predicated_reward_fn2 + +from ..networks import CategoricalQNet, InstructionPredictor +from ..networks import FCBody, LSTMBody, GRUBody, EmbeddingRNNBody, CaptionRNNBody +from ..networks import ConvolutionalBody, BetaVAEBody, resnet18Input64, ConvolutionalGruBody, ConvolutionalLstmBody +from ..networks import NoisyLinear +from ..networks import PreprocessFunction, ResizeCNNPreprocessFunction, ResizeCNNInterpolationFunction + +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from functools import partial + +from .dqn_her_agent import DQNHERAgent +from gym.spaces import Dict + +from ..algorithms import dqn_ther_loss +from ..algorithms import ddqn_ther_loss + +class THER2Agent(DQNHERAgent): + def clone(self, training=None, with_replay_buffer=False, clone_proxies=False, minimal=False): + cloned_algo = self.algorithm.clone( + with_replay_buffer=with_replay_buffer, + clone_proxies=clone_proxies, + minimal=minimal + ) + clone = THER2Agent( + name=self.name, + algorithm=cloned_algo, + extra_inputs_infos=self.extra_inputs_infos + ) + clone.save_path = self.save_path + + clone.actor_learner_shared_dict = self.actor_learner_shared_dict + clone._handled_experiences = self._handled_experiences + clone.episode_count = self.episode_count + if training is not None: clone.training = training + clone.nbr_steps = self.nbr_steps + + # Goes through all variables 'Proxy' (dealing with multiprocessing) + # contained in this class and removes them from clone + if not(clone_proxies): + proxy_key_values = [ + (key, value) + for key, value in clone.__dict__.items() + if ('Proxy' in str(type(value))) + ] + for key, value in proxy_key_values: + setattr(clone, key, None) + + return clone + + def get_async_actor(self, training=None, with_replay_buffer=False): + self.async_learner = True + self.async_actor = False + + cloned_algo = self.algorithm.async_actor() + clone = THER2Agent( + name=self.name, + algorithm=cloned_algo, + extra_inputs_infos=self.extra_inputs_infos + ) + + clone.async_learner = False + clone.async_actor = True + + ###################################### + ###################################### + # Update actor_learner_shared_dict: + ###################################### + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + actor_learner_shared_dict = ray.get(self.actor_learner_shared_dict.get.remote()) + else: + actor_learner_shared_dict = self.actor_learner_shared_dict.get() + 
# Increase the size of the list of toggle booleans: + actor_learner_shared_dict["models_update_required"] += [False] + + # Update the (Ray)SharedVariable + if isinstance(self.actor_learner_shared_dict, ray.actor.ActorHandle): + self.actor_learner_shared_dict.set.remote(actor_learner_shared_dict) + else: + self.actor_learner_shared_dict.set(actor_learner_shared_dict) + + ###################################### + # Update the async_actor index: + clone.async_actor_idx = len(actor_learner_shared_dict["models_update_required"])-1 + + ###################################### + ###################################### + + clone.actor_learner_shared_dict = self.actor_learner_shared_dict + clone._handled_experiences = self._handled_experiences + clone.episode_count = self.episode_count + if training is not None: clone.training = training + clone.nbr_steps = self.nbr_steps + return clone + + +def build_THER2_Agent(task, config, agent_name): + ''' + :param task: Environment specific configuration + :param config: Dict containing configuration for ppo agent + :param agent_name: name of the agent + :returns: THERAgent adapted to be trained on :param: task under :param: config + ''' + + ''' + Note: Input values are not normalized as RGB values, ever! + Indeed, they are not RGB values... cf gym_miniworld doc... + ''' + + kwargs = config.copy() + kwargs['THER_predictor_learning_rate'] = float(kwargs['THER_predictor_learning_rate']) + + kwargs['discount'] = float(kwargs['discount']) + kwargs['replay_capacity'] = int(float(kwargs['replay_capacity'])) + kwargs['min_capacity'] = int(float(kwargs['min_capacity'])) + + kwargs['THER_vocabulary'] = set(kwargs['THER_vocabulary']) + kwargs['THER_max_sentence_length'] = int(kwargs['THER_max_sentence_length']) + + # Default preprocess function: + kwargs['state_preprocess'] = partial(PreprocessFunction, normalization=False) + kwargs['goal_preprocess'] = partial(PreprocessFunction, normalization=False) + + if 'None' in kwargs['observation_resize_dim']: kwargs['observation_resize_dim'] = task.observation_shape[0] if isinstance(task.observation_shape, tuple) else task.observation_shape + if 'None' in kwargs['goal_resize_dim']: kwargs['goal_resize_dim'] = task.goal_shape[0] if isinstance(task.goal_shape, tuple) else task.goal_shape + + kwargs = parse_and_check(kwargs, task) + + phi_body = None + input_dim = list(task.observation_shape) + if kwargs['goal_oriented']: + goal_input_shape = list(task.goal_shape) + if 'goal_state_flattening' in kwargs and kwargs['goal_state_flattening']: + if isinstance(input_dim, int): + input_dim = input_dim+goal_input_shape + else: + input_dim[-1] = input_dim[-1]+goal_input_shape[-1] + + if kwargs['phi_arch'] != 'None': + output_dim = kwargs['phi_arch_feature_dim'] + if kwargs['phi_arch'] == 'LSTM-RNN': + phi_body = LSTMBody(input_dim, hidden_units=(output_dim,), gate=F.leaky_relu) + elif kwargs['phi_arch'] == 'GRU-RNN': + phi_body = GRUBody(input_dim, hidden_units=(output_dim,), gate=F.leaky_relu) + elif kwargs['phi_arch'] == 'MLP': + phi_body = FCBody(input_dim, hidden_units=(output_dim, ), gate=F.leaky_relu) + elif kwargs['phi_arch'] == 'CNN': + # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: + #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) + kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['observation_resize_dim'], normalize_rgb_values=False) + kwargs['preprocessed_observation_shape'] = 
[input_dim[-1], kwargs['observation_resize_dim'], kwargs['observation_resize_dim']] + if 'nbr_frame_stacking' in kwargs: + kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] + input_shape = kwargs['preprocessed_observation_shape'] + channels = [input_shape[0]] + kwargs['phi_arch_channels'] + kernels = kwargs['phi_arch_kernels'] + strides = kwargs['phi_arch_strides'] + paddings = kwargs['phi_arch_paddings'] + output_dim = kwargs['phi_arch_feature_dim'] + phi_body = ConvolutionalBody(input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings) + else: + # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: + #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) + kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['observation_resize_dim'], normalize_rgb_values=False) + kwargs['preprocessed_observation_shape'] = [input_dim[-1], kwargs['observation_resize_dim'], kwargs['observation_resize_dim']] + if 'nbr_frame_stacking' in kwargs: + kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] + input_shape = kwargs['preprocessed_observation_shape'] + channels = [input_shape[0]] + kwargs['phi_arch_channels'] + kernels = kwargs['phi_arch_kernels'] + strides = kwargs['phi_arch_strides'] + paddings = kwargs['phi_arch_paddings'] + output_dim = kwargs['phi_arch_hidden_units'][-1] + if kwargs['phi_arch'] == 'CNN-GRU-RNN': + phi_body = ConvolutionalGruBody( + input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings, + hidden_units=kwargs['phi_arch_hidden_units'] + ) + elif kwargs['phi_arch'] == 'CNN-LSTM-RNN': + phi_body = ConvolutionalLstmBody( + input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings, + hidden_units=kwargs['phi_arch_hidden_units'] + ) + else : + raise NotImplementedError + + input_dim = output_dim + + + goal_phi_body = None + if kwargs['goal_oriented']: + goal_input_shape = task.goal_shape + if 'goal_state_flattening' in kwargs and kwargs['goal_state_flattening']: + kwargs['goal_preprocess'] = kwargs['state_preprocess'] + + if 'goal_state_shared_arch' in kwargs and kwargs['goal_state_shared_arch']: + kwargs['goal_preprocess'] = kwargs['state_preprocess'] + if 'preprocessed_observation_shape' in kwargs: + kwargs['preprocessed_goal_shape'] = kwargs['preprocessed_observation_shape'] + goal_input_shape = kwargs['preprocessed_goal_shape'] + goal_phi_body = None + + elif kwargs['goal_phi_arch'] != 'None': + output_dim = 256 + if kwargs['goal_phi_arch'] == 'EmbedLSTM': + num_layers = len(kwargs['goal_phi_arch_hidden_units']) + voc_size = task.goal_shape[0] + goal_phi_body = EmbeddingRNNBody(voc_size=voc_size, + embedding_size=kwargs['goal_phi_arch_embedding_size'], + hidden_units=kwargs['goal_phi_arch_hidden_units'], + num_layers=num_layers, + gate=F.relu, + dropout=0.0, + rnn_fn=nn.LSTM) + output_dim = kwargs['goal_phi_arch_hidden_units'][-1] + elif kwargs['goal_phi_arch'] == 'EmbedGRU': + num_layers = len(kwargs['goal_phi_arch_hidden_units']) + voc_size = task.goal_shape[0] + goal_phi_body = EmbeddingRNNBody(voc_size=voc_size, + embedding_size=kwargs['goal_phi_arch_embedding_size'], + hidden_units=kwargs['goal_phi_arch_hidden_units'], + num_layers=num_layers, + gate=F.relu, + dropout=0.0, 
+ rnn_fn=nn.GRU) + output_dim = kwargs['goal_phi_arch_hidden_units'][-1] + elif kwargs['goal_phi_arch'] == 'MLP': + goal_phi_body = FCBody(goal_input_shape, hidden_units=kwargs['goal_phi_arch_hidden_units'], gate=F.leaky_relu) + elif kwargs['goal_phi_arch'] == 'CNN': + # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: + kwargs['goal_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['goal_resize_dim'], normalize_rgb_values=True) + kwargs['preprocessed_goal_shape'] = [task.goal_shape[-1], kwargs['goal_resize_dim'], kwargs['goal_resize_dim']] + if 'nbr_frame_stacking' in kwargs: + kwargs['preprocessed_goal_shape'][0] *= kwargs['nbr_frame_stacking'] + input_shape = kwargs['preprocessed_goal_shape'] + channels = [goal_shape[0]] + kwargs['goal_phi_arch_channels'] + kernels = kwargs['goal_phi_arch_kernels'] + strides = kwargs['goal_phi_arch_strides'] + paddings = kwargs['goal_phi_arch_paddings'] + output_dim = kwargs['goal_phi_arch_feature_dim'] + goal_phi_body = ConvolutionalBody(input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings) + input_dim += output_dim + + + critic_body = None + layer_fn = nn.Linear + if kwargs['noisy']: layer_fn = NoisyLinear + if kwargs['critic_arch'] != 'None': + output_dim = 256 + if kwargs['critic_arch'] == 'RNN': + critic_body = LSTMBody(input_dim, hidden_units=(output_dim,), gate=F.leaky_relu) + elif kwargs['critic_arch'] == 'MLP': + hidden_units=(output_dim,) + if 'critic_arch_hidden_units' in kwargs: + hidden_units = tuple(kwargs['critic_arch_hidden_units']) + critic_body = FCBody(input_dim, hidden_units=hidden_units, gate=F.leaky_relu, layer_fn=layer_fn) + elif kwargs['critic_arch'] == 'CNN': + # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: + #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) + kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['observation_resize_dim'], normalize_rgb_values=True) + kwargs['preprocessed_observation_shape'] = [input_dim[-1], kwargs['observation_resize_dim'], kwargs['observation_resize_dim']] + if 'nbr_frame_stacking' in kwargs: + kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] + input_shape = kwargs['preprocessed_observation_shape'] + channels = [input_shape[0]] + kwargs['critic_arch_channels'] + kernels = kwargs['critic_arch_kernels'] + strides = kwargs['critic_arch_strides'] + paddings = kwargs['critic_arch_paddings'] + output_dim = kwargs['critic_arch_feature_dim'] + critic_body = ConvolutionalBody(input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings) + + + assert(task.action_type == 'Discrete') + + obs_shape = list(task.observation_shape) + if 'preprocessed_observation_shape' in kwargs: obs_shape = kwargs['preprocessed_observation_shape'] + goal_shape = list(task.goal_shape) + if 'preprocessed_goal_shape' in kwargs: goal_shape = kwargs['preprocessed_goal_shape'] + if 'goal_state_flattening' in kwargs and kwargs['goal_state_flattening']: + obs_shape[-1] = obs_shape[-1] + goal_shape[-1] + model = CategoricalQNet(state_dim=obs_shape, + action_dim=task.action_dim, + phi_body=phi_body, + critic_body=critic_body, + dueling=kwargs['dueling'], + noisy=kwargs['noisy'], + goal_oriented=kwargs['goal_oriented'], + goal_shape=goal_shape, + 
goal_phi_body=goal_phi_body) + + model.share_memory() + + + predictor_input_dim = task.observation_shape + if 'preprocessed_observation_shape' in kwargs: predictor_input_dim = list(reversed(kwargs['preprocessed_observation_shape'])) + + if kwargs['predictor_encoder_arch'] == 'LSTM-RNN': + predictor_encoder = LSTMBody(predictor_input_dim, hidden_units=(output_dim,), gate=F.leaky_relu) + elif kwargs['predictor_encoder_arch'] == 'GRU-RNN': + predictor_encoder = GRUBody(predictor_input_dim, hidden_units=(output_dim,), gate=F.leaky_relu) + elif kwargs['predictor_encoder_arch'] == 'MLP': + predictor_encoder = FCBody(predictor_input_dim, hidden_units=(output_dim, ), gate=F.leaky_relu) + elif kwargs['predictor_encoder_arch'] == 'CNN': + # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: + #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) + kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['observation_resize_dim'], normalize_rgb_values=False) + kwargs['preprocessed_observation_shape'] = [predictor_input_dim[-1], kwargs['observation_resize_dim'], kwargs['observation_resize_dim']] + if 'nbr_frame_stacking' in kwargs: + kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] + input_shape = kwargs['preprocessed_observation_shape'] + + if kwargs['THER_predictor_policy_shared_phi']: + predictor_encoder = phi_body.cnn_body + output_dim = predictor_encoder.get_feature_shape() + assert( output_dim == kwargs['predictor_decoder_arch_hidden_units'][-1]) + else: + channels = [input_shape[0]] + kwargs['predictor_encoder_arch_channels'] + kernels = kwargs['predictor_encoder_arch_kernels'] + strides = kwargs['predictor_encoder_arch_strides'] + paddings = kwargs['predictor_encoder_arch_paddings'] + output_dim = kwargs['predictor_encoder_arch_feature_dim'] + predictor_encoder = ConvolutionalBody(input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings) + + predictor_decoder = CaptionRNNBody( + vocabulary=kwargs['THER_vocabulary'], + max_sentence_length=kwargs['THER_max_sentence_length'], + embedding_size=kwargs['predictor_decoder_embedding_size'], + hidden_units=kwargs['predictor_decoder_arch_hidden_units'], + num_layers=1, + gate=F.relu, + dropout=0.0, + rnn_fn=nn.GRU + ) + predictor_decoder.share_memory() + + predictor = InstructionPredictor( + encoder=predictor_encoder, + decoder=predictor_decoder + ) + predictor.share_memory() + + loss_fn = dqn_ther_loss.compute_loss + if kwargs['double'] or kwargs['dueling']: + loss_fn = ddqn_ther_loss.compute_loss + + dqn_algorithm = DQNAlgorithm( + kwargs=kwargs, + model=model, + loss_fn=loss_fn, + ) + + assert('use_HER' in kwargs and kwargs['use_HER']) + + goal_predicated_reward_fn = None + if 'HER_use_latent' in kwargs and kwargs['HER_use_latent']: + from ..algorithms.wrappers import latent_based_goal_predicated_reward_fn2 + goal_predicated_reward_fn = latent_based_goal_predicated_reward_fn2 + + if 'THER_use_predictor' in kwargs and kwargs['THER_use_predictor']: + goal_predicated_reward_fn = partial(predictor_based_goal_predicated_reward_fn2, predictor=predictor) + + ther_algorithm = THERAlgorithmWrapper2( + algorithm=dqn_algorithm, + predictor=predictor, + predictor_loss_fn=ther_predictor_loss.compute_loss, + strategy=kwargs['HER_strategy'], + goal_predicated_reward_fn=goal_predicated_reward_fn + ) + + agent = THER2Agent( + 
name=agent_name, + algorithm=ther_algorithm, + extra_inputs_infos=kwargs['extra_inputs_infos'], + ) + + print(ther_algorithm.get_models()) + + return agent \ No newline at end of file diff --git a/regym/rl_algorithms/agents/ther_agent.py b/regym/rl_algorithms/agents/ther_agent.py index e77d9f1b..a73da8ec 100644 --- a/regym/rl_algorithms/agents/ther_agent.py +++ b/regym/rl_algorithms/agents/ther_agent.py @@ -105,27 +105,7 @@ def build_THER_Agent(task, config, agent_name): kernel_sizes=kernels, strides=strides, paddings=paddings) - elif kwargs['phi_arch'] == 'CNN-GRU-RNN': - # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: - #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) - kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['observation_resize_dim'], normalize_rgb_values=False) - kwargs['preprocessed_observation_shape'] = [input_dim[-1], kwargs['observation_resize_dim'], kwargs['observation_resize_dim']] - if 'nbr_frame_stacking' in kwargs: - kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] - input_shape = kwargs['preprocessed_observation_shape'] - channels = [input_shape[0]] + kwargs['phi_arch_channels'] - kernels = kwargs['phi_arch_kernels'] - strides = kwargs['phi_arch_strides'] - paddings = kwargs['phi_arch_paddings'] - output_dim = kwargs['phi_arch_hidden_units'][-1] - phi_body = ConvolutionalGruBody(input_shape=input_shape, - feature_dim=output_dim, - channels=channels, - kernel_sizes=kernels, - strides=strides, - paddings=paddings, - hidden_units=kwargs['phi_arch_hidden_units']) - elif kwargs['phi_arch'] == 'CNN-LSTM-RNN': + else: # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['observation_resize_dim'], normalize_rgb_values=False) @@ -138,13 +118,29 @@ def build_THER_Agent(task, config, agent_name): strides = kwargs['phi_arch_strides'] paddings = kwargs['phi_arch_paddings'] output_dim = kwargs['phi_arch_hidden_units'][-1] - phi_body = ConvolutionalLstmBody(input_shape=input_shape, - feature_dim=output_dim, - channels=channels, - kernel_sizes=kernels, - strides=strides, - paddings=paddings, - hidden_units=kwargs['phi_arch_hidden_units']) + if kwargs['phi_arch'] == 'CNN-GRU-RNN': + phi_body = ConvolutionalGruBody( + input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings, + hidden_units=kwargs['phi_arch_hidden_units'] + ) + elif kwargs['phi_arch'] == 'CNN-LSTM-RNN': + phi_body = ConvolutionalLstmBody( + input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings, + hidden_units=kwargs['phi_arch_hidden_units'] + ) + else : + raise NotImplementedError + input_dim = output_dim @@ -297,30 +293,39 @@ def build_THER_Agent(task, config, agent_name): strides=strides, paddings=paddings) - predictor_decoder = CaptionRNNBody(vocabulary=kwargs['THER_vocabulary'], - max_sentence_length=kwargs['THER_max_sentence_length'], - embedding_size=kwargs['predictor_decoder_embedding_size'], - hidden_units=kwargs['predictor_decoder_arch_hidden_units'], - num_layers=1, - gate=F.relu, - dropout=0.0, - rnn_fn=nn.GRU) - - predictor = 
InstructionPredictor(encoder=predictor_encoder, decoder=predictor_decoder) - + predictor_decoder = CaptionRNNBody( + vocabulary=kwargs['THER_vocabulary'], + max_sentence_length=kwargs['THER_max_sentence_length'], + embedding_size=kwargs['predictor_decoder_embedding_size'], + hidden_units=kwargs['predictor_decoder_arch_hidden_units'], + num_layers=1, + gate=F.relu, + dropout=0.0, + rnn_fn=nn.GRU + ) + predictor_decoder.share_memory() + + predictor = InstructionPredictor( + encoder=predictor_encoder, + decoder=predictor_decoder + ) predictor.share_memory() loss_fn = dqn_ther_loss.compute_loss if kwargs['double'] or kwargs['dueling']: loss_fn = ddqn_ther_loss.compute_loss - dqn_algorithm = DQNAlgorithm(kwargs, model, loss_fn=loss_fn) + dqn_algorithm = DQNAlgorithm( + kwargs=kwargs, + model=model, + loss_fn=loss_fn + ) assert('use_HER' in kwargs and kwargs['use_HER']) - from ..algorithms.wrappers import latent_based_goal_predicated_reward_fn goal_predicated_reward_fn = None if 'HER_use_latent' in kwargs and kwargs['HER_use_latent']: + from ..algorithms.wrappers import latent_based_goal_predicated_reward_fn goal_predicated_reward_fn = latent_based_goal_predicated_reward_fn if 'THER_use_predictor' in kwargs and kwargs['THER_use_predictor']: diff --git a/regym/rl_algorithms/agents/utils.py b/regym/rl_algorithms/agents/utils.py new file mode 100644 index 00000000..9a748ae4 --- /dev/null +++ b/regym/rl_algorithms/agents/utils.py @@ -0,0 +1,810 @@ +from typing import Dict, List + +from ..networks import CategoricalQNet, CategoricalActorCriticNet, CategoricalActorCriticVAENet, GaussianActorCriticNet +from ..networks import FCBody, FCBody2, LSTMBody, GRUBody, ConvolutionalBody, BetaVAEBody, resnet18Input64 +from ..networks import ConvolutionalGruBody, ConvolutionalLstmBody +from ..networks import LinearLinearBody, LinearLstmBody, LinearLstmBody2 +from ..networks import NoisyLinear + +from ..networks import PreprocessFunction, ResizeCNNPreprocessFunction, ResizeCNNInterpolationFunction + +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import partial + +def parse_and_check(kwargs: Dict, + task: 'regym.environments.Task'): + + # Extra Inputs: + kwargs['task'] = task + + extra_inputs = kwargs['extra_inputs_infos'] + for key in extra_inputs: + shape = extra_inputs[key]['shape'] + for idxdim, dimvalue in enumerate(shape): + if isinstance(dimvalue, str): + path = dimvalue.split('.') + if len(path) > 1: + pointer = kwargs + for el in path: + try: + if hasattr(pointer, el): + pointer = getattr(pointer, el) + elif el in pointer: + pointer = pointer[el] + else: + raise RuntimeError + except: + raise RuntimeError + else: + pointer = path + + try: + pointer = int(pointer) + except Exception as e: + print('Exception during parsing and checking:', e) + raise e + shape[idxdim] = pointer + + kwargs['task'] = None + + return kwargs + + +def generate_model( + task: 'regym.environments.Task', + kwargs: Dict, + head_type: str="CategoricalQNet") -> nn.Module: + + phi_body = None + if isinstance(task.observation_shape, int): + input_dim = task.observation_shape + else: + input_dim = list(task.observation_shape) + + """ + # To deprecate: test if breaks without... 
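For reference, `parse_and_check` above accepts shape entries in `extra_inputs_infos` given as dotted strings (e.g. "task.action_dim") and resolves them by walking each path segment through attributes or dictionary keys before casting the leaf to an int. Below is a minimal, self-contained sketch of that resolution step; the `Task` stand-in, the `resolve_dim` name and the values are illustrative only, not regym's actual API, and the patch branches on single- versus multi-segment paths whereas the sketch folds both cases into one loop.

    from typing import Any, Dict

    class Task:
        # Illustrative stand-in for regym.environments.Task, only for this sketch.
        def __init__(self, action_dim: int):
            self.action_dim = action_dim

    def resolve_dim(dimvalue: Any, kwargs: Dict) -> int:
        # Strings are treated as dotted paths into kwargs; each segment is looked up
        # as an attribute first, then as a dict key, and the leaf is cast to int.
        if not isinstance(dimvalue, str):
            return int(dimvalue)
        pointer = kwargs
        for el in dimvalue.split('.'):
            if hasattr(pointer, el):
                pointer = getattr(pointer, el)
            elif isinstance(pointer, dict) and el in pointer:
                pointer = pointer[el]
            else:
                raise RuntimeError(f"cannot resolve '{el}' in '{dimvalue}'")
        return int(pointer)

    kwargs = {'task': Task(action_dim=6)}
    assert resolve_dim('task.action_dim', kwargs) == 6
    assert resolve_dim(64, kwargs) == 64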
+ if 'goal_oriented' in kwargs and kwargs['goal_oriented']: + goal_input_shape = list(task.goal_shape) + if 'goal_state_flattening' in kwargs and kwargs['goal_state_flattening']: + if isinstance(input_dim, int): + input_dim = input_dim+goal_input_shape + else: + input_dim[-1] = input_dim[-1]+goal_input_shape[-1] + """ + + if kwargs['phi_arch'] != 'None': + output_dim = kwargs['phi_arch_feature_dim'] + if kwargs['phi_arch'] == 'LSTM-RNN': + phi_body = LSTMBody(input_dim, hidden_units=(output_dim,), gate=F.leaky_relu) + elif kwargs['phi_arch'] == 'GRU-RNN': + phi_body = GRUBody(input_dim, hidden_units=(output_dim,), gate=F.leaky_relu) + elif kwargs['phi_arch'] == 'MLP': + hidden_units=kwargs['phi_arch_hidden_units'] + hidden_units += [output_dim] + + extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) + extra_inputs_infos_phi_body = {} + if extra_inputs_infos != {}: + for key in extra_inputs_infos: + shape = extra_inputs_infos[key]['shape'] + tll = extra_inputs_infos[key]['target_location'] + if not isinstance(tll[0], list): + tll= [tll] + for tl in tll: + if 'phi_body' in tl: + extra_inputs_infos_phi_body[key] = { + 'shape':shape, + 'target_location':tl + } + if extra_inputs_infos_phi_body == {}: + phi_body = FCBody( + input_dim, + hidden_units=hidden_units, + ) + else: + phi_body = FCBody2( + input_dim, + hidden_units=hidden_units, + extra_inputs_infos=extra_inputs_infos_phi_body + ) + + elif kwargs['phi_arch'] == 'CNN': + # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: + if isinstance(kwargs['observation_resize_dim'], int): + input_height, input_width = kwargs['observation_resize_dim'], kwargs['observation_resize_dim'] + else: + input_height, input_width = kwargs['observation_resize_dim'] + + kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=input_height, normalize_rgb_values=True) + kwargs['preprocessed_observation_shape'] = [input_dim[-1], input_height, input_width] + if 'nbr_frame_stacking' in kwargs: + kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] + input_shape = kwargs['preprocessed_observation_shape'] + channels = [input_shape[0]] + kwargs['phi_arch_channels'] + kernels = kwargs['phi_arch_kernels'] + strides = kwargs['phi_arch_strides'] + paddings = kwargs['phi_arch_paddings'] + output_dim = kwargs['phi_arch_feature_dim'] + + # Selecting Extra Inputs Infos relevant to phi_body: + extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) + extra_inputs_infos_phi_body = {} + if extra_inputs_infos != {}: + for key in extra_inputs_infos: + shape = extra_inputs_infos[key]['shape'] + tll = extra_inputs_infos[key]['target_location'] + if not isinstance(tll[0], list): + tll= [tll] + for tl in tll: + if 'phi_body' in tl: + extra_inputs_infos_phi_body[key] = { + 'shape':shape, + 'target_location':tl + } + phi_body = ConvolutionalBody( + input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings, + extra_inputs_infos=extra_inputs_infos_phi_body, + ) + + elif kwargs['phi_arch'] == 'ResNet18': + # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: + #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) + kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['observation_resize_dim'], normalize_rgb_values=True) + kwargs['preprocessed_observation_shape'] = [input_dim[-1], 
kwargs['observation_resize_dim'], kwargs['observation_resize_dim']] + if 'nbr_frame_stacking' in kwargs: + kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] + input_shape = kwargs['preprocessed_observation_shape'] + output_dim = kwargs['phi_arch_feature_dim'] + phi_body = resnet18Input64(input_shape=input_shape, output_dim=output_dim) + elif kwargs['phi_arch'] == 'CNN-GRU-RNN': + # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: + #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) + kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['observation_resize_dim'], normalize_rgb_values=True) + kwargs['preprocessed_observation_shape'] = [input_dim[-1], kwargs['observation_resize_dim'], kwargs['observation_resize_dim']] + if 'nbr_frame_stacking' in kwargs: + kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] + input_shape = kwargs['preprocessed_observation_shape'] + channels = [input_shape[0]] + kwargs['phi_arch_channels'] + kernels = kwargs['phi_arch_kernels'] + strides = kwargs['phi_arch_strides'] + paddings = kwargs['phi_arch_paddings'] + output_dim = kwargs['phi_arch_hidden_units'][-1] + phi_body = ConvolutionalGruBody(input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings, + hidden_units=kwargs['phi_arch_hidden_units']) + elif kwargs['phi_arch'] == 'CNN-LSTM-RNN': + # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: + #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) + kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['observation_resize_dim'], normalize_rgb_values=True) + kwargs['preprocessed_observation_shape'] = [input_dim[-1], kwargs['observation_resize_dim'], kwargs['observation_resize_dim']] + if 'nbr_frame_stacking' in kwargs: + kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] + input_shape = kwargs['preprocessed_observation_shape'] + channels = [input_shape[0]] + kwargs['phi_arch_channels'] + kernels = kwargs['phi_arch_kernels'] + strides = kwargs['phi_arch_strides'] + paddings = kwargs['phi_arch_paddings'] + output_dim = kwargs['phi_arch_feature_dim'] # TODO: figure out if this breaks anything else + + # Selecting Extra Inputs Infos relevant to phi_body: + extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) + extra_inputs_infos_phi_body = {} + if extra_inputs_infos != {}: + for key in extra_inputs_infos: + shape = extra_inputs_infos[key]['shape'] + tll = extra_inputs_infos[key]['target_location'] + if not isinstance(tll[0], list): + tll= [tll] + for tl in tll: + if 'phi_body' in tl: + extra_inputs_infos_phi_body[key] = { + 'shape':shape, + 'target_location':tl + } + + phi_body = ConvolutionalLstmBody(input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings, + extra_inputs_infos=extra_inputs_infos_phi_body, + hidden_units=kwargs['phi_arch_hidden_units']) + input_dim = output_dim + + + goal_phi_body = None + if 'goal_oriented' in kwargs and kwargs['goal_oriented']: + goal_input_shape = task.goal_shape + if 'goal_state_flattening' in kwargs and kwargs['goal_state_flattening']: + kwargs['goal_preprocess'] = kwargs['state_preprocess'] + + if 'goal_state_shared_arch' in kwargs 
and kwargs['goal_state_shared_arch']: + kwargs['goal_preprocess'] = kwargs['state_preprocess'] + if 'preprocessed_observation_shape' in kwargs: + kwargs['preprocessed_goal_shape'] = kwargs['preprocessed_observation_shape'] + goal_input_shape = kwargs['preprocessed_goal_shape'] + goal_phi_body = None + + elif kwargs['goal_phi_arch'] != 'None': + output_dim = 256 + if kwargs['goal_phi_arch'] == 'LSTM-RNN': + phi_body = LSTMBody(goal_input_shape, hidden_units=(output_dim,), gate=F.leaky_relu) + elif kwargs['goal_phi_arch'] == 'GRU-RNN': + phi_body = GRUBody(goal_input_shape, hidden_units=(output_dim,), gate=F.leaky_relu) + elif kwargs['goal_phi_arch'] == 'MLP': + phi_body = FCBody(goal_input_shape, hidden_units=(output_dim, ), gate=F.leaky_relu) + elif kwargs['goal_phi_arch'] == 'CNN': + # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: + kwargs['goal_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['goal_resize_dim'], normalize_rgb_values=True) + kwargs['preprocessed_goal_shape'] = [task.goal_shape[-1], kwargs['goal_resize_dim'], kwargs['goal_resize_dim']] + if 'nbr_frame_stacking' in kwargs: + kwargs['preprocessed_goal_shape'][0] *= kwargs['nbr_frame_stacking'] + input_shape = kwargs['preprocessed_goal_shape'] + channels = [goal_shape[0]] + kwargs['goal_phi_arch_channels'] + kernels = kwargs['goal_phi_arch_kernels'] + strides = kwargs['goal_phi_arch_strides'] + paddings = kwargs['goal_phi_arch_paddings'] + output_dim = kwargs['goal_phi_arch_feature_dim'] + phi_body = ConvolutionalBody(input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings) + elif kwargs['goal_phi_arch'] == 'ResNet18': + # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: + kwargs['goal_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['goal_resize_dim'], normalize_rgb_values=True) + kwargs['preprocessed_goal_shape'] = [task.goal_shape[-1], kwargs['goal_resize_dim'], kwargs['goal_resize_dim']] + if 'nbr_frame_stacking' in kwargs: + kwargs['preprocessed_goal_shape'][0] *= kwargs['nbr_frame_stacking'] + input_shape = kwargs['preprocessed_goal_shape'] + output_dim = kwargs['goal_phi_arch_feature_dim'] + phi_body = resnet18Input64(input_shape=input_shape, output_dim=output_dim) + elif kwargs['goal_phi_arch'] == 'CNN-GRU-RNN': + # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: + kwargs['goal_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['goal_resize_dim'], normalize_rgb_values=True) + kwargs['preprocessed_goal_shape'] = [task.goal_shape[-1], kwargs['goal_resize_dim'], kwargs['goal_resize_dim']] + if 'nbr_frame_stacking' in kwargs: + kwargs['preprocessed_goal_shape'][0] *= kwargs['nbr_frame_stacking'] + input_shape = kwargs['preprocessed_goal_shape'] + channels = [input_shape[0]] + kwargs['goal_phi_arch_channels'] + kernels = kwargs['goal_phi_arch_kernels'] + strides = kwargs['goal_phi_arch_strides'] + paddings = kwargs['goal_phi_arch_paddings'] + output_dim = kwargs['goal_phi_arch_hidden_units'][-1] + goal_phi_body = ConvolutionalGruBody(input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings, + hidden_units=kwargs['phi_arch_hidden_units']) + input_dim += output_dim + + + actor_body = None + critic_body = None + layer_fn = nn.Linear + if 'noisy' in kwargs 
and kwargs['noisy']: layer_fn = NoisyLinear + + if "actor_arch" in kwargs \ + and kwargs['actor_arch'] != 'None': + output_dim = 256 + if kwargs['actor_arch'] == 'LSTM-RNN' or kwargs['actor_arch'] == 'GRU-RNN': + #critic_body = LSTMBody(input_dim, hidden_units=(output_dim,), gate=F.leaky_relu) + state_dim = input_dim + actor_arch_hidden_units = kwargs['actor_arch_hidden_units'] + + # Selecting Extra Inputs Infos relevant to phi_body: + extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) + extra_inputs_infos_actor_body = {} + if extra_inputs_infos != {}: + for key in extra_inputs_infos: + shape = extra_inputs_infos[key]['shape'] + tll = extra_inputs_infos[key]['target_location'] + if not isinstance(tll[0], list): + tll= [tll] + for tl in tll: + if 'actor_body' in tl: + extra_inputs_infos_actor_body[key] = { + 'shape':shape, + 'target_location':tl + } + + gate = None + if 'use_relu_after_rnn' in kwargs \ + and kwargs['use_relu_after_rnn']: + import ipdb; ipdb.set_trace() + gate = F.relu + + if kwargs['actor_arch'] == 'LSTM-RNN': + actor_body = LSTMBody( + state_dim=state_dim, + hidden_units=actor_arch_hidden_units, + gate=gate, + extra_inputs_infos=extra_inputs_infos_actor_body, + ) + else: + actor_body = GRUBody( + state_dim=state_dim, + hidden_units=actor_arch_hidden_units, + gate=gate, + extra_inputs_infos=extra_inputs_infos_actor_body, + ) + elif kwargs['actor_arch'] == 'MLP': + hidden_units=(output_dim,) + if 'actor_arch_hidden_units' in kwargs: + hidden_units = list(kwargs['actor_arch_hidden_units']) + actor_body = FCBody(input_dim, hidden_units=hidden_units, gate=F.leaky_relu) + elif kwargs['actor_arch'] == 'CNN': + # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: + #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) + kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['observation_resize_dim'], normalize_rgb_values=True) + kwargs['preprocessed_observation_shape'] = [input_dim[-1], kwargs['observation_resize_dim'], kwargs['observation_resize_dim']] + if 'nbr_frame_stacking' in kwargs: + kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] + input_shape = kwargs['preprocessed_observation_shape'] + channels = [input_shape[0]] + kwargs['actor_arch_channels'] + kernels = kwargs['actor_arch_kernels'] + strides = kwargs['actor_arch_strides'] + paddings = kwargs['actor_arch_paddings'] + output_dim = kwargs['actor_arch_feature_dim'] + actor_body = ConvolutionalBody(input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings) + elif kwargs['actor_arch'] == 'MLP-LSTM-RNN': + # Assuming flatten input: + #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) + state_dim = input_dim + actor_arch_feature_dim = kwargs['actor_arch_feature_dim'] + actor_arch_hidden_units = kwargs['actor_arch_hidden_units'] + + # Selecting Extra Inputs Infos relevant to phi_body: + extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) + extra_inputs_infos_actor_body = {} + if extra_inputs_infos != {}: + for key in extra_inputs_infos: + shape = extra_inputs_infos[key]['shape'] + tll = extra_inputs_infos[key]['target_location'] + if not isinstance(tll[0], list): + tll= [tll] + for tl in tll: + if 'actor_body' in tl: + extra_inputs_infos_actor_body[key] = { + 'shape':shape, + 'target_location':tl + } + + gate = None + if 
'use_relu_after_rnn' in kwargs \ + and kwargs['use_relu_after_rnn']: + import ipdb; ipdb.set_trace() + gate = F.relu + + actor_body = LinearLstmBody( + state_dim=state_dim, + feature_dim=actor_arch_feature_dim, + hidden_units=actor_arch_hidden_units, + non_linearities=[nn.ReLU], + gate=gate, + dropout=0.0, + add_non_lin_final_layer=True, + #layer_init_fn=None, + extra_inputs_infos=extra_inputs_infos_actor_body, + ) + + elif kwargs['actor_arch'] == 'MLP-MLP-RNN': + # Assuming flatten input: + #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) + state_dim = input_dim + actor_arch_feature_dim = kwargs['actor_arch_feature_dim'] + actor_arch_hidden_units = kwargs['actor_arch_hidden_units'] + + # Selecting Extra Inputs Infos relevant to phi_body: + extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) + extra_inputs_infos_actor_body = {} + if extra_inputs_infos != {}: + for key in extra_inputs_infos: + shape = extra_inputs_infos[key]['shape'] + tll = extra_inputs_infos[key]['target_location'] + if not isinstance(tll[0], list): + tll= [tll] + for tl in tll: + if 'actor_body' in tl: + extra_inputs_infos_actor_body[key] = { + 'shape':shape, + 'target_location':tl + } + + actor_body = LinearLinearBody( + state_dim=state_dim, + feature_dim=actor_arch_feature_dim, + hidden_units=actor_arch_hidden_units, + non_linearities=[nn.ReLU], + gate=F.relu, + dropout=0.0, + add_non_lin_final_layer=True, + #layer_init_fn=None, + extra_inputs_infos=extra_inputs_infos_actor_body, + ) + elif kwargs['actor_arch'] == 'MLP-LSTM-RNN2': + # Assuming flatten input: + #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) + state_dim = input_dim + actor_arch_feature_dim = kwargs['actor_arch_feature_dim'] + actor_arch_linear_hidden_units = kwargs['actor_arch_linear_hidden_units'] + actor_arch_linear_post_hidden_units = None + if 'actor_arch_linear_post_hidden_units' in kwargs: + actor_arch_linear_post_hidden_units = kwargs['actor_arch_linear_post_hidden_units'] + actor_arch_hidden_units = kwargs['actor_arch_hidden_units'] + + # Selecting Extra Inputs Infos relevant to phi_body: + extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) + extra_inputs_infos_actor_body = {} + if extra_inputs_infos != {}: + for key in extra_inputs_infos: + shape = extra_inputs_infos[key]['shape'] + tll = extra_inputs_infos[key]['target_location'] + if not isinstance(tll[0], list): + tll= [tll] + for tl in tll: + if 'actor_body' in tl: + extra_inputs_infos_actor_body[key] = { + 'shape':shape, + 'target_location':tl + } + + gate = None + if 'use_relu_after_rnn' in kwargs \ + and kwargs['use_relu_after_rnn']: + import ipdb; ipdb.set_trace() + gate = F.relu + + use_residual_connection = False + if 'use_residual_connection' in kwargs \ + and kwargs['use_residual_connection']: + import ipdb; ipdb.set_trace() + use_residual_connection = kwargs['use_residual_connection'] + + actor_body = LinearLstmBody2( + state_dim=state_dim, + feature_dim=actor_arch_feature_dim, + linear_hidden_units=actor_arch_linear_hidden_units, + linear_post_hidden_units=actor_arch_linear_post_hidden_units, + hidden_units=actor_arch_hidden_units, + non_linearities=[nn.ReLU], + gate=gate, + dropout=0.0, + add_non_lin_final_layer=True, + use_residual_connection=use_residual_connection, + #layer_init_fn=None, + extra_inputs_infos=extra_inputs_infos_actor_body, + ) + + # CRITIC: + + if kwargs['critic_arch'] != 'None': + output_dim = 256 + if kwargs['critic_arch'] == 
'LSTM-RNN': + #critic_body = LSTMBody(input_dim, hidden_units=(output_dim,), gate=F.leaky_relu) + state_dim = input_dim + critic_arch_hidden_units = kwargs['critic_arch_hidden_units'] + + # Selecting Extra Inputs Infos relevant to phi_body: + extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) + extra_inputs_infos_critic_body = {} + if extra_inputs_infos != {}: + for key in extra_inputs_infos: + shape = extra_inputs_infos[key]['shape'] + tll = extra_inputs_infos[key]['target_location'] + if not isinstance(tll[0], list): + tll= [tll] + for tl in tll: + if 'critic_body' in tl: + extra_inputs_infos_critic_body[key] = { + 'shape':shape, + 'target_location':tl + } + + gate = None + if 'use_relu_after_rnn' in kwargs \ + and kwargs['use_relu_after_rnn']: + import ipdb; ipdb.set_trace() + gate = F.relu + + critic_body = LSTMBody( + state_dim=state_dim, + hidden_units=critic_arch_hidden_units, + gate=gate, + extra_inputs_infos=extra_inputs_infos_critic_body, + ) + elif kwargs['critic_arch'] == 'GRU-RNN': + state_dim = input_dim + critic_arch_hidden_units = kwargs['critic_arch_hidden_units'] + + # Selecting Extra Inputs Infos relevant to phi_body: + extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) + extra_inputs_infos_critic_body = {} + if extra_inputs_infos != {}: + for key in extra_inputs_infos: + shape = extra_inputs_infos[key]['shape'] + tll = extra_inputs_infos[key]['target_location'] + if not isinstance(tll[0], list): + tll= [tll] + for tl in tll: + if 'critic_body' in tl: + extra_inputs_infos_critic_body[key] = { + 'shape':shape, + 'target_location':tl + } + + gate = None + if 'use_relu_after_rnn' in kwargs \ + and kwargs['use_relu_after_rnn']: + import ipdb; ipdb.set_trace() + gate = F.relu + + critic_body = GRUBody( + state_dim=state_dim, + hidden_units=critic_arch_hidden_units, + gate=gate, + extra_inputs_infos=extra_inputs_infos_critic_body, + ) + elif kwargs['critic_arch'] == 'MLP': + hidden_units=(output_dim,) + if 'critic_arch_hidden_units' in kwargs: + hidden_units = list(kwargs['critic_arch_hidden_units']) + critic_body = FCBody(input_dim, hidden_units=hidden_units, gate=F.leaky_relu) + elif kwargs['critic_arch'] == 'CNN': + # Assuming raw pixels input, the shape is dependant on the observation_resize_dim specified by the user: + #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) + kwargs['state_preprocess'] = partial(ResizeCNNInterpolationFunction, size=kwargs['observation_resize_dim'], normalize_rgb_values=True) + kwargs['preprocessed_observation_shape'] = [input_dim[-1], kwargs['observation_resize_dim'], kwargs['observation_resize_dim']] + if 'nbr_frame_stacking' in kwargs: + kwargs['preprocessed_observation_shape'][0] *= kwargs['nbr_frame_stacking'] + input_shape = kwargs['preprocessed_observation_shape'] + channels = [input_shape[0]] + kwargs['critic_arch_channels'] + kernels = kwargs['critic_arch_kernels'] + strides = kwargs['critic_arch_strides'] + paddings = kwargs['critic_arch_paddings'] + output_dim = kwargs['critic_arch_feature_dim'] + critic_body = ConvolutionalBody(input_shape=input_shape, + feature_dim=output_dim, + channels=channels, + kernel_sizes=kernels, + strides=strides, + paddings=paddings) + elif kwargs['critic_arch'] == 'MLP-LSTM-RNN': + # Assuming flatten input: + #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) + state_dim = input_dim + critic_arch_feature_dim = kwargs['critic_arch_feature_dim'] + critic_arch_hidden_units = 
kwargs['critic_arch_hidden_units'] + + # Selecting Extra Inputs Infos relevant to phi_body: + extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) + extra_inputs_infos_critic_body = {} + if extra_inputs_infos != {}: + for key in extra_inputs_infos: + shape = extra_inputs_infos[key]['shape'] + tll = extra_inputs_infos[key]['target_location'] + if not isinstance(tll[0], list): + tll= [tll] + for tl in tll: + if 'critic_body' in tl: + extra_inputs_infos_critic_body[key] = { + 'shape':shape, + 'target_location':tl + } + + gate = None + if 'use_relu_after_rnn' in kwargs \ + and kwargs['use_relu_after_rnn']: + import ipdb; ipdb.set_trace() + gate = F.relu + + critic_body = LinearLstmBody( + state_dim=state_dim, + feature_dim=critic_arch_feature_dim, + hidden_units=critic_arch_hidden_units, + non_linearities=[nn.ReLU], + gate=gate, + dropout=0.0, + add_non_lin_final_layer=True, + #layer_init_fn=None, + extra_inputs_infos=extra_inputs_infos_critic_body, + ) + + elif kwargs['critic_arch'] == 'MLP-MLP-RNN': + # Assuming flatten input: + #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) + state_dim = input_dim + critic_arch_feature_dim = kwargs['critic_arch_feature_dim'] + critic_arch_hidden_units = kwargs['critic_arch_hidden_units'] + + # Selecting Extra Inputs Infos relevant to phi_body: + extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) + extra_inputs_infos_critic_body = {} + if extra_inputs_infos != {}: + for key in extra_inputs_infos: + shape = extra_inputs_infos[key]['shape'] + tll = extra_inputs_infos[key]['target_location'] + if not isinstance(tll[0], list): + tll= [tll] + for tl in tll: + if 'critic_body' in tl: + extra_inputs_infos_critic_body[key] = { + 'shape':shape, + 'target_location':tl + } + + critic_body = LinearLinearBody( + state_dim=state_dim, + feature_dim=critic_arch_feature_dim, + hidden_units=critic_arch_hidden_units, + non_linearities=[nn.ReLU], + gate=F.relu, + dropout=0.0, + add_non_lin_final_layer=True, + #layer_init_fn=None, + extra_inputs_infos=extra_inputs_infos_critic_body, + ) + elif kwargs['critic_arch'] == 'MLP-LSTM-RNN2': + # Assuming flatten input: + #kwargs['state_preprocess'] = partial(ResizeCNNPreprocessFunction, size=config['observation_resize_dim']) + state_dim = input_dim + critic_arch_feature_dim = kwargs['critic_arch_feature_dim'] + critic_arch_linear_hidden_units = kwargs['critic_arch_linear_hidden_units'] + critic_arch_linear_post_hidden_units = None + if 'critic_arch_linear_post_hidden_units' in kwargs: + critic_arch_linear_post_hidden_units = kwargs['critic_arch_linear_post_hidden_units'] + critic_arch_hidden_units = kwargs['critic_arch_hidden_units'] + + # Selecting Extra Inputs Infos relevant to phi_body: + extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) + extra_inputs_infos_critic_body = {} + if extra_inputs_infos != {}: + for key in extra_inputs_infos: + shape = extra_inputs_infos[key]['shape'] + tll = extra_inputs_infos[key]['target_location'] + if not isinstance(tll[0], list): + tll= [tll] + for tl in tll: + if 'critic_body' in tl: + extra_inputs_infos_critic_body[key] = { + 'shape':shape, + 'target_location':tl + } + + gate = None + if 'use_relu_after_rnn' in kwargs \ + and kwargs['use_relu_after_rnn']: + import ipdb; ipdb.set_trace() + gate = F.relu + + use_residual_connection = False + if 'use_residual_connection' in kwargs \ + and kwargs['use_residual_connection']: + use_residual_connection = kwargs['use_residual_connection'] + + critic_body = LinearLstmBody2( + 
state_dim=state_dim, + feature_dim=critic_arch_feature_dim, + linear_hidden_units=critic_arch_linear_hidden_units, + linear_post_hidden_units=critic_arch_linear_post_hidden_units, + hidden_units=critic_arch_hidden_units, + non_linearities=[nn.ReLU], + gate=gate, + dropout=0.0, + add_non_lin_final_layer=True, + use_residual_connection=use_residual_connection, + #layer_init_fn=None, + extra_inputs_infos=extra_inputs_infos_critic_body, + ) + + + use_rnd = False + if 'use_random_network_distillation' in kwargs and kwargs['use_random_network_distillation']: + use_rnd = True + + + if isinstance(task.observation_shape, int): + obs_shape = task.observation_shape + else: + obs_shape = list(task.observation_shape) + if 'preprocessed_observation_shape' in kwargs: obs_shape = kwargs['preprocessed_observation_shape'] + if isinstance(task.observation_shape, int): + goal_shape = task.goal_shape + else: + goal_shape = list(task.goal_shape) + + if 'preprocessed_goal_shape' in kwargs: goal_shape = kwargs['preprocessed_goal_shape'] + """ + # depr: goal update + if 'goal_state_flattening' in kwargs and kwargs['goal_state_flattening']: + obs_shape[-1] = obs_shape[-1] + goal_shape[-1] + """ + + # Selecting Extra Inputs Infos relevant to final_critic_layer: + extra_inputs_infos = kwargs.get('extra_inputs_infos', {}) + extra_inputs_infos_final_critic_layer = {} + extra_inputs_infos_final_actor_layer = {} + if extra_inputs_infos != {}: + for key in extra_inputs_infos: + shape = extra_inputs_infos[key]['shape'] + tll = extra_inputs_infos[key]['target_location'] + if not isinstance(tll[0], list): + tll= [tll] + for tl in tll: + if 'final_critic_layer' in tl: + extra_inputs_infos_final_critic_layer[key] = { + 'shape':shape, + 'target_location':tl + } + if 'final_actor_layer' in tl: + extra_inputs_infos_final_actor_layer[key] = { + 'shape':shape, + 'target_location':tl + } + + if head_type=='CategoricalQNet': + model = CategoricalQNet( + state_dim=obs_shape, + action_dim=task.action_dim, + phi_body=phi_body, + critic_body=critic_body, + dueling=kwargs['dueling'], + noisy=kwargs['noisy'] if 'noisy' in kwargs else False, + goal_oriented=kwargs['goal_oriented'] if 'goal_oriented' in kwargs else False, + goal_shape=goal_shape, + goal_phi_body=goal_phi_body, + extra_inputs_infos=extra_inputs_infos_final_critic_layer + ) + elif head_type=="CategoricalActorCriticNet": + model = CategoricalActorCriticNet( + obs_shape, + task.action_dim, + phi_body=phi_body, + actor_body=actor_body, + critic_body=critic_body, + extra_inputs_infos={ + 'critic':extra_inputs_infos_final_critic_layer, + 'actor':extra_inputs_infos_final_actor_layer, + }, + use_intrinsic_critic=use_rnd + ) + elif head_type=="GaussianActorCriticNet": + raise NotImplementedError + # TODO: implement infos scheme ... 
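The same selection pattern recurs for every body and for the final critic/actor layers in `generate_model`: keep only the `extra_inputs_infos` entries whose `target_location` mentions the module being built, normalising a single path to a list of paths. A small self-contained sketch of that filter follows; the helper name `select_extra_inputs` and the `previous_action` entry are hypothetical, made up for illustration.

    from typing import Dict

    def select_extra_inputs(extra_inputs_infos: Dict, module_id: str) -> Dict:
        # Keep only the entries whose 'target_location' path mentions module_id
        # (e.g. 'phi_body', 'critic_body', 'final_critic_layer').
        selected = {}
        for key, infos in extra_inputs_infos.items():
            tll = infos['target_location']
            if not isinstance(tll[0], list):
                tll = [tll]
            for tl in tll:
                if module_id in tl:
                    selected[key] = {'shape': infos['shape'], 'target_location': tl}
        return selected

    # Hypothetical config entry: previous action fed to the critic body and its final layer.
    extra_inputs_infos = {
        'previous_action': {
            'shape': [6],
            'target_location': [['critic_body', 'extra_inputs'],
                                ['final_critic_layer', 'extra_inputs']],
        },
    }
    assert 'previous_action' in select_extra_inputs(extra_inputs_infos, 'critic_body')
    assert select_extra_inputs(extra_inputs_infos, 'actor_body') == {}

In the patch this filter is inlined once per module rather than factored into a helper like the one sketched here.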
+ model = GaussianActorCriticNet( + task.observation_shape, + task.action_dim, + phi_body=phi_body, + actor_body=actor_body, + critic_body=critic_body, + extra_inputs_infos={ + 'critic':extra_inputs_infos_final_critic_layer, + 'actor':extra_inputs_infos_final_actor_layer, + }, + use_intrinsic_critic=use_rnd + ) + else: + raise NotImplementedError + + model.share_memory() + return model \ No newline at end of file diff --git a/regym/rl_algorithms/algorithms/DQN/ddqn_loss.py b/regym/rl_algorithms/algorithms/DQN/ddqn_loss.py index 89f61d86..236853a6 100644 --- a/regym/rl_algorithms/algorithms/DQN/ddqn_loss.py +++ b/regym/rl_algorithms/algorithms/DQN/ddqn_loss.py @@ -63,7 +63,9 @@ def compute_loss(states: torch.Tensor, ''' # Sample actions from the current model outputs: which is actually the training batch action 'actions' (cf bodies.py CategoricalQNet) current_actions = prediction["a"] - state_action_values_g = state_action_values.gather(dim=1, index=current_actions.unsqueeze(1)).squeeze(1) + if len(current_actions.shape) != 2: + current_actions = current_actions.unsqueeze(1) + state_action_values_g = state_action_values.gather(dim=1, index=current_actions).squeeze(1) ############################ diff --git a/regym/rl_algorithms/algorithms/DQN/dqn.py b/regym/rl_algorithms/algorithms/DQN/dqn.py index e11fdb57..0f51f911 100644 --- a/regym/rl_algorithms/algorithms/DQN/dqn.py +++ b/regym/rl_algorithms/algorithms/DQN/dqn.py @@ -6,8 +6,6 @@ from functools import partial import ray -# TODO : change every storage to use remote ray storages -import time import numpy as np import torch @@ -55,7 +53,7 @@ def __init__(self, kwargs, model, target_model=None, optimizer=None, loss_fn=dqn assert (self.use_HER and self.goal_oriented) or not(self.goal_oriented) - self.weights_decay_lambda = float(self.kwargs['weights_decay_lambda']) + self.weights_decay_lambda = float(self.kwargs['weights_decay_lambda']) if 'weights_decay_lambda' in self.kwargs else 0.0 self.weights_entropy_lambda = float(self.kwargs['weights_entropy_lambda']) if 'weights_entropy_lambda' in self.kwargs else 0.0 @@ -136,6 +134,9 @@ def __init__(self, kwargs, model, target_model=None, optimizer=None, loss_fn=dqn from regym import SharedVariable self._param_update_counter = SharedVariable(0) + def parameters(self): + return self.model.parameters() + @property def param_update_counter(self): if isinstance(self._param_update_counter, ray.actor.ActorHandle): @@ -237,7 +238,11 @@ def reset_storages(self, nbr_actor: int=None): self.storages = [] keys = ['s', 'a', 'r', 'non_terminal'] if self.recurrent: keys += ['rnn_states'] + + """ + # depr : goal update if self.goal_oriented: keys += ['g'] + """ circular_keys={'succ_s':'s'} circular_offsets={'succ_s':self.n_step} @@ -313,9 +318,11 @@ def store(self, exp_dict, actor_index=0): current_exp_dict['r'] = truncated_n_step_return else: current_exp_dict = exp_dict - + """ + # depr : goal update if self.goal_oriented and 'g' not in current_exp_dict: current_exp_dict['g'] = current_exp_dict['goals']['desired_goals']['s'] + """ if self.use_PER: init_sampling_priority = None @@ -371,9 +378,12 @@ def retrieve_values_from_storages(self, minibatch_size: int): if self.recurrent: keys += ['rnn_states', 'next_rnn_states'] + """ + # depr : goal update if self.goal_oriented: keys += ['g'] - + """ + for key in keys: fulls[key] = [] using_ray = isinstance(self.storages[0], ray.actor.ActorHandle) @@ -486,8 +496,11 @@ def optimize_model(self, minibatch_size: int, samples: Dict, optimisation_miniba # (batch_size, 
unroll_dim, ...) sampled_goals = None + """ + # depr : goal update if self.goal_oriented: sampled_goals = goals[batch_indices].cuda() if self.kwargs['use_cuda'] else goals[batch_indices] + """ sampled_importanceSamplingWeights = None if self.use_PER: @@ -596,8 +609,12 @@ def compute_td_error(self, samples: Dict): # (batch_size, unroll_dim, ...) sampled_goals = None + + """ + # depr : goal update if self.goal_oriented: sampled_goals = goals[batch_indices].cuda() if self.kwargs['use_cuda'] else goals[batch_indices] + """ sampled_importanceSamplingWeights = None # if self.use_PER: diff --git a/regym/rl_algorithms/algorithms/PPO/ppo.py b/regym/rl_algorithms/algorithms/PPO/ppo.py index 8532d6b9..6ddebe91 100644 --- a/regym/rl_algorithms/algorithms/PPO/ppo.py +++ b/regym/rl_algorithms/algorithms/PPO/ppo.py @@ -1,19 +1,41 @@ -from copy import deepcopy +from typing import Dict, List + +import copy +import time +from functools import partial + import numpy as np import torch import torch.nn as nn import torch.optim as optim import torch.nn.functional as F -from ...networks import random_sample +import ray + +import regym +from ..algorithm import Algorithm +from regym.rl_algorithms.utils import _extract_rnn_states_from_batch_indices +from regym.rl_algorithms.utils import _concatenate_hdict, _concatenate_list_hdict +from ...networks import hard_update, random_sample from ...replay_buffers import Storage + from . import ppo_loss, rnd_loss, ppo_vae_loss from . import ppo_actor_loss, ppo_critic_loss + summary_writer = None -class PPOAlgorithm(): - def __init__(self, kwargs, model, optimizer=None, target_intr_model=None, predict_intr_model=None, sum_writer=None): + +class PPOAlgorithm(Algorithm): + def __init__( + self, + kwargs, + model, + optimizer=None, + target_intr_model=None, + predict_intr_model=None, + sum_writer=None, + name="ppo_algo"): ''' TODO specify which values live inside of kwargs Refer to original paper for further explanation: https://arxiv.org/pdf/1707.06347.pdf @@ -31,8 +53,13 @@ def __init__(self, kwargs, model, optimizer=None, target_intr_model=None, predic adam_eps: (float), Small Epsilon value used for ADAM optimizer. Prevents numerical instability when v^{hat} (Second momentum estimator) is near 0. 
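A small numeric illustration of that point (the values are arbitrary): with the Adam update theta <- theta - lr * m_hat / (sqrt(v_hat) + eps), a near-zero second-moment estimate v_hat makes the denominator collapse to eps, so a slightly larger eps keeps the step bounded.

    import math

    # Adam per-parameter step magnitude: lr * m_hat / (sqrt(v_hat) + eps).
    lr, m_hat, v_hat = 3e-4, 1e-8, 1e-16
    for eps in (1e-8, 1e-5):
        step = lr * m_hat / (math.sqrt(v_hat) + eps)
        print(f"eps={eps:g} -> step magnitude {step:.2e}")
    # eps=1e-08 -> step magnitude 1.50e-04  (already half the learning rate)
    # eps=1e-05 -> step magnitude 3.00e-07  (kept small despite the tiny v_hat)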
model: (Pytorch nn.Module) Used to represent BOTH policy network and value network ''' - self.kwargs = deepcopy(kwargs) + super(PPOAlgorithm, self).__init__(name=name) + + self.train_request_count =0 + + self.kwargs = copy.deepcopy(kwargs) self.nbr_actor = self.kwargs['nbr_actor'] + self.use_rnd = False if target_intr_model is not None and predict_intr_model is not None: self.use_rnd = True @@ -56,7 +83,7 @@ def __init__(self, kwargs, model, optimizer=None, target_intr_model=None, predic self.ext_reward_std = 1.0 self.use_vae = False - if 'use_vae' in self.kwargs and kwargs['use_vae']: + if 'use_vae' in self.kwargs and self.kwargs['use_vae']: self.use_vae = True self.model = model @@ -72,14 +99,17 @@ def __init__(self, kwargs, model, optimizer=None, target_intr_model=None, predic if self.use_rnd: parameters = list(parameters)+list(self.predict_intr_model.parameters()) # Tuning learning rate with respect to the number of actors: # Following: https://arxiv.org/abs/1705.04862 - lr = kwargs['learning_rate'] - if kwargs['lr_account_for_nbr_actor']: + lr = self.kwargs['learning_rate'] + if self.kwargs['lr_account_for_nbr_actor']: lr *= self.nbr_actor print(f"Learning rate: {lr}") - self.optimizer = optim.Adam(parameters, lr=lr, eps=kwargs['adam_eps']) + self.optimizer = optim.Adam(parameters, lr=lr, eps=float(self.kwargs['adam_eps'])) else: self.optimizer = optimizer - self.recurrent = False + # DEPRECATED in order to allow extra_inputs infos + # stored in the rnn_states that acts as frame_states... + #self.recurrent = False + self.recurrent = True # TECHNICAL DEBT: check for recurrent property by looking at the modules in the model rather than relying on the kwargs that may contain # elements that do not concern the model trained by this algorithm, given that it is now use-able inside I2A... self.recurrent_nn_submodule_names = [hyperparameter for hyperparameter, value in self.kwargs.items() if isinstance(value, str) and 'RNN' in value] @@ -89,10 +119,48 @@ def __init__(self, kwargs, model, optimizer=None, target_intr_model=None, predic self.reset_storages() global summary_writer - summary_writer = sum_writer - self.param_update_counter = 0 - self.actor_param_update_counter = 0 - self.critic_param_update_counter = 0 + if sum_writer is not None: summary_writer = sum_writer + self.summary_writer = summary_writer + if regym.RegymManager is not None: + from regym import RaySharedVariable + try: + self._param_update_counter = ray.get_actor(f"{self.name}.param_update_counter") + except ValueError: # Name is not taken. 
+ self._param_update_counter = RaySharedVariable.options(name=f"{self.name}.param_update_counter").remote(0) + else: + from regym import SharedVariable + self._param_update_counter = SharedVariable(0) + + @property + def param_update_counter(self): + if isinstance(self._param_update_counter, ray.actor.ActorHandle): + return ray.get(self._param_update_counter.get.remote()) + else: + return self._param_update_counter.get() + + @param_update_counter.setter + def param_update_counter(self, val): + if isinstance(self._param_update_counter, ray.actor.ActorHandle): + self._param_update_counter.set.remote(val) + else: + self._param_update_counter.set(val) + + def get_models(self): + return {'model': self.model} + + def set_models(self, models_dict): + if "model" in models_dict: + hard_update(self.model, models_dict["model"]) + + def get_nbr_actor(self): + nbr_actor = self.nbr_actor + return nbr_actor + + def set_nbr_actor(self, nbr_actor): + self.nbr_actor = nbr_actor + + def get_update_count(self): + return self.param_update_counter def reset_storages(self, nbr_actor=None): if nbr_actor is not None: @@ -100,6 +168,7 @@ def reset_storages(self, nbr_actor=None): if self.storages is not None: for storage in self.storages: storage.reset() + return self.storages = [] for i in range(self.nbr_actor): @@ -114,7 +183,25 @@ def reset_storages(self, nbr_actor=None): self.storages[-1].add_key('int_adv') self.storages[-1].add_key('target_int_f') + def stored_experiences(self): + self.train_request_count += 1 + nbr_stored_experiences = sum([len(storage) for storage in self.storages]) + + global summary_writer + if self.summary_writer is None: + self.summary_writer = summary_writer + if self.summary_writer is not None: + self.summary_writer.add_scalar('PerTrainingRequest/NbrStoredExperiences', nbr_stored_experiences, self.train_request_count) + + return nbr_stored_experiences + def train(self): + global summary_writer + if self.summary_writer is None: + self.summary_writer = summary_writer + if summary_writer is None: + summary_writer = self.summary_writer + # Compute Returns and Advantages: for idx, storage in enumerate(self.storages): if len(storage) <= 1: continue @@ -130,38 +217,52 @@ def train(self): for ob in storage.s: self.update_obs_mean_std(ob) - states, actions, next_states, log_probs_old, returns, advantages, std_advantages, int_returns, int_advantages, std_int_advantages, target_random_features, rnn_states = self.retrieve_values_from_storages() + # states, actions, next_states, log_probs_old, returns, advantages, std_advantages, \ + # int_returns, int_advantages, std_int_advantages, \ + # target_random_features, rnn_states = self.retrieve_values_from_storages() + start = time.time() + samples = self.retrieve_values_from_storages() + end = time.time() + + if self.summary_writer is not None: + self.summary_writer.add_scalar('PerUpdate/TimeComplexity/RetrieveValuesFn', end-start, self.param_update_counter) - if self.recurrent: rnn_states = self.reformat_rnn_states(rnn_states) + #if self.recurrent: rnn_states = self.reformat_rnn_states(rnn_states) + start = time.time() for it in range(self.kwargs['optimization_epochs']): - self.optimize_model(states, actions, next_states, log_probs_old, returns, advantages, std_advantages, int_returns, int_advantages, std_int_advantages, target_random_features, rnn_states) + self.optimize_model(samples) + #self.optimize_model(states, actions, next_states, log_probs_old, returns, advantages, std_advantages, int_returns, int_advantages, std_int_advantages, 
target_random_features, rnn_states) + end = time.time() + + if self.summary_writer is not None: + self.summary_writer.add_scalar('PerUpdate/TimeComplexity/OptimizeModelFn', end-start, self.param_update_counter) self.reset_storages() - def reformat_rnn_states(self, rnn_states): - ''' - This function reformats the :param rnn_states: into - a dict of dict of list of batched rnn_states. - :param rnn_states: list of dict of dict of list: each element is an rnn_state where: - - the first dictionnary has the name of the recurrent module in the architecture - as keys. - - the second dictionnary has the keys 'hidden', 'cell'. - - the items of this second dictionnary are lists of actual hidden/cell states for the GRU/LSTMBody. - ''' - reformated_rnn_states = {k: {'hidden': [list()], 'cell': [list()]} for k in rnn_states[0]} - for rnn_state in rnn_states: - for k in rnn_state: - hstates, cstates = rnn_state[k]['hidden'], rnn_state[k]['cell'] - for idx_layer, (h, c) in enumerate(zip(hstates, cstates)): - reformated_rnn_states[k]['hidden'][0].append(h) - reformated_rnn_states[k]['cell'][0].append(c) - for k in reformated_rnn_states: - hstates, cstates = reformated_rnn_states[k]['hidden'], reformated_rnn_states[k]['cell'] - hstates = torch.cat(hstates[0], dim=0) - cstates = torch.cat(cstates[0], dim=0) - reformated_rnn_states[k] = {'hidden': [hstates], 'cell': [cstates]} - return reformated_rnn_states + # def reformat_rnn_states(self, rnn_states): + # ''' + # This function reformats the :param rnn_states: into + # a dict of dict of list of batched rnn_states. + # :param rnn_states: list of dict of dict of list: each element is an rnn_state where: + # - the first dictionnary has the name of the recurrent module in the architecture + # as keys. + # - the second dictionnary has the keys 'hidden', 'cell'. + # - the items of this second dictionnary are lists of actual hidden/cell states for the GRU/LSTMBody. + # ''' + # reformated_rnn_states = {k: {'hidden': [list()], 'cell': [list()]} for k in rnn_states[0]} + # for rnn_state in rnn_states: + # for k in rnn_state: + # hstates, cstates = rnn_state[k]['hidden'], rnn_state[k]['cell'] + # for idx_layer, (h, c) in enumerate(zip(hstates, cstates)): + # reformated_rnn_states[k]['hidden'][0].append(h) + # reformated_rnn_states[k]['cell'][0].append(c) + # for k in reformated_rnn_states: + # hstates, cstates = reformated_rnn_states[k]['hidden'], reformated_rnn_states[k]['cell'] + # hstates = torch.cat(hstates[0], dim=0) + # cstates = torch.cat(cstates[0], dim=0) + # reformated_rnn_states[k] = {'hidden': [hstates], 'cell': [cstates]} + # return reformated_rnn_states def normalize_ext_rewards(self, storage_idx): normalized_ext_rewards = [] @@ -263,65 +364,72 @@ def compute_int_advantages_and_int_returns(self, storage_idx, non_episodic=True) self.update_int_return_mean_std(int_returns.detach().cpu()) def retrieve_values_from_storages(self): - full_states = [] - full_actions = [] - full_log_probs_old = [] - full_returns = [] - full_advantages = [] - full_rnn_states = None - full_next_states = None - full_int_returns = None - full_int_advantages = None - full_target_random_features = None - full_std_int_advantages = None + ''' + Each storage stores in their key entries either numpy arrays or hierarchical dictionnaries of numpy arrays. + This function samples from each storage, concatenate the sampled elements on the batch dimension, + and maintains the hierarchy of dictionnaries. 
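The sampled values are either plain tensors or hierarchical dictionaries of tensors (e.g. the rnn_states of each recurrent submodule), and both go through the same concatenation on the batch dimension. The sketch below is a simplified, self-contained stand-in for that merge step; the real helper used in the patch, `_concatenate_list_hdict` from `regym.rl_algorithms.utils`, additionally takes a `preprocess_fn` argument and is presumably more general.

    from functools import partial
    import torch

    def concat_hdict(list_of_hdicts, concat_fn=partial(torch.cat, dim=0)):
        # Recursively merge a list of hierarchical dicts whose leaves are tensors,
        # concatenating the leaves on the batch dimension.
        out = {}
        for key in list_of_hdicts[0]:
            values = [hd[key] for hd in list_of_hdicts]
            if isinstance(values[0], dict):
                out[key] = concat_hdict(values, concat_fn)
            else:
                out[key] = concat_fn(values)
        return out

    batches = [
        {'phi_body': {'hidden': torch.zeros(4, 32), 'cell': torch.zeros(4, 32)}},
        {'phi_body': {'hidden': torch.ones(2, 32), 'cell': torch.ones(2, 32)}},
    ]
    merged = concat_hdict(batches)
    assert merged['phi_body']['hidden'].shape == (4 + 2, 32)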
+ ''' + keys=['s', 'a', 'log_pi_a', 'ret', 'adv'] + + fulls = {} + if self.use_rnd: - full_next_states = [] - full_int_returns = [] - full_int_advantages = [] - full_target_random_features = [] - if self.recurrent: - full_rnn_states = [] + keys += ['succ_s', 'int_ret', 'int_adv', 'target_int_f'] + if self.recurrent: + keys += ['rnn_states'] #, 'next_rnn_states'] + + """ + if self.goal_oriented: + keys += ['g'] + """ + + for key in keys: + fulls[key] = [] + for storage in self.storages: # Check that there is something in the storage - if len(storage) <= 1: continue - cat = storage.cat(['s', 'a', 'log_pi_a', 'ret', 'adv']) - states, actions, log_probs_old, returns, advantages = map(lambda x: torch.cat(x, dim=0), cat) - full_states.append(states) - full_actions.append(actions) - full_log_probs_old.append(log_probs_old) - full_returns.append(returns[:-1]) - full_advantages.append(advantages[:-1]) - # Contain next state return and dummy advantages: so the size is N+1 spots: - # not used during optimization, but necessary to compute the returns and advantages of previous states.... - if self.use_rnd: - cat = storage.cat(['succ_s', 'int_ret', 'int_adv', 'target_int_f']) - next_states, int_returns, int_advantages, target_random_features = map(lambda x: torch.cat(x, dim=0), cat) - full_next_states.append(next_states) - full_int_returns.append(int_returns[:-1]) - full_int_advantages.append(int_advantages[:-1]) - full_target_random_features.append(target_random_features) - if self.recurrent: - rnn_states = storage.cat(['rnn_states'])[0] - full_rnn_states += rnn_states - - full_states = torch.cat(full_states, dim=0) - full_actions = torch.cat(full_actions, dim=0) - full_log_probs_old = torch.cat(full_log_probs_old, dim=0) - full_returns = torch.cat(full_returns, dim=0) - full_advantages = torch.cat(full_advantages, dim=0) - full_std_advantages = self.standardize(full_advantages).squeeze() - if self.use_rnd: - full_next_states = torch.cat(full_next_states, dim=0) - full_int_returns = torch.cat(full_int_returns, dim=0) - full_int_advantages = torch.cat(full_int_advantages, dim=0) - full_target_random_features = torch.cat(full_target_random_features, dim=0) - full_std_int_advantages = self.standardize(full_int_advantages).squeeze() + storage_size = len(storage) + + if storage_size <= 1: continue + sample = storage.cat(keys) - return full_states, full_actions, full_next_states, full_log_probs_old, \ - full_returns, full_advantages, full_std_advantages, \ - full_int_returns, full_int_advantages, full_std_int_advantages, \ - full_target_random_features, full_rnn_states + values = {} + for key, value in zip(keys, sample): + #value = value.tolist() + if isinstance(value[0], dict): + value = _concatenate_list_hdict( + lhds=value, + concat_fn=partial(torch.cat, dim=0), # concatenate on the unrolling dimension (axis=1). + preprocess_fn=(lambda x:x), + ) + else: + value = torch.cat(value, dim=0) + values[key] = value + + for key, value in values.items(): + fulls[key].append(value) + + keys = list(fulls.keys()) + for key in keys: + value = fulls[key] + if len(value) >1: + if isinstance(value[0], dict): + value = _concatenate_list_hdict( + lhds=value, + concat_fn=partial(torch.cat, dim=0), # concatenate on the unrolling dimension (axis=1). 
+ preprocess_fn=(lambda x:x), + ) + else: + value = torch.cat(value, dim=0) + else: + value = value[0] + + fulls[key] = value + if 'adv' in key: + fulls[f'std_{key}'] = self.standardize(value).squeeze() + + return fulls def standardize(self, x): stable_eps = 1e-30 @@ -412,25 +520,48 @@ def update_obs_mean_std(self, unnormalized_obs): if self.running_counter_obs >= self.update_period_obs: self.running_counter_obs = 0 - def optimize_model(self, states, actions, next_states, log_probs_old, returns, advantages, std_advantages, int_returns, int_advantages, std_int_advantages, target_random_features, rnn_states=None): + #def optimize_model(self, states, actions, next_states, log_probs_old, returns, advantages, std_advantages, int_returns, int_advantages, std_int_advantages, target_random_features, rnn_states=None): + def optimize_model(self, samples): global summary_writer + if self.summary_writer is None: + self.summary_writer = summary_writer + # What is this: create dictionary to store length of each part of the recurrent submodules of the current model - nbr_layers_per_rnn = None - if self.recurrent: - nbr_layers_per_rnn = {recurrent_submodule_name: len(rnn_states[recurrent_submodule_name]['hidden']) - for recurrent_submodule_name in rnn_states} + # nbr_layers_per_rnn = None + # if self.recurrent: + # nbr_layers_per_rnn = {recurrent_submodule_name: len(rnn_states[recurrent_submodule_name]['hidden']) + # for recurrent_submodule_name in rnn_states} + + start = time.time() + + states = samples['s'] + rnn_states = samples['rnn_states'] + actions = samples['a'] + log_probs_old = samples['log_pi_a'] + returns = samples['ret'] + advantages = samples['adv'] + std_advantages = samples['std_adv'] + + if self.use_rnd: + next_states = samples['succ_s'] + int_returns = samples['int_ret'] + int_advantages = samples['int_adv'] + std_int_advantages = samples['std_int_adv'] + target_random_features = samples['target_int_f'] if self.kwargs['mini_batch_size'] == 'None': - sampler = [np.arange(advantages.size(0))] + sampler = [np.arange(samples['s'].size(0))] else: - sampler = random_sample(np.arange(advantages.size(0)), self.kwargs['mini_batch_size']) + sampler = random_sample(np.arange(samples['s'].size(0)), self.kwargs['mini_batch_size']) + #sampler = random_sample(np.arange(advantages.size(0)), self.kwargs['mini_batch_size']) for batch_indices in sampler: batch_indices = torch.from_numpy(batch_indices).long() sampled_rnn_states = None if self.recurrent: - sampled_rnn_states = self.calculate_rnn_states_from_batch_indices(rnn_states, batch_indices, nbr_layers_per_rnn) + #sampled_rnn_states = self.calculate_rnn_states_from_batch_indices(rnn_states, batch_indices, nbr_layers_per_rnn) + sampled_rnn_states = _extract_rnn_states_from_batch_indices(rnn_states, batch_indices, use_cuda=self.kwargs['use_cuda']) sampled_states = states[batch_indices].cuda() if self.kwargs['use_cuda'] else states[batch_indices] sampled_actions = actions[batch_indices].cuda() if self.kwargs['use_cuda'] else actions[batch_indices] @@ -486,7 +617,7 @@ def optimize_model(self, states, actions, next_states, log_probs_old, returns, a pred_intr_model=self.predict_intr_model, intrinsic_reward_ratio=self.kwargs['rnd_loss_int_ratio'], iteration_count=self.param_update_counter, - summary_writer=summary_writer ) + summary_writer=self.summary_writer ) elif self.use_vae: loss = ppo_vae_loss.compute_loss(sampled_states, sampled_actions, @@ -501,7 +632,7 @@ def optimize_model(self, states, actions, next_states, log_probs_old, returns, a 
vae_weight=self.kwargs['vae_weight'], model=self.model, iteration_count=self.param_update_counter, - summary_writer=summary_writer) + summary_writer=self.summary_writer) else: loss = ppo_loss.compute_loss(sampled_states, sampled_actions, @@ -516,7 +647,7 @@ def optimize_model(self, states, actions, next_states, log_probs_old, returns, a value_weight=self.kwargs['value_weight'], model=self.model, iteration_count=self.param_update_counter, - summary_writer=summary_writer) + summary_writer=self.summary_writer) loss.backward(retain_graph=False) if self.kwargs['gradient_clip'] > 1e-3: @@ -531,16 +662,80 @@ def optimize_model(self, states, actions, next_states, log_probs_old, returns, a summary_writer.add_histogram(f"Training/{name}", param.grad.cpu(), self.param_update_counter) ''' if self.use_rnd: - summary_writer.add_scalar('Training/IntRewardMean', self.int_reward_mean.cpu().item(), self.param_update_counter) - summary_writer.add_scalar('Training/IntRewardStd', self.int_reward_std.cpu().item(), self.param_update_counter) - - def calculate_rnn_states_from_batch_indices(self, rnn_states, batch_indices, nbr_layers_per_rnn): - sampled_rnn_states = {k: {'hidden': [None]*nbr_layers_per_rnn[k], 'cell': [None]*nbr_layers_per_rnn[k]} for k in rnn_states} - for recurrent_submodule_name in sampled_rnn_states: - for idx in range(nbr_layers_per_rnn[recurrent_submodule_name]): - sampled_rnn_states[recurrent_submodule_name]['hidden'][idx] = rnn_states[recurrent_submodule_name]['hidden'][idx][batch_indices].cuda() if self.kwargs['use_cuda'] else rnn_states[recurrent_submodule_name]['hidden'][idx][batch_indices] - sampled_rnn_states[recurrent_submodule_name]['cell'][idx] = rnn_states[recurrent_submodule_name]['cell'][idx][batch_indices].cuda() if self.kwargs['use_cuda'] else rnn_states[recurrent_submodule_name]['cell'][idx][batch_indices] - return sampled_rnn_states + self.summary_writer.add_scalar('Training/IntRewardMean', self.int_reward_mean.cpu().item(), self.param_update_counter) + self.summary_writer.add_scalar('Training/IntRewardStd', self.int_reward_std.cpu().item(), self.param_update_counter) + + end = time.time() + if self.summary_writer is not None: + self.summary_writer.add_scalar('PerUpdate/TimeComplexity/OptimizationLoss', end-start, self.param_update_counter) + self.summary_writer.flush() + + return + + # def calculate_rnn_states_from_batch_indices(self, rnn_states, batch_indices, nbr_layers_per_rnn): + # sampled_rnn_states = {k: {'hidden': [None]*nbr_layers_per_rnn[k], 'cell': [None]*nbr_layers_per_rnn[k]} for k in rnn_states} + # for recurrent_submodule_name in sampled_rnn_states: + # for idx in range(nbr_layers_per_rnn[recurrent_submodule_name]): + # sampled_rnn_states[recurrent_submodule_name]['hidden'][idx] = rnn_states[recurrent_submodule_name]['hidden'][idx][batch_indices].cuda() if self.kwargs['use_cuda'] else rnn_states[recurrent_submodule_name]['hidden'][idx][batch_indices] + # sampled_rnn_states[recurrent_submodule_name]['cell'][idx] = rnn_states[recurrent_submodule_name]['cell'][idx][batch_indices].cuda() if self.kwargs['use_cuda'] else rnn_states[recurrent_submodule_name]['cell'][idx][batch_indices] + # return sampled_rnn_states + + def clone(self, with_replay_buffer: bool=False, clone_proxies: bool=False, minimal=False): + if not(with_replay_buffer): + storages = self.storages + self.storages = None + + sum_writer = self.summary_writer + self.summary_writer = None + + param_update_counter = self._param_update_counter + self._param_update_counter = None + + cloned_algo = 
copy.deepcopy(self) + + if not(with_replay_buffer): + self.storages = storages + + self.summary_writer = sum_writer + + self._param_update_counter = param_update_counter + cloned_algo._param_update_counter = param_update_counter + + # Goes through all variables 'Proxy' (dealing with multiprocessing) + # contained in this class and removes them from clone + if not(clone_proxies): + proxy_key_values = [ + (key, value) + for key, value in cloned_algo.__dict__.items() + if ('Proxy' in str(type(value))) + ] + for key, value in proxy_key_values: + setattr(cloned_algo, key, None) + + return cloned_algo + + def async_actor(self): + storages = self.storages + self.storages = None + + sum_writer = self.summary_writer + self.summary_writer = None + + param_update_counter = self._param_update_counter + self._param_update_counter = None + + cloned_algo = copy.deepcopy(self) + + self.storages = storages + cloned_algo.storages = storages + + self.summary_writer = sum_writer + cloned_algo.summary_writer = sum_writer + + self._param_update_counter = param_update_counter + cloned_algo._param_update_counter = param_update_counter + + return cloned_algo @staticmethod def check_mandatory_kwarg_arguments(kwargs: dict): diff --git a/regym/rl_algorithms/algorithms/R2D2/r2d2.py b/regym/rl_algorithms/algorithms/R2D2/r2d2.py index 9718bf3a..39818502 100644 --- a/regym/rl_algorithms/algorithms/R2D2/r2d2.py +++ b/regym/rl_algorithms/algorithms/R2D2/r2d2.py @@ -84,7 +84,10 @@ def reset_storages(self, nbr_actor: int=None): self.storages = [] keys = ['s', 'a', 'r', 'non_terminal'] if self.recurrent: keys += ['rnn_states'] + """ + # depr : goal update if self.goal_oriented: keys += ['g'] + """ # TODO: WARNING: rnn states can be handled that way but it is meaningless since dealing with sequences... circular_keys={'succ_s':'s'} @@ -328,8 +331,11 @@ def store(self, exp_dict, actor_index=0): else: current_exp_dict = exp_dict + """ + # depr : goal update if self.goal_oriented and 'g' not in current_exp_dict: current_exp_dict['g'] = current_exp_dict['goals']['desired_goals']['s'] + """ # Store in relevant sequence buffer: self.sequence_replay_buffers[actor_index].append(current_exp_dict) diff --git a/regym/rl_algorithms/algorithms/R2D2/r2d2_loss.py b/regym/rl_algorithms/algorithms/R2D2/r2d2_loss.py index b94ef33b..eb89c3d6 100644 --- a/regym/rl_algorithms/algorithms/R2D2/r2d2_loss.py +++ b/regym/rl_algorithms/algorithms/R2D2/r2d2_loss.py @@ -684,8 +684,8 @@ def compute_loss(states: torch.Tensor, model: torch.nn.Module, target_model: torch.nn.Module, gamma: float = 0.99, - weights_decay_lambda: float = 1.0, - weights_entropy_lambda: float = 0.1, + weights_decay_lambda: float = 0.0, + weights_entropy_lambda: float = 0.0, use_PER: bool = False, PER_beta: float = 1.0, importanceSamplingWeights: torch.Tensor = None, @@ -965,13 +965,7 @@ def vdn_fn(x): Q_Si_Ai_value = value_function_rescaling(unscaled_Q_Si_Ai_value) scaled_bellman_target_Sipn_onlineGreedyAction = value_function_rescaling(unscaled_bellman_target_Sipn_onlineGreedyAction) - ''' - # TODO: decide how to handle HER augmentation... - if HER_target_clamping: - # clip the target to [-50,0] - expected_state_action_values = torch.clamp(expected_state_action_values, -1. / (1 - gamma), 0) - ''' - + # Compute loss: # MSE ? """ @@ -980,6 +974,13 @@ def vdn_fn(x): """ # Abs: + if HER_target_clamping: + # clip the unscaled target to [-50,0] + unscaled_bellman_target_Sipn_onlineGreedyAction = torch.clamp( + unscaled_bellman_target_Sipn_onlineGreedyAction, + -1. 
/ (1 - gamma), + 0.0 + ) td_error = torch.abs(unscaled_bellman_target_Sipn_onlineGreedyAction.detach() - unscaled_Q_Si_Ai_value) scaled_td_error = torch.abs(scaled_bellman_target_Sipn_onlineGreedyAction.detach() - Q_Si_Ai_value) assert list(td_error.shape) == [batch_size, training_length, 1] @@ -1008,14 +1009,19 @@ def vdn_fn(x): assert kwargs["r2d2_loss_masking"], "r2d2_loss_masking must be True for this test." if kwargs["r2d2_loss_masking"]: mask = torch.ones_like(diff_squared) - + """ assert kwargs['r2d2_loss_masking_n_step_regularisation'], "debugging in progress" if kwargs['r2d2_loss_masking_n_step_regularisation']: mask[:, -kwargs["n_step"]:, ...] = 0 # maybe but 1 back: mask[:,-1, ...] = (1-training_non_terminals[:,-1,...]) - + """ + # Combined: + assert kwargs['r2d2_loss_masking_n_step_regularisation'], "debugging in progress" + if kwargs['r2d2_loss_masking_n_step_regularisation']: + mask[:, -kwargs["n_step"]:, ...] = (1-training_non_terminals[:,-kwargs['n_step']:,...]) + loss_per_item = loss_per_item*mask loss = 0.5*torch.mean(diff_squared*mask)-weights_entropy_lambda*training_predictions['ent'].mean() else: diff --git a/regym/rl_algorithms/algorithms/THER/__init__.py b/regym/rl_algorithms/algorithms/THER/__init__.py index 533ee48f..68f54bca 100644 --- a/regym/rl_algorithms/algorithms/THER/__init__.py +++ b/regym/rl_algorithms/algorithms/THER/__init__.py @@ -1,3 +1,5 @@ from .dqn_ther_loss import * from .ddqn_ther_loss import * -from .ther_predictor_loss import * \ No newline at end of file +from .ther_predictor_loss import * +from .ther import THERAlgorithm +from .ther2 import THERAlgorithm2 \ No newline at end of file diff --git a/regym/rl_algorithms/algorithms/THER/ther2.py b/regym/rl_algorithms/algorithms/THER/ther2.py new file mode 100644 index 00000000..8ec34eb7 --- /dev/null +++ b/regym/rl_algorithms/algorithms/THER/ther2.py @@ -0,0 +1,786 @@ +from typing import Dict, List + +import time +import copy +from collections import deque +from functools import partial + +import ray + +import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F + +from . 
import dqn_ther_loss, ddqn_ther_loss + +from ..algorithm import Algorithm +from ...replay_buffers import PrioritizedReplayStorage, ReplayStorage +from ...networks import hard_update, random_sample +from regym.rl_algorithms.utils import _extract_rnn_states_from_batch_indices, _concatenate_hdict, _concatenate_list_hdict + + +summary_writer = None + + +class THERAlgorithm2(Algorithm): + def __init__( + self, + kwargs, + model, + predictor, + target_model=None, + optimizer=None, + sum_writer=None, + name='ther_algo' + ): + ''' + ''' + super(THERAlgorithm, self).__init__(name=name) + + self.train_request_count = 0 + + self.kwargs = copy.deepcopy(kwargs) + self.use_cuda = kwargs["use_cuda"] + self.nbr_actor = self.kwargs['nbr_actor'] + + self.double = self.kwargs['double'] + self.dueling = self.kwargs['dueling'] + self.noisy = self.kwargs['noisy'] + self.n_step = self.kwargs['n_step'] if 'n_step' in self.kwargs else 1 + + if self.n_step > 1: + self.n_step_buffers = [deque(maxlen=self.n_step) for _ in range(self.nbr_actor)] + + self.use_PER = self.kwargs['use_PER'] + + self.goal_oriented = self.kwargs['goal_oriented'] if 'goal_oriented' in self.kwargs else False + self.use_HER = self.kwargs['use_HER'] if 'use_HER' in self.kwargs else False + + self.weights_decay_lambda = float(self.kwargs['weights_decay_lambda']) + self.weights_entropy_lambda = float(self.kwargs['weights_entropy_lambda']) if 'weights_entropy_lambda' in self.kwargs else 0.0 + + self.model = model + if self.kwargs['use_cuda']: + self.model = self.model.cuda() + + if target_model is None: + target_model = copy.deepcopy(self.model) + + self.target_model = target_model + self.target_model.share_memory() + + hard_update(self.target_model, self.model) + if self.use_cuda: + self.target_model = self.target_model.cuda() + + self.predictor = predictor + + if optimizer is None: + parameters = list(self.model.parameters())+list(self.predictor.parameters()) + # Tuning learning rate with respect to the number of actors: + # Following: https://arxiv.org/abs/1705.04862 + lr = kwargs['learning_rate'] + if kwargs['lr_account_for_nbr_actor']: + lr *= self.nbr_actor + print(f"Learning rate: {lr}") + self.optimizer = optim.Adam(parameters, lr=lr, betas=(0.9,0.999), eps=kwargs['adam_eps']) + else: self.optimizer = optimizer + + self.loss_fn = dqn_loss.compute_loss + if self.double: + self.loss_fn=ddqn_loss.compute_loss, + print(f"WARNING: loss_fn is {self.loss_fn}") + + # DEPRECATED in order to allow extra_inputs infos + # stored in the rnn_states that acts as frame_states... + #self.recurrent = False + self.recurrent = True + # TECHNICAL DEBT: check for recurrent property by looking at the modules in the model rather than relying on the kwargs that may contain + # elements that do not concern the model trained by this algorithm, given that it is now use-able inside I2A... 
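One possible way to address the technical-debt note above (detecting recurrence from the model itself rather than from kwargs) is to inspect the module tree. The helper below is a hedged sketch, not part of this diff; the name `has_recurrent_submodule` and the set of module types it checks are assumptions.

import torch.nn as nn


def has_recurrent_submodule(model: nn.Module) -> bool:
    # Inspect the actual module tree rather than hyperparameter strings:
    # any LSTM/GRU (or cell variant) anywhere in the model makes it recurrent.
    recurrent_types = (nn.LSTM, nn.GRU, nn.LSTMCell, nn.GRUCell, nn.RNN, nn.RNNCell)
    return any(isinstance(m, recurrent_types) for m in model.modules())


# Hypothetical usage inside __init__, replacing the kwargs-based check:
#   self.recurrent = has_recurrent_submodule(self.model)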
+ self.recurrent_nn_submodule_names = [hyperparameter for hyperparameter, value in self.kwargs.items() if isinstance(value, str) and 'RNN' in value] + if len(self.recurrent_nn_submodule_names): self.recurrent = True + + self.storages = None + self.reset_storages() + + self.min_capacity = int(float(kwargs["min_capacity"])) + self.batch_size = int(kwargs["batch_size"]) + + self.TAU = float(self.kwargs['tau']) + self.target_update_interval = int(1.0/self.TAU) + self.target_update_count = 0 + self.GAMMA = float(kwargs["discount"]) + + """ + self.epsend = float(kwargs['epsend']) + self.epsstart = float(kwargs['epsstart']) + self.epsdecay = float(kwargs['epsdecay']) + self.eps = self.epsstart + """ + + # Eps-greedy approach blends in two different schemes, from two different papers: + # - Ape-X eps-greedy scheme, + # - DQN eps-greedy scheme + # (retrieved by setting eps_greedy_alpha=0.0, i.e. all actors have the same epsilon). + self.eps_greedy_alpha = float(kwargs['eps_greedy_alpha']) if 'eps_greedy_alpha' in kwargs else 0.0 + self.reset_epsilon() + + global summary_writer + if sum_writer is not None: summary_writer = sum_writer + self.summary_writer = summary_writer + if regym.RegymManager is not None: + from regym import RaySharedVariable + try: + self._param_update_counter = ray.get_actor(f"{self.name}.param_update_counter") + except ValueError: # Name is not taken. + self._param_update_counter = RaySharedVariable.options(name=f"{self.name}.param_update_counter").remote(0) + else: + from regym import SharedVariable + self._param_update_counter = SharedVariable(0) + + def parameters(self): + return self.model.parameters() + + @property + def param_update_counter(self): + if isinstance(self._param_update_counter, ray.actor.ActorHandle): + return ray.get(self._param_update_counter.get.remote()) + else: + return self._param_update_counter.get() + + @param_update_counter.setter + def param_update_counter(self, val): + if isinstance(self._param_update_counter, ray.actor.ActorHandle): + self._param_update_counter.set.remote(val) + else: + self._param_update_counter.set(val) + + def get_models(self): + return {'model': self.model, 'target_model': self.target_model} + + def set_models(self, models_dict): + if "model" in models_dict: + hard_update(self.model, models_dict["model"]) + if "target_model" in models_dict: + hard_update(self.target_model, models_dict["target_model"]) + + def get_nbr_actor(self): + return self.nbr_actor + + def get_update_count(self): + return self.param_update_counter + + def reset_epsilon(self): + self.epsend = self.kwargs['epsend'] + self.epsstart = self.kwargs['epsstart'] + self.epsdecay = self.kwargs['epsdecay'] + if not isinstance(self.epsend, list): self.epsend = [float(self.epsend)] + if not isinstance(self.epsstart, list): self.epsstart = [float(self.epsstart)] + if not isinstance(self.epsdecay, list): self.epsdecay = [float(self.epsdecay)] + + # Ape-X eps-greedy scheme is used to setup the missing values: + # i.e. if there is only one value specified in the yaml file, + # then the effective initialisation of the eps-greedy scheme + # will be that of the Ape-X paper. 
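To make the initialisation concrete before the while-loops that follow, here is a small worked example of the per-actor scheme they implement, epsilon_i = epsilon ** (1 + eps_greedy_alpha * i / (nbr_actor - 1)). The base epsilon of 0.4 and alpha of 7 are the values reported in the Ape-X paper and are used here only for illustration; this diff defaults eps_greedy_alpha to 0.0, which keeps every actor at the same epsilon.

import numpy as np

nbr_actor = 4
base_eps = 0.4          # epsilon shared by all actors when eps_greedy_alpha == 0.0
eps_greedy_alpha = 7.0  # Ape-X exponent; larger values spread the epsilons further apart

# Per-actor epsilons: actor 0 keeps the base value, later actors explore less.
eps_per_actor = np.array([
    np.power(base_eps, 1 + eps_greedy_alpha * (i / (nbr_actor - 1)))
    for i in range(nbr_actor)
])
print(eps_per_actor)  # approximately [0.4, 0.047, 0.0056, 0.00066]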
+ while len(self.epsend) < self.nbr_actor: + self.epsend.append( + np.power( + self.epsend[0], + 1+self.eps_greedy_alpha*(len(self.epsend)/(self.nbr_actor-1)) + ) + ) + while len(self.epsstart) < self.nbr_actor: + self.epsstart.append( + np.power( + self.epsstart[0], + 1+self.eps_greedy_alpha*(len(self.epsstart)/(self.nbr_actor-1)) + ) + ) + + # Decaying epsilon scheme can still be applied independently of the initialisation scheme. + # e.g. setting epsend to your actural epsilon value, and epsdecay to 1, with any value of epsstart. + while len(self.epsdecay) < self.nbr_actor: self.epsdecay.append(self.epsdecay[0]) + + self.epsend = np.array(self.epsend)[:self.nbr_actor] + self.epsstart = np.array(self.epsstart)[:self.nbr_actor] + self.epsdecay = np.array(self.epsdecay)[:self.nbr_actor] + + self.eps = self.epsstart + + def get_epsilon(self, nbr_steps, strategy='exponential'): + global summary_writer + if self.summary_writer is None: + self.summary_writer = summary_writer + + if 'exponential' in strategy: + self.eps = self.epsend + (self.epsstart-self.epsend) * np.exp(-1.0 * nbr_steps / self.epsdecay) + else: + self.eps = self.epsend + max(0, (self.epsstart-self.epsend)/((float(nbr_steps)/self.epsdecay)+1)) + + """ + if self.summary_writer is not None: + for actor_i in range(self.eps.shape[0]): + self.summary_writer.add_scalar(f'Training/Eps_Actor_{actor_i}', self.eps[actor_i], nbr_steps) + """ + return self.eps + + def reset_storages(self, nbr_actor: int=None): + if nbr_actor is not None: + self.nbr_actor = nbr_actor + + if self.n_step > 1: + self.n_step_buffers = [deque(maxlen=self.n_step) for _ in range(self.nbr_actor)] + + if self.storages is not None: + for storage in self.storages: storage.reset() + + self.storages = [] + keys = ['s', 'a', 'r', 'non_terminal'] + if self.recurrent: keys += ['rnn_states'] + + """ + # depr : goal update + if self.goal_oriented: keys += ['g'] + """ + + circular_keys={'succ_s':'s'} + circular_offsets={'succ_s':self.n_step} + if self.recurrent: + circular_keys.update({'next_rnn_states':'rnn_states'}) + circular_offsets.update({'next_rnn_states':1}) + + beta_increase_interval = None + if 'PER_beta_increase_interval' in self.kwargs and self.kwargs['PER_beta_increase_interval']!='None': + beta_increase_interval = float(self.kwargs['PER_beta_increase_interval']) + + for i in range(self.nbr_actor): + if self.kwargs['use_PER']: + self.storages.append( + PrioritizedReplayStorage( + capacity=self.kwargs['replay_capacity']//self.nbr_actor, + alpha=self.kwargs['PER_alpha'], + beta=self.kwargs['PER_beta'], + beta_increase_interval=beta_increase_interval, + keys=keys, + circular_keys=circular_keys, + circular_offsets=circular_offsets + ) + ) + else: + self.storages.append( + ReplayStorage( + capacity=self.kwargs['replay_capacity']//self.nbr_actor, + keys=keys, + circular_keys=circular_keys, + circular_offsets=circular_offsets + ) + ) + + def stored_experiences(self): + self.train_request_count += 1 + if isinstance(self.storages[0], ray.actor.ActorHandle): + nbr_stored_experiences = sum([ray.get(storage.__len__.remote()) for storage in self.storages]) + else: + nbr_stored_experiences = sum([len(storage) for storage in self.storages]) + + global summary_writer + if self.summary_writer is None: + self.summary_writer = summary_writer + if self.summary_writer is not None: + self.summary_writer.add_scalar('PerTrainingRequest/NbrStoredExperiences', nbr_stored_experiences, self.train_request_count) + + return nbr_stored_experiences + + def 
_compute_truncated_n_step_return(self): + truncated_n_step_return = self.n_step_buffer[-1]['r'] + for exp_dict in reversed(list(self.n_step_buffer)[:-1]): + truncated_n_step_return = exp_dict['r'] + self.GAMMA * truncated_n_step_return * exp_dict['non_terminal'] + return truncated_n_step_return + + def store(self, exp_dict, actor_index=0): + ''' + Compute n-step returns, for each actor, separately, + and then store the experience in the relevant-actor's storage. + ''' + if self.n_step>1: + # Append to deque: + self.n_step_buffers[actor_index].append(exp_dict) + if len(self.n_step_buffers[actor_index]) < self.n_step: + return + # Compute n-step return of the first element of deque: + truncated_n_step_return = self._compute_truncated_n_step_return() + # Retrieve the first element of deque: + current_exp_dict = copy.deepcopy(self.n_step_buffers[actor_index][0]) + current_exp_dict['r'] = truncated_n_step_return + else: + current_exp_dict = exp_dict + """ + # depr : goal update + if self.goal_oriented and 'g' not in current_exp_dict: + current_exp_dict['g'] = current_exp_dict['goals']['desired_goals']['s'] + """ + + if self.use_PER: + init_sampling_priority = None + self.storages[actor_index].add(current_exp_dict, priority=init_sampling_priority) + else: + self.storages[actor_index].add(current_exp_dict) + + def train(self, minibatch_size:int=None): + global summary_writer + if self.summary_writer is None: + self.summary_writer = summary_writer + + if minibatch_size is None: minibatch_size = self.batch_size + + self.target_update_count += self.nbr_actor + + start = time.time() + samples = self.retrieve_values_from_storages(minibatch_size=minibatch_size) + end = time.time() + + if self.summary_writer is not None: + self.summary_writer.add_scalar('PerUpdate/TimeComplexity/RetrieveValuesFn', end-start, self.param_update_counter) + + + if self.noisy: + self.model.reset_noise() + self.target_model.reset_noise() + + start = time.time() + self.optimize_model(minibatch_size, samples) + end = time.time() + + if self.summary_writer is not None: + self.summary_writer.add_scalar('PerUpdate/TimeComplexity/OptimizeModelFn', end-start, self.param_update_counter) + + if self.target_update_count > self.target_update_interval: + self.target_update_count = 0 + hard_update(self.target_model,self.model) + + def retrieve_values_from_storages(self, minibatch_size: int): + ''' + Each storage stores in their key entries either numpy arrays or hierarchical dictionnaries of numpy arrays. + This function samples from each storage, concatenate the sampled elements on the batch dimension, + and maintains the hierarchy of dictionnaries. 
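A brief, hedged check of the truncated n-step return built by `_compute_truncated_n_step_return` and `store` above: the deque is folded from newest to oldest as R <- r_i + gamma * non_terminal_i * R, which for an uninterrupted window of three transitions reduces to r_0 + gamma*r_1 + gamma^2*r_2. The standalone function below restates that recursion outside the class purely for illustration.

from collections import deque

import torch

GAMMA = 0.99


def truncated_n_step_return(n_step_buffer: deque) -> torch.Tensor:
    # Fold the buffered transitions from newest to oldest:
    # R <- r_i + gamma * non_terminal_i * R, starting from the newest reward.
    ret = n_step_buffer[-1]['r']
    for exp in reversed(list(n_step_buffer)[:-1]):
        ret = exp['r'] + GAMMA * ret * exp['non_terminal']
    return ret


# Three uninterrupted transitions with rewards 1, 0, 2:
buf = deque([
    {'r': torch.tensor([1.0]), 'non_terminal': torch.tensor([1.0])},
    {'r': torch.tensor([0.0]), 'non_terminal': torch.tensor([1.0])},
    {'r': torch.tensor([2.0]), 'non_terminal': torch.tensor([1.0])},
], maxlen=3)
# 1 + 0.99*0 + 0.99**2 * 2 = 2.9602
assert torch.allclose(truncated_n_step_return(buf), torch.tensor([2.9602]))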
+ ''' + keys=['s', 'a', 'succ_s', 'r', 'non_terminal'] + + fulls = {} + + if self.use_PER: + fulls['importanceSamplingWeights'] = [] + + if self.recurrent: + keys += ['rnn_states', 'next_rnn_states'] + + """ + # depr : goal update + if self.goal_oriented: + keys += ['g'] + """ + + for key in keys: fulls[key] = [] + + using_ray = isinstance(self.storages[0], ray.actor.ActorHandle) + for storage in self.storages: + # Check that there is something in the storage + if using_ray: + storage_size = ray.get(storage.__len__.remote()) + else: + storage_size = len(storage) + + if storage_size <= 1: continue + #if len(storage) <= 1: continue + if self.use_PER: + if using_ray: + sample, importanceSamplingWeights = ray.get( + storage.sample.remote(batch_size=minibatch_size, keys=keys) + ) + else: + sample, importanceSamplingWeights = storage.sample(batch_size=minibatch_size, keys=keys) + importanceSamplingWeights = torch.from_numpy(importanceSamplingWeights) + fulls['importanceSamplingWeights'].append(importanceSamplingWeights) + else: + sample = storage.sample(batch_size=minibatch_size, keys=keys) + + values = {} + for key, value in zip(keys, sample): + value = value.tolist() + if isinstance(value[0], dict): + value = _concatenate_list_hdict( + lhds=value, + concat_fn=partial(torch.cat, dim=0), # concatenate on the unrolling dimension (axis=1). + preprocess_fn=(lambda x:x), + ) + else: + value = torch.cat(value, dim=0) + values[key] = value + + for key, value in values.items(): + fulls[key].append(value) + + for key, value in fulls.items(): + if len(value) >1: + if isinstance(value[0], dict): + value = _concatenate_list_hdict( + lhds=value, + concat_fn=partial(torch.cat, dim=0), # concatenate on the unrolling dimension (axis=1). + preprocess_fn=(lambda x:x), + ) + else: + value = torch.cat(value, dim=0) + else: + value = value[0] + + fulls[key] = value + + return fulls + + def optimize_model(self, minibatch_size: int, samples: Dict, optimisation_minibatch_size:int=None): + global summary_writer + if self.summary_writer is None: + self.summary_writer = summary_writer + + if optimisation_minibatch_size is None: + optimisation_minibatch_size = minibatch_size*self.nbr_actor + + start = time.time() + + #beta = self.storages[0].get_beta() if self.use_PER else 1.0 + beta = 1.0 + if self.use_PER: + if hasattr(self.storages[0].get_beta, "remote"): + beta_id = self.storages[0].get_beta.remote() + beta = ray.get(beta_id) + else: + beta = self.storages[0].get_beta() + + states = samples['s'] + actions = samples['a'] + next_states = samples['succ_s'] + rewards = samples['r'] + non_terminals = samples['non_terminal'] + + rnn_states = samples['rnn_states'] if 'rnn_states' in samples else None + next_rnn_states = samples['next_rnn_states'] if 'next_rnn_states' in samples else None + goals = samples['g'] if 'g' in samples else None + + importanceSamplingWeights = samples['importanceSamplingWeights'] if 'importanceSamplingWeights' in samples else None + + # For each actor, there is one mini_batch update: + sampler = random_sample(np.arange(states.size(0)), optimisation_minibatch_size) + list_batch_indices = [storage_idx*minibatch_size+np.arange(minibatch_size) \ + for storage_idx, _ in enumerate(self.storages)] + array_batch_indices = np.concatenate(list_batch_indices, axis=0) + sampled_batch_indices = [] + sampled_losses_per_item = [] + + for batch_indices in sampler: + batch_indices = torch.from_numpy(batch_indices).long() + sampled_batch_indices.append(batch_indices) + + sampled_rnn_states = None + 
sampled_next_rnn_states = None + if self.recurrent: + sampled_rnn_states, sampled_next_rnn_states = self.sample_from_rnn_states( + rnn_states, + next_rnn_states, + batch_indices, + use_cuda=self.kwargs['use_cuda'] + ) + # (batch_size, unroll_dim, ...) + + sampled_goals = None + """ + # depr : goal update + if self.goal_oriented: + sampled_goals = goals[batch_indices].cuda() if self.kwargs['use_cuda'] else goals[batch_indices] + """ + + sampled_importanceSamplingWeights = None + if self.use_PER: + sampled_importanceSamplingWeights = importanceSamplingWeights[batch_indices].cuda() if self.kwargs['use_cuda'] else importanceSamplingWeights[batch_indices] + + sampled_states = states[batch_indices].cuda() if self.kwargs['use_cuda'] else states[batch_indices] + sampled_actions = actions[batch_indices].cuda() if self.kwargs['use_cuda'] else actions[batch_indices] + sampled_next_states = next_states[batch_indices].cuda() if self.kwargs['use_cuda'] else next_states[batch_indices] + sampled_rewards = rewards[batch_indices].cuda() if self.kwargs['use_cuda'] else rewards[batch_indices] + sampled_non_terminals = non_terminals[batch_indices].cuda() if self.kwargs['use_cuda'] else non_terminals[batch_indices] + # (batch_size, unroll_dim, ...) + + self.optimizer.zero_grad() + """ + if self.double or self.dueling: + loss, loss_per_item = ddqn_ther_loss.compute_loss(sampled_states, + sampled_actions, + sampled_next_states, + sampled_rewards, + sampled_non_terminals, + rnn_states=sampled_rnn_states, + goals=sampled_goals, + gamma=self.GAMMA, + model=self.model, + predictor=self.predictor, + target_model=self.target_model, + weights_decay_lambda=self.weights_decay_lambda, + use_PER=self.use_PER, + PER_beta=beta, + importanceSamplingWeights=sampled_importanceSamplingWeights, + use_HER=self.use_HER, + iteration_count=self.param_update_counter, + summary_writer=summary_writer) + else: + loss, loss_per_item = dqn_ther_loss.compute_loss(sampled_states, + sampled_actions, + sampled_next_states, + sampled_rewards, + sampled_non_terminals, + rnn_states=sampled_rnn_states, + goals=sampled_goals, + gamma=self.GAMMA, + model=self.model, + predictor=self.predictor, + target_model=self.target_model, + weights_decay_lambda=self.weights_decay_lambda, + use_PER=self.use_PER, + PER_beta=beta, + importanceSamplingWeights=sampled_importanceSamplingWeights, + use_HER=self.use_HER, + iteration_count=self.param_update_counter, + summary_writer=summary_writer) + """ + import ipdb; ipdb.set_trace() + loss, loss_per_item = self.loss_fn( + sampled_states, + sampled_actions, + sampled_next_states, + sampled_rewards, + sampled_non_terminals, + rnn_states=sampled_rnn_states, + next_rnn_states=sampled_next_rnn_states, + goals=sampled_goals, + gamma=self.GAMMA, + model=self.model, + #predictor=self.predictor, + target_model=self.target_model, + weights_decay_lambda=self.weights_decay_lambda, + weights_entropy_lambda=self.weights_entropy_lambda, + use_PER=self.use_PER, + PER_beta=beta, + importanceSamplingWeights=sampled_importanceSamplingWeights, + #HER_target_clamping=self.kwargs['HER_target_clamping'] if 'HER_target_clamping' in self.kwargs else False, + use_HER=self.use_HER, + iteration_count=self.param_update_counter, + summary_writer=self.summary_writer, + kwargs=self.kwargs + ) + + loss.backward(retain_graph=False) + if self.kwargs['gradient_clip'] > 1e-3: + nn.utils.clip_grad_norm_(self.model.parameters(), self.kwargs['gradient_clip']) + nn.utils.clip_grad_norm_(self.predictor.parameters(), self.kwargs['gradient_clip']) + 
self.optimizer.step() + + if self.use_PER: + sampled_losses_per_item.append(loss_per_item) + if self.summary_writer is not None: + self.summary_writer.add_scalar('PerUpdate/ImportanceSamplingMean', sampled_importanceSamplingWeights.cpu().mean().item(), self.param_update_counter) + self.summary_writer.add_scalar('PerUpdate/ImportanceSamplingStd', sampled_importanceSamplingWeights.cpu().std().item(), self.param_update_counter) + self.summary_writer.add_scalar('PerUpdate/PER_Beta', beta, self.param_update_counter) + + self.param_update_counter += 1 + + if self.use_PER : + sampled_batch_indices = np.concatenate(sampled_batch_indices, axis=0) + # let us align the batch indices with the losses: + array_batch_indices = array_batch_indices[sampled_batch_indices] + # Now we can iterate through the losses and retrieve what + # storage and what batch index they were associated with: + self._update_replay_buffer_priorities( + sampled_losses_per_item=sampled_losses_per_item, + array_batch_indices=array_batch_indices, + minibatch_size=minibatch_size, + ) + + end = time.time() + if self.summary_writer is not None: + self.summary_writer.add_scalar('PerUpdate/TimeComplexity/OptimizationLoss', end-start, self.param_update_counter) + self.summary_writer.flush() + + + def compute_td_error(self, samples: Dict): + global summary_writer + if self.summary_writer is None: + self.summary_writer = summary_writer + + start = time.time() + + beta = 1.0 + # if self.use_PER: + # if hasattr(self.storages[0].get_beta, "remote"): + # beta_id = self.storages[0].get_beta.remote() + # beta = ray.get(beta_id) + # else: + # beta = self.storages[0].get_beta() + + states = samples['s'] + actions = samples['a'] + next_states = samples['succ_s'] + rewards = samples['r'] + non_terminals = samples['non_terminal'] + + rnn_states = samples['rnn_states'] if 'rnn_states' in samples else None + next_rnn_states = samples['next_rnn_states'] if 'next_rnn_states' in samples else None + goals = samples['g'] if 'g' in samples else None + + #importanceSamplingWeights = samples['importanceSamplingWeights'] if 'importanceSamplingWeights' in samples else None + + batch_indices = torch.arange(states.shape[0]) + + sampled_rnn_states = None + sampled_next_rnn_states = None + if self.recurrent: + sampled_rnn_states, sampled_next_rnn_states = self.sample_from_rnn_states( + rnn_states, + next_rnn_states, + batch_indices, + use_cuda=self.kwargs['use_cuda'] + ) + # (batch_size, unroll_dim, ...) + + sampled_goals = None + + """ + # depr : goal update + if self.goal_oriented: + sampled_goals = goals[batch_indices].cuda() if self.kwargs['use_cuda'] else goals[batch_indices] + """ + + sampled_importanceSamplingWeights = None + # if self.use_PER: + # sampled_importanceSamplingWeights = importanceSamplingWeights[batch_indices].cuda() if self.kwargs['use_cuda'] else importanceSamplingWeights[batch_indices] + + sampled_states = states[batch_indices].cuda() if self.kwargs['use_cuda'] else states[batch_indices] + sampled_actions = actions[batch_indices].cuda() if self.kwargs['use_cuda'] else actions[batch_indices] + sampled_next_states = next_states[batch_indices].cuda() if self.kwargs['use_cuda'] else next_states[batch_indices] + sampled_rewards = rewards[batch_indices].cuda() if self.kwargs['use_cuda'] else rewards[batch_indices] + sampled_non_terminals = non_terminals[batch_indices].cuda() if self.kwargs['use_cuda'] else non_terminals[batch_indices] + # (batch_size, unroll_dim, ...) 
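The importance-sampling statistics logged in optimize_model above enter the loss as per-transition multipliers. For reference, a generic, hedged sketch of the standard PER correction (w_i = (N * P(i)) ** (-beta), normalised by the maximum weight); this is the textbook formula, not regym's own storage implementation, and `per_importance_weights` is an illustrative name.

import numpy as np


def per_importance_weights(priorities: np.ndarray, beta: float) -> np.ndarray:
    # Standard prioritized-experience-replay correction:
    # P(i) = p_i / sum_j p_j,  w_i = (N * P(i)) ** (-beta), normalised by max w.
    probs = priorities / priorities.sum()
    weights = (len(priorities) * probs) ** (-beta)
    return weights / weights.max()


# Uniform priorities give uniform weights; skewed priorities down-weight
# the frequently sampled (high-priority) transitions as beta grows towards 1.
print(per_importance_weights(np.array([1.0, 1.0, 1.0, 1.0]), beta=0.6))  # [1. 1. 1. 1.]
print(per_importance_weights(np.array([4.0, 1.0, 1.0, 1.0]), beta=0.6))  # high-priority item gets the smallest weight

In practice such weights multiply the per-item TD losses before the mean is taken, which is the role of the `importanceSamplingWeights` argument passed into the loss function here.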
+ + loss, loss_per_item = self.loss_fn( + sampled_states, + sampled_actions, + sampled_next_states, + sampled_rewards, + sampled_non_terminals, + rnn_states=sampled_rnn_states, + next_rnn_states=sampled_next_rnn_states, + goals=sampled_goals, + gamma=self.GAMMA, + model=self.model, + target_model=self.target_model, + weights_decay_lambda=self.weights_decay_lambda, + weights_entropy_lambda=self.weights_entropy_lambda, + use_PER=self.use_PER, + PER_beta=beta, + importanceSamplingWeights=sampled_importanceSamplingWeights, + HER_target_clamping=self.kwargs['HER_target_clamping'] if 'HER_target_clamping' in self.kwargs else False, + iteration_count=self.param_update_counter, + summary_writer=None, + kwargs=self.kwargs + ) + + end = time.time() + + if self.summary_writer is not None: + self.summary_writer.add_scalar('PerUpdate/TimeComplexity/TDErrorComputation', end-start, self.param_update_counter) + self.summary_writer.flush() + + return loss, loss_per_item + + def sample_from_rnn_states(self, rnn_states, next_rnn_states, batch_indices, use_cuda): + sampled_rnn_states = _extract_rnn_states_from_batch_indices(rnn_states, batch_indices, use_cuda=self.kwargs['use_cuda']) + sampled_next_rnn_states = _extract_rnn_states_from_batch_indices(next_rnn_states, batch_indices, use_cuda=self.kwargs['use_cuda']) + return sampled_rnn_states, sampled_next_rnn_states + + def _update_replay_buffer_priorities(self, + sampled_losses_per_item: List[torch.Tensor], + array_batch_indices: List, + minibatch_size: int): + ''' + Updates the priorities of each sampled elements from their respective storages. + + TODO: update to useing Ray and get_tree_indices + ''' + # losses corresponding to sampled batch indices: + sampled_losses_per_item = torch.cat(sampled_losses_per_item, dim=0).cpu().detach().numpy() + for sloss, arr_bidx in zip(sampled_losses_per_item, array_batch_indices): + storage_idx = arr_bidx//minibatch_size + el_idx_in_batch = arr_bidx%minibatch_size + el_idx_in_storage = self.storages[storage_idx].tree_indices[el_idx_in_batch] + new_priority = self.storages[storage_idx].priority(sloss) + self.storages[storage_idx].update(idx=el_idx_in_storage, priority=new_priority) + + def clone(self, with_replay_buffer: bool=False, clone_proxies: bool=False, minimal=False): + if not(with_replay_buffer): + storages = self.storages + self.storages = None + + sum_writer = self.summary_writer + self.summary_writer = None + + param_update_counter = self._param_update_counter + self._param_update_counter = None + + cloned_algo = copy.deepcopy(self) + + if minimal: + cloned_algo.target_model = None + + if not(with_replay_buffer): + self.storages = storages + + self.summary_writer = sum_writer + + self._param_update_counter = param_update_counter + cloned_algo._param_update_counter = param_update_counter + + # Goes through all variables 'Proxy' (dealing with multiprocessing) + # contained in this class and removes them from clone + if not(clone_proxies): + proxy_key_values = [ + (key, value) + for key, value in cloned_algo.__dict__.items() + if ('Proxy' in str(type(value))) + ] + for key, value in proxy_key_values: + setattr(cloned_algo, key, None) + + return cloned_algo + + def async_actor(self): + storages = self.storages + self.storages = None + + sum_writer = self.summary_writer + self.summary_writer = None + + param_update_counter = self._param_update_counter + self._param_update_counter = None + + cloned_algo = copy.deepcopy(self) + + self.storages = storages + cloned_algo.storages = storages + + self.summary_writer = 
sum_writer + cloned_algo.summary_writer = sum_writer + + self._param_update_counter = param_update_counter + cloned_algo._param_update_counter = param_update_counter + + return cloned_algo + diff --git a/regym/rl_algorithms/algorithms/algorithm.py b/regym/rl_algorithms/algorithms/algorithm.py index 13bb2daf..493a0d7c 100644 --- a/regym/rl_algorithms/algorithms/algorithm.py +++ b/regym/rl_algorithms/algorithms/algorithm.py @@ -10,6 +10,7 @@ class Algorithm(object): def __init__(self, name="algo"): self.name = name + self.unwrapped = self def get_models(self): raise NotImplementedError @@ -23,5 +24,8 @@ def get_nbr_actor(self): def get_update_count(self): raise NotImplementedError + def parameters(self): + raise NotImplementedError + def clone(self, with_replay_buffer=False): raise NotImplementedError \ No newline at end of file diff --git a/regym/rl_algorithms/algorithms/wrappers/__init__.py b/regym/rl_algorithms/algorithms/wrappers/__init__.py index 013ee45b..1edbb5be 100644 --- a/regym/rl_algorithms/algorithms/wrappers/__init__.py +++ b/regym/rl_algorithms/algorithms/wrappers/__init__.py @@ -1,3 +1,6 @@ from .algorithm_wrapper import AlgorithmWrapper from .her_wrapper import HERAlgorithmWrapper, latent_based_goal_predicated_reward_fn -from .ther_wrapper import THERAlgorithmWrapper, predictor_based_goal_predicated_reward_fn \ No newline at end of file +from .her_wrapper2 import HERAlgorithmWrapper2, latent_based_goal_predicated_reward_fn2 + +from .ther_wrapper import THERAlgorithmWrapper, predictor_based_goal_predicated_reward_fn +from .ther_wrapper2 import THERAlgorithmWrapper2, predictor_based_goal_predicated_reward_fn2 \ No newline at end of file diff --git a/regym/rl_algorithms/algorithms/wrappers/algorithm_wrapper.py b/regym/rl_algorithms/algorithms/wrappers/algorithm_wrapper.py index c269ebdc..de08d710 100644 --- a/regym/rl_algorithms/algorithms/wrappers/algorithm_wrapper.py +++ b/regym/rl_algorithms/algorithms/wrappers/algorithm_wrapper.py @@ -3,6 +3,7 @@ class AlgorithmWrapper(Algorithm): def __init__(self, algorithm): self.algorithm = algorithm + self.unwrapped = self.algorithm.unwrapped @property def kwargs(self): diff --git a/regym/rl_algorithms/algorithms/wrappers/her_wrapper2.py b/regym/rl_algorithms/algorithms/wrappers/her_wrapper2.py new file mode 100644 index 00000000..45b827bb --- /dev/null +++ b/regym/rl_algorithms/algorithms/wrappers/her_wrapper2.py @@ -0,0 +1,212 @@ +from typing import Dict, Optional + +import torch +import numpy as np +import copy +from regym.rl_algorithms.algorithms.wrappers.algorithm_wrapper import AlgorithmWrapper +from regym.rl_algorithms.utils import copy_hdict + + +def state_eq_goal_reward_fn2(state, goal, epsilon=1e-3): + if torch.abs(state-goal).mean() < epsilon: + return torch.zeros(1), state + else: + return -torch.ones(1), state + + +def state_eq_goal_reward_fn2(achieved_exp, desired_exp, _extract_goals_from_info_fn, epsilon=1e-3): + state = achieved_exp['succ_s'] + #goal = desired_exp['goals']['achieved_goals']['s'] + #goal = _extract_goals_from_rnn_states_fn(desired_exp['info'], goal_key="achieved_goal") + #goal = torch.from_numpy(desired_exp['info']['achieved_goal']).float() + goal = _extract_goals_from_info_fn(desired_exp['info'], goal_key="achieved_goal") + if torch.abs(state-goal).mean() < epsilon: + return torch.zeros(1,1), goal + else: + return -torch.ones(1,1), goal + + +def latent_based_goal_predicated_reward_fn2(achieved_exp, desired_exp, epsilon=1e-3): + raise NotImplementedError + state = 
achieved_exp['info']['latents']['succ_s'] + goal = desired_exp['info']['latents']['achieved_goal'] + abs_fn = torch.abs + if not(isinstance(state, torch.Tensor)): abs_fn = np.abs + if abs_fn(state-goal).mean() < epsilon: + return torch.zeros(1), desired_exp['goals']['achieved_goals']['s'] + else: + return -torch.ones(1), desired_exp['goals']['achieved_goals']['s'] + +class HERAlgorithmWrapper2(AlgorithmWrapper): + def __init__(self, algorithm, extra_inputs_infos, strategy="future-4", goal_predicated_reward_fn=None): + super(HERAlgorithmWrapper2, self).__init__(algorithm=algorithm) + if goal_predicated_reward_fn is None: goal_predicated_reward_fn = state_eq_goal_reward_fn2 + + self.extra_inputs_infos = extra_inputs_infos + + self.episode_buffer = [[] for i in range(self.algorithm.get_nbr_actor())] + self.strategy = strategy + assert( ('future' in self.strategy or 'final' in self.strategy) and '-' in self.strategy) + self.k = int(self.strategy.split('-')[-1]) + self.goal_predicated_reward_fn = goal_predicated_reward_fn + self.episode_count = 0 + + def _update_goals_in_rnn_states(self, hdict:Dict, goal_value:torch.Tensor, goal_key:Optional[str]='desired_goal'): + if goal_key in self.extra_inputs_infos: + if not isinstance(self.extra_inputs_infos[goal_key]['target_location'][0], list): + self.extra_inputs_infos[goal_key]['target_location'] = [self.extra_inputs_infos[goal_key]['target_location']] + for tl in self.extra_inputs_infos[goal_key]['target_location']: + pointer = hdict + for child_node in tl: + if child_node not in pointer: + pointer[child_node] = {} + pointer = pointer[child_node] + pointer[goal_key] = [goal_value] + return hdict + + def _extract_goals_from_rnn_states(self, hdict:Dict, goal_key:Optional[str]='desired_goal'): + import ipdb; ipdb.set_trace() + assert goal_key in self.extra_inputs_infos + tl = self.extra_inputs_infos[goal_key]['target_location'][-1] + pointer = hdict + for child_node in tl: + if child_node not in pointer: + pointer[child_node] = {} + pointer = pointer[child_node] + return pointer[goal_key] + + def _extract_goals_from_info(self, hdict:Dict, goal_key:Optional[str]='desired_goal'): + assert goal_key in hdict + value = hdict[goal_key] + postprocess_fn=(lambda x:torch.from_numpy(x).float() if isinstance(x, np.ndarray) else torch.ones(1, 1).float()*x) + return postprocess_fn(value) + + def store(self, exp_dict, actor_index=0): + self.episode_buffer[actor_index].append(exp_dict) + + if not(exp_dict['non_terminal']): + episode_length = len(self.episode_buffer[actor_index]) + per_episode_d2store = {} + + for idx in range(episode_length): + s = self.episode_buffer[actor_index][idx]['s'] + a = self.episode_buffer[actor_index][idx]['a'] + r = self.episode_buffer[actor_index][idx]['r'] + succ_s = self.episode_buffer[actor_index][idx]['succ_s'] + non_terminal = self.episode_buffer[actor_index][idx]['non_terminal'] + + info = self.episode_buffer[actor_index][idx]['info'] + rnn_states = self.episode_buffer[actor_index][idx]['rnn_states'] + + #desired_goal = self.episode_buffer[actor_index][idx]['goals']['desired_goals']['s'] + #desired_goal = info['desired_goal'] + + d2store = { + 's':s, + 'a':a, + 'r':r, + 'succ_s':succ_s, + 'non_terminal':non_terminal, + + 'rnn_states': copy_hdict(rnn_states), + 'info': info, + #'g':desired_goal + } + #self.algorithm.store(d2store, actor_index=actor_index) + if -1 not in per_episode_d2store: per_episode_d2store[-1] = [] + per_episode_d2store[-1].append(d2store) + + if self.algorithm.summary_writer is not None and 
all(non_terminal<=0.5): + self.episode_count += 1 + self.algorithm.summary_writer.add_scalar('PerEpisode/Success', 1+r.mean().item(), self.episode_count) + + + for k in range(self.k): + d2store = {} + if 'final' in self.strategy: + #achieved_goal = self.episode_buffer[actor_index][-1]['goals']['achieved_goals']['s'] + + achieved_exp = self.episode_buffer[actor_index][idx] + desired_exp = self.episode_buffer[actor_index][-1] + new_r, achieved_goal = self.goal_predicated_reward_fn( + achieved_exp=achieved_exp, + desired_exp=desired_exp, + _extract_goals_from_info_fn=self._extract_goals_from_info + ) + + #new_non_terminal = torch.zeros(1) if all(new_r>-0.5) else torch.ones(1) + new_non_terminal = torch.zeros_like(non_terminal) if all(new_r>-0.5) else torch.ones_like(non_terminal) + + d2store = { + 's':s, + 'a':a, + 'r':new_r, + 'succ_s':succ_s, + 'non_terminal':new_non_terminal, + 'rnn_states': copy_hdict( + self._update_goals_in_rnn_states( + hdict=rnn_states, + goal_value=achieved_goal, + goal_key='desired_goal', + ) + ), + 'info': info, + #'g':achieved_goal + } + + if self.algorithm.summary_writer is not None: + self.algorithm.summary_writer.add_scalar('PerUpdate/HER_reward_final', new_r.mean().item(), self.algorithm.get_update_count()) + + if 'future' in self.strategy: + future_idx = np.random.randint(idx, episode_length) + #achieved_goal = self.episode_buffer[actor_index][future_idx]['goals']['achieved_goals']['s'] + + achieved_exp = self.episode_buffer[actor_index][idx] + desired_exp = self.episode_buffer[actor_index][future_idx] + new_r, achieved_goal = self.goal_predicated_reward_fn( + achieved_exp=achieved_exp, + desired_exp=desired_exp, + _extract_goals_from_info_fn=self._extract_goals_from_info + ) + + new_non_terminal = torch.zeros_like(non_terminal) if all(new_r>-0.5) else torch.ones_like(non_terminal) + d2store = { + 's':s, + 'a':a, + 'r':new_r, + 'succ_s':succ_s, + 'non_terminal':new_non_terminal, + + 'rnn_states': copy_hdict( + self._update_goals_in_rnn_states( + hdict=rnn_states, + goal_value=achieved_goal, + goal_key='desired_goal', + ) + ), + 'info': info, + #'g':achieved_goal + } + + if self.algorithm.summary_writer is not None: + self.algorithm.summary_writer.add_scalar('PerUpdate/HER_reward_future', new_r.mean().item(), self.algorithm.get_update_count()) + + #self.algorithm.store(d2store, actor_index=actor_index) + if k not in per_episode_d2store: per_episode_d2store[k] = [] + per_episode_d2store[k].append(d2store) + + for key in per_episode_d2store: + for d2store in per_episode_d2store[key]: + self.algorithm.store(d2store, actor_index=actor_index) + + + # Reset episode buffer: + self.episode_buffer[actor_index] = [] + + def clone(self): + return HERAlgorithmWrapper2( + algorithm=self.algorithm.clone(), + extra_inputs_infos=copy.deepcopy(self.extra_inputs_infos), + strategy=self.strategy, + goal_predicated_reward_fn=self.goal_predicated_reward_fn, + ) \ No newline at end of file diff --git a/regym/rl_algorithms/algorithms/wrappers/ther_wrapper2.py b/regym/rl_algorithms/algorithms/wrappers/ther_wrapper2.py new file mode 100644 index 00000000..37f25d9f --- /dev/null +++ b/regym/rl_algorithms/algorithms/wrappers/ther_wrapper2.py @@ -0,0 +1,589 @@ +from typing import Dict, Optional, List + +import torch +import torch.optim as optim +import torch.nn as nn + +import numpy as np +from ..algorithm import Algorithm +from ...networks import random_sample + +from .algorithm_wrapper import AlgorithmWrapper + +from .her_wrapper2 import state_eq_goal_reward_fn2 + +from 
...replay_buffers import PrioritizedReplayStorage, SplitReplayStorage, SplitPrioritizedReplayStorage +from regym.rl_algorithms.utils import _extract_rnn_states_from_batch_indices, _concatenate_hdict, copy_hdict + + +def predictor_based_goal_predicated_reward_fn2(predictor, achieved_exp, desired_exp, _extract_goals_from_info_fn=None, epsilon=1e0): + ''' + Relabelling an unsuccessful trajectory, so the desired_exp's goal is not interesting. + We want to know the goal that is achieved on the desired_exp succ_s / desired_state. + + Comparison between the predicted goal of the achieved state and the desired state + tells us whether the achieved state is achieving the relabelling goal. + + Returns -1 for failure and 0 for success + ''' + state = achieved_exp['succ_s'] + desired_state = desired_exp['succ_s'] + with torch.no_grad(): + achieved_pred_goal = predictor(state).cpu() + desired_pred_goal = predictor(desired_state).cpu() + abs_fn = torch.abs + dist = abs_fn(achieved_pred_goal-desired_pred_goal).float().mean() + if dist < epsilon: + return torch.zeros(1), achieved_pred_goal, desired_pred_goal, dist + else: + return -torch.ones(1), achieved_pred_goal, desired_pred_goal, dist + + +class THERAlgorithmWrapper2(AlgorithmWrapper): + def __init__(self, + algorithm, + predictor, + predictor_loss_fn, + strategy="future-4", + goal_predicated_reward_fn=None, + #rewards={'failure':-1, 'success':0} + rewards={'failure':0, 'success':1} + ): + super(THERAlgorithmWrapper2, self).__init__(algorithm=algorithm) + self.rewards = rewards + + self.predictor = predictor + if self.kwargs['use_cuda']: + self.predictor = self.predictor.cuda() + + self.predictor_loss_fn = predictor_loss_fn + print(f"WARNING: THER loss_fn is {self.predictor_loss_fn}") + + # Tuning learning rate with respect to the number of actors: + # Following: https://arxiv.org/abs/1705.04862 + lr = self.kwargs['THER_predictor_learning_rate'] + if self.kwargs['lr_account_for_nbr_actor']: + lr *= self.nbr_actor + print(f"THER Predictor Learning rate: {lr}") + self.predictor_optimizer = optim.Adam( + self.predictor.parameters(), + lr=lr, betas=(0.9,0.999), + eps=self.kwargs['adam_eps'] + ) + + self.predictor_storages = None + self._reset_predictor_storages() + + self.episode_buffer = [[] for i in range(self.algorithm.get_nbr_actor())] + self.strategy = strategy + assert( ('future' in self.strategy or 'final' in self.strategy) and '-' in self.strategy) + self.k = int(self.strategy.split('-')[-1]) + + if goal_predicated_reward_fn is None: goal_predicated_reward_fn = state_eq_goal_reward_fn2 + self.goal_predicated_reward_fn = goal_predicated_reward_fn + + self.episode_count = 0 + self.param_predictor_update_counter = 0 + + self.nbr_buffered_predictor_experience = 0 + self.nbr_handled_predictor_experience = 0 + self.batch_size = self.kwargs['THER_predictor_batch_size'] + + def _reset_predictor_storages(self): + if self.predictor_storages is not None: + for storage in self.predictor_storages: storage.reset() + + self.predictor_storages = [] + keys = ['s', 'a', 'r', 'non_terminal'] + if self.recurrent: keys += ['rnn_states'] + + circular_keys={'succ_s':'s'} + circular_offsets={'succ_s':self.n_step} + if self.recurrent: + circular_keys.update({'next_rnn_states':'rnn_states'}) + circular_offsets.update({'next_rnn_states':1}) + + beta_increase_interval = None + if 'PER_beta_increase_interval' in self.kwargs and self.kwargs['PER_beta_increase_interval']!='None': + beta_increase_interval = float(self.kwargs['PER_beta_increase_interval']) + + for i in 
range(self.nbr_actor): + if self.kwargs['THER_use_PER']: + self.predictor_storages.append( + SplitPrioritizedReplayStorage( + capacity=self.kwargs['THER_replay_capacity'], + alpha=self.kwargs['THER_PER_alpha'], + beta=self.kwargs['THER_PER_beta'], + keys=keys, + circular_offsets={'succ_s':1}, + test_train_split_interval=self.kwargs['THER_predictor_test_train_split_interval'], + test_capacity=self.kwargs['THER_test_replay_capacity'] + ) + ) + else: + self.predictor_storages.append( + SplitReplayStorage( + capacity=self.kwargs['THER_replay_capacity'], + keys=keys, + circular_offsets={'succ_s':1}, + test_train_split_interval=self.kwargs['THER_predictor_test_train_split_interval'], + test_capacity=self.kwargs['THER_test_replay_capacity'] + ) + ) + + def _update_goals_in_rnn_states(self, hdict:Dict, goal_value:torch.Tensor, goal_key:Optional[str]='desired_goal'): + if goal_key in self.extra_inputs_infos: + if not isinstance(self.extra_inputs_infos[goal_key]['target_location'][0], list): + self.extra_inputs_infos[goal_key]['target_location'] = [self.extra_inputs_infos[goal_key]['target_location']] + for tl in self.extra_inputs_infos[goal_key]['target_location']: + pointer = hdict + for child_node in tl: + if child_node not in pointer: + pointer[child_node] = {} + pointer = pointer[child_node] + pointer[goal_key] = [goal_value] + return hdict + + def _extract_goals_from_rnn_states(self, hdict:Dict, goal_key:Optional[str]='desired_goal'): + import ipdb; ipdb.set_trace() + assert goal_key in self.extra_inputs_infos + tl = self.extra_inputs_infos[goal_key]['target_location'][-1] + pointer = hdict + for child_node in tl: + if child_node not in pointer: + pointer[child_node] = {} + pointer = pointer[child_node] + return pointer[goal_key] + + def _extract_goals_from_info(self, hdict:Dict, goal_key:Optional[str]='desired_goal'): + assert goal_key in hdict + value = hdict[goal_key] + postprocess_fn=(lambda x:torch.from_numpy(x).float() if isinstance(x, np.ndarray) else torch.ones(1, 1).float()*x) + return postprocess_fn(value) + + def store(self, exp_dict, actor_index=0): + self.episode_buffer[actor_index].append(exp_dict) + self.nbr_buffered_predictor_experience += 1 + + successful_traj = False + + if not(exp_dict['non_terminal']): + episode_length = len(self.episode_buffer[actor_index]) + + # Assumes non-successful rewards are non-positive: + successful_traj = all(self.episode_buffer[actor_index][-1]['r']>0) + + # Relabelling if unsuccessfull trajectory: + relabelling = not successful_traj + + episode_rewards = [] + per_episode_d2store = {} + + for idx in range(episode_length): + s = self.episode_buffer[actor_index][idx]['s'] + a = self.episode_buffer[actor_index][idx]['a'] + + r = self.episode_buffer[actor_index][idx]['r'] + # Assumes failure rewards are non-positive: + her_r = self.rewards['success']*torch.ones(1) if all(r>0) else self.rewards['failure']*torch.ones(1) + + succ_s = self.episode_buffer[actor_index][idx]['succ_s'] + non_terminal = self.episode_buffer[actor_index][idx]['non_terminal'] + + info = self.episode_buffer[actor_index][idx]['info'] + rnn_states = self.episode_buffer[actor_index][idx]['rnn_states'] + #desired_goal = self.episode_buffer[actor_index][idx]['goals']['desired_goals']['s'] + + episode_rewards.append(r) + + d2store = { + 's':s, + 'a':a, + 'r':her_r, + 'succ_s':succ_s, + 'non_terminal':non_terminal, + 'rnn_states':copy_hdict(rnn_states), + 'info': info, + #'g':desired_goal, + } + + if not(relabelling): + # Only insert this experience that way if successfull: + 
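Two reward conventions meet in the relabelling block that follows: the goal-predicated functions above return 0 for success and -1 for failure, while the wrapper stores rewards drawn from its `rewards` dict ({'failure': 0, 'success': 1} by default in this diff). The helper below is a hedged restatement of that remapping for clarity; `remap_goal_reward` is illustrative and not code from the diff.

import torch

rewards = {'failure': 0, 'success': 1}  # convention used by THERAlgorithmWrapper2 in this diff


def remap_goal_reward(new_r: torch.Tensor, like: torch.Tensor) -> torch.Tensor:
    # goal_predicated_reward_fn returns 0 for success and -1 for failure;
    # the wrapper re-expresses that in its own rewards dict, shaped like the env reward.
    success = bool((new_r > -0.5).all())
    value = rewards['success'] if success else rewards['failure']
    return value * torch.ones_like(like)


r = torch.zeros(1, 1)                        # placeholder env reward, only used for shape/dtype
print(remap_goal_reward(torch.zeros(1), r))  # tensor([[1.]]) -> relabelled success
print(remap_goal_reward(-torch.ones(1), r))  # tensor([[0.]]) -> relabelled failure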
#self.algorithm.store(d2store, actor_index=actor_index) + if -1 not in per_episode_d2store: per_episode_d2store[-1] = [] + per_episode_d2store[-1].append(d2store) + + # Store data in predictor storages if successfull: + if self.kwargs['THER_use_THER'] and all(r>self.rewards['failure']): + self.predictor_store(d2store, actor_index=actor_index) + self.algorithm.summary_writer.add_scalar('Training/THER_Predictor/DatasetSize', self.nbr_handled_predictor_experience, self.param_predictor_update_counter) + + if self.algorithm.summary_writer is not None and all(non_terminal<=0.5): + self.episode_count += 1 + self.algorithm.summary_writer.add_scalar('PerEpisode/Success', (self.rewards['success']==her_r).float().mean().item(), self.episode_count) + self.algorithm.summary_writer.add_histogram('PerEpisode/Rewards', episode_rewards, self.episode_count) + + + # Are we relabelling? + if not(self.kwargs['THER_use_THER']) or not(relabelling): + continue + + # Relabelling everything with the hindsight_goal computed on the fly, and set the reward accordingly: + for k in range(self.k): + if 'final' in self.strategy: + achieved_exp = self.episode_buffer[actor_index][idx] + desired_exp = self.episode_buffer[actor_index][-1] + + new_r, achieved_pred_goal, desired_pred_goal, dist = self.goal_predicated_reward_fn( + achieved_exp=achieved_exp, + desired_exp=desired_exp, + _extract_goals_from_info_fn=self._extract_goals_from_info, + ) + + # Assumes new_r to be -1 for failure and 0 for success: + #new_her_r = self.rewards['success']*torch.ones(1) if all(new_r>-0.5) else self.rewards['failure']*torch.ones(1) + new_her_r = self.rewards['success']*torch.ones_like(r) if all(new_r>-0.5) else self.rewards['failure']*torch.ones_like(r) + + #new_non_terminal = torch.zeros(1) if all(new_her_r>self.rewards['failure']) else torch.ones(1) + new_non_terminal = torch.zeros_like(non_terminal) if all(new_her_r>self.rewards['failure']) else torch.ones_like(non_terminal) + + d2store_her = { + 's':s, + 'a':a, + 'r':new_her_r, + 'succ_s':succ_s, + 'non_terminal':new_non_terminal, + 'rnn_states': copy_hdict( + self._update_goals_in_rnn_states( + hdict=rnn_states, + goal_value=desired_pred_goal, + goal_key='desired_goal', + ) + ), + 'info': info, + #'g': desired_pred_goal, + } + + if self.algorithm.summary_writer is not None: + self.algorithm.summary_writer.add_scalar('PerUpdate/HER_reward_final', new_her_r.mean().item(), self.algorithm.get_update_count()) + self.algorithm.summary_writer.add_scalar('PerUpdate/HER_reward_dist', dist.mean().item(), self.algorithm.get_update_count()) + + if 'future' in self.strategy: + future_idx = np.random.randint(idx, episode_length) + achieved_exp = self.episode_buffer[actor_index][idx] + desired_exp = self.episode_buffer[actor_index][future_idx] + + new_r, achieved_pred_goal, desired_pred_goal, dist = self.goal_predicated_reward_fn( + achieved_exp=achieved_exp, + desired_exp=desired_exp, + _extract_goals_from_info_fn=self._extract_goals_from_info, + ) + + # Assumes new_r to be -1 for failure and 0 for success: + new_her_r = self.rewards['success']*torch.ones_like(r) if all(new_r>-0.5) else self.rewards['failure']*torch.ones_like(r) + + new_non_terminal = torch.zeros_like(non_terminal) if all(new_her_r>self.rewards['failure']) else torch.ones_like(non_terminal) + + d2store_her = { + 's':s, + 'a':a, + 'r':new_her_r, + 'succ_s':succ_s, + 'non_terminal':new_non_terminal, + + 'rnn_states': copy_hdict( + self._update_goals_in_rnn_states( + hdict=rnn_states, + goal_value=desired_pred_goal, + 
goal_key='desired_goal', + ) + ), + 'info': info, + #'g': desired_pred_goal, + } + + if self.algorithm.summary_writer is not None: + self.algorithm.summary_writer.add_scalar('PerUpdate/HER_reward_future', new_her_r.mean().item(), self.algorithm.get_update_count()) + self.algorithm.summary_writer.add_scalar('PerUpdate/HER_reward_dist', dist.mean().item(), self.algorithm.get_update_count()) + + # Adding this relabelled experience to the replay buffer with 'proper' goal... + #self.algorithm.store(d2store_her, actor_index=actor_index) + if k not in per_episode_d2store: per_episode_d2store[k] = [] + per_episode_d2store[k].append(d2store_her) + + for key in per_episode_d2store: + for d2st in per_episode_d2store[key]: + self.algorithm.store(d2st, actor_index=actor_index) + + # Reset episode buffer: + self.episode_buffer[actor_index] = [] + + period_check = self.kwargs['THER_replay_period'] + period_count_check = self.nbr_buffered_predictor_experience + + # Update predictor: + if self.nbr_handled_predictor_experience >= self.kwargs['THER_min_capacity']\ + and ((period_count_check % period_check == 0) or (self.kwargs['THER_train_on_success'] and successful_traj)): + self.update_predictor() + + def predictor_store(self, exp_dict, actor_index=0): + self.nbr_handled_predictor_experience += 1 + + if self.kwargs['THER_use_PER']: + init_sampling_priority = None + self.predictor_storages[actor_index].add(exp_dict, priority=init_sampling_priority) + else: + self.predictor_storages[actor_index].add(exp_dict) + + def update_predictor(self): + for it in range(self.kwargs['THER_nbr_training_iteration_per_update']): + acc = self.train_predictor() + if acc >= self.kwargs['THER_predictor_accuracy_threshold']: + break + + def train_predictor(self, minibatch_size=None): + if minibatch_size is None: minibatch_size = self.batch_size + + samples = self.retrieve_values_from_predictor_storages(minibatch_size=minibatch_size) + + self.optimize_predictor(minibatch_size, samples) + + test_samples = self.retrieve_values_from_predictor_storages(minibatch_size=minibatch_size, test=True) + + with torch.no_grad(): + acc = self.test_predictor(minibatch_size, test_samples) + + if self.algorithm.summary_writer is not None: + self.algorithm.summary_writer.add_scalar('PerPredictorUpdate/TestSentenceAccuracy', acc, self.param_predictor_update_counter) + + return acc + + def retrieve_values_from_predictor_storages(self, minibatch_size, test=False): + keys=['s', 'a', 'succ_s', 'r', 'non_terminal', 'g'] + + fulls = {} + + if self.kwargs['THER_use_PER']: + fulls['importanceSamplingWeights'] = [] + + if self.recurrent: + keys += ['rnn_states'] + + for key in keys: fulls[key] = [] + + for storage in self.predictor_storages: + # Check that there is something in the storage + if len(storage) <= 1: continue + batch_size = minibatch_size + if batch_size is None: + batch_size = storage.get_size(test=test) + + if self.kwargs['THER_use_PER']: + sample, importanceSamplingWeights = storage.sample(batch_size=batch_size, keys=keys, test=test) + importanceSamplingWeights = torch.from_numpy(importanceSamplingWeights) + fulls['importanceSamplingWeights'].append(importanceSamplingWeights) + else: + sample = storage.sample(batch_size=batch_size, keys=keys, test=test) + + values = {} + for key, value in zip(keys, sample): + value = value.tolist() + if isinstance(value[0], dict): + value = _concatenate_hdict(value.pop(0), value, map_keys=['hidden', 'cell']) + else: + value = torch.cat(value, dim=0) + values[key] = value + + for key, value in values.items(): 
+ if isinstance(value, torch.Tensor): + fulls[key].append(value) + else: + fulls[key] = value + + for key, value in fulls.items(): + if isinstance(value, dict): + fulls[key] = value + else: + fulls[key] = torch.cat(value, dim=0) + + return fulls + + def optimize_predictor(self, minibatch_size, samples): + beta = self.predictor_storages[0].beta if self.kwargs['THER_use_PER'] else 1.0 + + states = samples['s'] + actions = samples['a'] + next_states = samples['succ_s'] + rewards = samples['r'] + non_terminals = samples['non_terminal'] + goals = samples['g'] + + rnn_states = samples['rnn_states'] if 'rnn_states' in samples else None + + importanceSamplingWeights = samples['importanceSamplingWeights'] if 'importanceSamplingWeights' in samples else None + + # For each actor, there is one mini_batch update: + sampler = random_sample(np.arange(states.size(0)), minibatch_size) + list_batch_indices = [storage_idx*minibatch_size+np.arange(minibatch_size) \ + for storage_idx, storage in enumerate(self.predictor_storages)] + array_batch_indices = np.concatenate(list_batch_indices, axis=0) + sampled_batch_indices = [] + sampled_losses_per_item = [] + + for batch_indices in sampler: + batch_indices = torch.from_numpy(batch_indices).long() + sampled_batch_indices.append(batch_indices) + + sampled_rnn_states = None + if self.recurrent: + sampled_rnn_states = _extract_rnn_states_from_batch_indices(rnn_states, batch_indices, use_cuda=self.kwargs['use_cuda']) + + sampled_importanceSamplingWeights = None + if self.kwargs['THER_use_PER']: + sampled_importanceSamplingWeights = importanceSamplingWeights[batch_indices].cuda() if self.kwargs['use_cuda'] else importanceSamplingWeights[batch_indices] + + sampled_states = states[batch_indices].cuda() if self.kwargs['use_cuda'] else states[batch_indices] + sampled_actions = actions[batch_indices].cuda() if self.kwargs['use_cuda'] else actions[batch_indices] + sampled_next_states = next_states[batch_indices].cuda() if self.kwargs['use_cuda'] else next_states[batch_indices] + sampled_rewards = rewards[batch_indices].cuda() if self.kwargs['use_cuda'] else rewards[batch_indices] + sampled_non_terminals = non_terminals[batch_indices].cuda() if self.kwargs['use_cuda'] else non_terminals[batch_indices] + sampled_goals = goals[batch_indices].cuda() if self.kwargs['use_cuda'] else goals[batch_indices] + + self.predictor_optimizer.zero_grad() + + output_dict = self.predictor_loss_fn(sampled_states, + sampled_actions, + sampled_next_states, + sampled_rewards, + sampled_non_terminals, + goals=sampled_goals, + rnn_states=sampled_rnn_states, + predictor=self.predictor, + weights_decay_lambda=self.kwargs['THER_weights_decay_lambda'], + use_PER=self.kwargs['THER_use_PER'], + PER_beta=beta, + importanceSamplingWeights=sampled_importanceSamplingWeights, + iteration_count=self.param_predictor_update_counter, + summary_writer=self.algorithm.summary_writer) + + loss = output_dict['loss'] + loss_per_item = output_dict['loss_per_item'] + + + loss.backward(retain_graph=False) + if self.kwargs['THER_gradient_clip'] > 1e-3: + nn.utils.clip_grad_norm_(self.predictor.parameters(), self.kwargs['THER_gradient_clip']) + self.predictor_optimizer.step() + + if self.kwargs['THER_use_PER']: + sampled_losses_per_item.append(loss_per_item) + + self.param_predictor_update_counter += 1 + + if self.kwargs['THER_use_PER']: + # losses corresponding to sampled batch indices: + sampled_losses_per_item = torch.cat(sampled_losses_per_item, dim=0).cpu().detach().numpy() + sampled_batch_indices = 
np.concatenate(sampled_batch_indices, axis=0) + # let us align the batch indices with the losses: + array_batch_indices = array_batch_indices[sampled_batch_indices] + # Now we can iterate through the losses and retrieve what + # storage and what batch index they were associated with: + for sloss, arr_bidx in zip(sampled_losses_per_item, array_batch_indices): + storage_idx = arr_bidx//minibatch_size + el_idx_in_batch = arr_bidx%minibatch_size + el_idx_in_storage = self.predictor_storages[storage_idx].tree_indices[el_idx_in_batch] + new_priority = self.predictor_storages[storage_idx].priority(sloss) + self.predictor_storages[storage_idx].update(idx=el_idx_in_storage, priority=new_priority) + + + def test_predictor(self, minibatch_size, samples): + beta = self.predictor_storages[0].beta if self.kwargs['THER_use_PER'] else 1.0 + + states = samples['s'] + actions = samples['a'] + next_states = samples['succ_s'] + rewards = samples['r'] + non_terminals = samples['non_terminal'] + goals = samples['g'] + + rnn_states = samples['rnn_states'] if 'rnn_states' in samples else None + + importanceSamplingWeights = samples['importanceSamplingWeights'] if 'importanceSamplingWeights' in samples else None + + # For each actor, there is one mini_batch update: + sampler = random_sample(np.arange(states.size(0)), minibatch_size) + list_batch_indices = [storage_idx*minibatch_size+np.arange(minibatch_size) \ + for storage_idx, storage in enumerate(self.predictor_storages)] + array_batch_indices = np.concatenate(list_batch_indices, axis=0) + sampled_batch_indices = [] + sampled_losses_per_item = [] + + running_acc = 0 + nbr_batches = 0 + for batch_indices in sampler: + nbr_batches += 1 + batch_indices = torch.from_numpy(batch_indices).long() + sampled_batch_indices.append(batch_indices) + + sampled_rnn_states = None + if self.recurrent: + sampled_rnn_states = _extract_rnn_states_from_batch_indices(rnn_states, batch_indices, use_cuda=self.kwargs['use_cuda']) + + sampled_importanceSamplingWeights = None + if self.kwargs['THER_use_PER']: + sampled_importanceSamplingWeights = importanceSamplingWeights[batch_indices].cuda() if self.kwargs['use_cuda'] else importanceSamplingWeights[batch_indices] + + sampled_states = states[batch_indices].cuda() if self.kwargs['use_cuda'] else states[batch_indices] + sampled_actions = actions[batch_indices].cuda() if self.kwargs['use_cuda'] else actions[batch_indices] + sampled_next_states = next_states[batch_indices].cuda() if self.kwargs['use_cuda'] else next_states[batch_indices] + sampled_rewards = rewards[batch_indices].cuda() if self.kwargs['use_cuda'] else rewards[batch_indices] + sampled_non_terminals = non_terminals[batch_indices].cuda() if self.kwargs['use_cuda'] else non_terminals[batch_indices] + sampled_goals = goals[batch_indices].cuda() if self.kwargs['use_cuda'] else goals[batch_indices] + + output_dict = self.predictor_loss_fn(sampled_states, + sampled_actions, + sampled_next_states, + sampled_rewards, + sampled_non_terminals, + goals=sampled_goals, + rnn_states=sampled_rnn_states, + predictor=self.predictor, + weights_decay_lambda=self.kwargs['THER_weights_decay_lambda'], + use_PER=self.kwargs['THER_use_PER'], + PER_beta=beta, + importanceSamplingWeights=sampled_importanceSamplingWeights, + iteration_count=self.param_predictor_update_counter, + summary_writer=self.algorithm.summary_writer) + + loss = output_dict['loss'] + loss_per_item = output_dict['loss_per_item'] + + accuracy = output_dict['accuracy'] + running_acc = running_acc + accuracy + + if 
self.kwargs['THER_use_PER']:
+                sampled_losses_per_item.append(loss_per_item)
+
+        if self.kwargs['THER_use_PER']:
+            # losses corresponding to sampled batch indices:
+            sampled_losses_per_item = torch.cat(sampled_losses_per_item, dim=0).cpu().detach().numpy()
+            sampled_batch_indices = np.concatenate(sampled_batch_indices, axis=0)
+            # let us align the batch indices with the losses:
+            array_batch_indices = array_batch_indices[sampled_batch_indices]
+            # Now we can iterate through the losses and retrieve what
+            # storage and what batch index they were associated with:
+            for sloss, arr_bidx in zip(sampled_losses_per_item, array_batch_indices):
+                storage_idx = arr_bidx//minibatch_size
+                el_idx_in_batch = arr_bidx%minibatch_size
+                el_idx_in_storage = self.predictor_storages[storage_idx].get_test_storage().tree_indices[el_idx_in_batch]
+                new_priority = self.predictor_storages[storage_idx].priority(sloss)
+                self.predictor_storages[storage_idx].update(idx=el_idx_in_storage, priority=new_priority, test=True)
+
+        running_acc = running_acc / nbr_batches
+        return running_acc
+
+    def clone(self):
+        return THERAlgorithmWrapper2(algorithm=self.algorithm.clone(),
+                                     predictor=self.predictor,
+                                     strategy=self.strategy,
+                                     goal_predicated_reward_fn=self.goal_predicated_reward_fn)
\ No newline at end of file
diff --git a/regym/rl_algorithms/networks/bodies.py b/regym/rl_algorithms/networks/bodies.py
index 83cca917..16c1f3fe 100644
--- a/regym/rl_algorithms/networks/bodies.py
+++ b/regym/rl_algorithms/networks/bodies.py
@@ -56,7 +56,18 @@ def reset_noisy_layer(module):
         module._reset_noise()
 class ConvolutionalBody(nn.Module):
-    def __init__(self, input_shape, feature_dim=256, channels=[3, 3], kernel_sizes=[1], strides=[1], paddings=[0], dropout=0.0, non_linearities=[nn.ReLU]):
+    def __init__(
+        self,
+        input_shape,
+        feature_dim=256,
+        channels=[3, 3],
+        kernel_sizes=[1],
+        strides=[1],
+        paddings=[0],
+        dropout=0.0,
+        non_linearities=[nn.ReLU],
+        extra_inputs_infos: Dict={},
+        ):
         '''
         Default input channels assume a RGB image (3 channels).
@@ -70,6 +81,8 @@ def __init__(self, input_shape, feature_dim=256, channels=[3, 3], kernel_sizes=[
         :param dropout: dropout probability to use.
         :param non_linearities: list of non-linear nn.Functional functions to use after each convolutional layer.
+
+        TODO: update calls to this constructor to use extra_inputs_infos if need be...
'''
         super(ConvolutionalBody, self).__init__()
         self.dropout = dropout
@@ -89,6 +102,13 @@ def __init__(self, input_shape, feature_dim=256, channels=[3, 3], kernel_sizes=[
         h_dim = input_shape[1]
         w_dim = input_shape[2]
         in_ch = channels[0]
+
+        for key in extra_inputs_infos:
+            shape = extra_inputs_infos[key]['shape']
+            assert isinstance(shape, list)
+            assert shape[1]==h_dim and shape[2]==w_dim
+            in_ch += shape[-1]
+
         for idx, (cfg, k, s, p) in enumerate(zip(channels[1:], kernel_sizes, strides, paddings)):
             if cfg == 'M':
                 layer = nn.MaxPool2d(kernel_size=k, stride=s)
@@ -144,6 +164,16 @@ def _compute_feat_map(self, x):
         return self.features(x)
     def forward(self, x, non_lin_output=True):
+        if isinstance(x, tuple):
+            x, frame_states = x[0], copy_hdict(x[1])
+            extra_inputs = extract_subtree(
+                in_dict=frame_states,
+                node_id='extra_inputs',
+            )
+
+            extra_inputs = [v[0].to(x.dtype).to(x.device) for v in extra_inputs.values()]
+            if len(extra_inputs): x = torch.cat([x]+extra_inputs, dim=-1)
+
         feat_map = self._compute_feat_map(x)
         # View -> Reshape
@@ -1402,6 +1432,7 @@ def __init__(
         gate=F.relu,
         dropout=0.0,
         add_non_lin_final_layer=False,
+        use_residual_connection=False,
         layer_init_fn=layer_init,
         extra_inputs_infos: Dict={},
         ):
@@ -1420,6 +1451,7 @@
         '''
         super(LinearLstmBody2, self).__init__()
         self.state_dim = state_dim
+        self.use_residual_connection = use_residual_connection
         # verify featureshape = feature_dim
         linear_input_dim = self.state_dim
@@ -1433,6 +1465,7 @@
         if self.linear_hidden_units is None:
             raise NotImplementedError
+            # DummyBody?
         else:
             if isinstance(self.linear_hidden_units, tuple):
                 self.linear_hidden_units = list(self.linear_hidden_units)
@@ -1457,6 +1490,8 @@
         self.linear_post_hidden_units = self.linear_post_hidden_units + [feature_dim]
         linear_post_input_dim = self.lstm_body.get_feature_shape()
+        if self.use_residual_connection: linear_post_input_dim += self.linear_body.get_feature_shape()
+
         self.linear_body_post = FCBody(
             state_dim=linear_post_input_dim,
             hidden_units=self.linear_post_hidden_units,
@@ -1495,6 +1530,9 @@ def forward(self, inputs):
         x, recurrent_neurons['lstm_body'] = self.lstm_body( (features, recurrent_neurons['lstm_body']))
+        if self.use_residual_connection:
+            x = torch.cat([features, x], dim=-1)
+
         if self.linear_post_hidden_units is not None:
             x = self.linear_body_post(x)
@@ -1507,10 +1545,17 @@ def get_input_shape(self):
         return self.state_dim
     def get_feature_shape(self):
+        fs = 0
+        if self.use_residual_connection:
+            fs += self.linear_body.get_feature_shape()
+
         if self.linear_post_hidden_units is None:
-            return self.lstm_body.get_feature_shape()
+            fs += self.lstm_body.get_feature_shape()
         else:
-            return self.linear_body_post.get_feature_shape()
+            fs += self.linear_body_post.get_feature_shape()
+
+        return fs
+
 class LSTMBody(nn.Module):
     def __init__(
diff --git a/regym/rl_algorithms/networks/heads.py b/regym/rl_algorithms/networks/heads.py
index 5c53e0c9..5143445b 100644
--- a/regym/rl_algorithms/networks/heads.py
+++ b/regym/rl_algorithms/networks/heads.py
@@ -81,6 +81,9 @@ def __init__(
         critic_input_shape = self.phi_body.get_feature_shape()
+        self.goal_oriented = False
+        """
+        # depr: goal update
         if self.goal_oriented:
             self.goal_state_flattening = False
             assert(goal_shape is not None)
@@ -93,7 +96,7 @@
             if not(self.goal_state_flattening):
                 critic_input_shape += self.goal_phi_body.get_feature_shape()
-
+        """
         if critic_body is None: critic_body = DummyBody(critic_input_shape)
         self.critic_body = critic_body
@@ -123,11
+126,14 @@ def reset_noise(self): def forward(self, obs, action=None, rnn_states=None, goal=None): batch_size = obs.shape[0] + """ + # depr : goal update if not(self.goal_oriented): assert(goal==None) if self.goal_oriented: if self.goal_state_flattening: obs = torch.cat([obs, goal], dim=1) + """ next_rnn_states = None if rnn_states is not None: @@ -138,15 +144,18 @@ def forward(self, obs, action=None, rnn_states=None, goal=None): else: phi = self.phi_body(obs) + """ + # depr: goal update gphi = None if self.goal_oriented and not(self.goal_state_flattening): + import ipdb; ipdb.set_trace() if rnn_states is not None and 'goal_phi_body' in rnn_states: gphi, next_rnn_states['goal_phi_body'] = self.goal_phi_body( (goal, rnn_states['goal_phi_body']) ) else: gphi = self.phi_body(goal) phi = torch.cat([phi, gphi], dim=1) - + """ if rnn_states is not None and 'critic_body' in rnn_states: phi_v, next_rnn_states['critic_body'] = self.critic_body( (phi, rnn_states['critic_body']) ) @@ -192,10 +201,14 @@ def forward(self, obs, action=None, rnn_states=None, goal=None): entropy = -torch.sum(probs*log_probs, dim=-1) # batch #x 1 + legal_probs = F.softmax( legal_qa, dim=-1 ) + legal_log_probs = torch.log(legal_probs+EPS) + prediction = { 'a': action, 'ent': entropy, - 'qa': qa + 'qa': qa, + 'log_a': legal_log_probs, } prediction.update({ @@ -282,10 +295,14 @@ def head_forward(phi, action=None, rnn_states=None, goal=None): entropy = -torch.sum(probs*log_probs, dim=-1) # batch #x 1 + legal_probs = F.softmax( legal_qa, dim=-1 ) + legal_log_probs = torch.log(legal_probs+EPS) + prediction = { 'a': action, 'ent': entropy, - 'qa': qa + 'qa': qa, + 'log_a': legal_log_probs, } prediction.update({ @@ -754,25 +771,60 @@ def forward(self, obs, action=None, rnn_states=None, goal=None): class ActorCriticNet(nn.Module): - def __init__(self, state_dim, action_dim, phi_body, actor_body, critic_body, use_intrinsic_critic=False, layer_init_fn=layer_init): + def __init__( + self, + state_dim, + action_dim, + phi_body, + actor_body, + critic_body, + use_intrinsic_critic=False, + extra_inputs_infos: Dict={}, + layer_init_fn=layer_init): + """ + :param extra_inputs_infos: Dictionnary containing the shape of the lstm-relevant extra inputs. 
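+        A hypothetical layout (illustrative names only):
+            extra_inputs_infos = {
+                'previous_reward': {'shape': [1], 'target_location': ['final_critic_layer', 'extra_inputs']},
+                'critic': ['previous_reward'],
+                'actor': [],
+            }
+        Each name listed under 'actor'/'critic' refers to a top-level entry whose 1-dimensional shape is
+        appended to the input size of the corresponding final fully-connected layer; at forward time, the
+        matching values are expected under rnn_states['final_actor_layer'/'final_critic_layer']['extra_inputs']
+        (cf. CategoricalActorCriticNet.forward).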
+ """ super(ActorCriticNet, self).__init__() + if phi_body is None: phi_body = DummyBody(state_dim) if actor_body is None: actor_body = DummyBody(phi_body.get_feature_shape()) if critic_body is None: critic_body = DummyBody(phi_body.get_feature_shape()) + self.phi_body = phi_body self.actor_body = actor_body self.critic_body = critic_body - self.fc_action = nn.Linear(actor_body.get_feature_shape(), action_dim) + + fc_critic_input_shape = self.critic_body.get_feature_shape() + fc_actor_input_shape = self.actor_body.get_feature_shape() + + if isinstance(fc_critic_input_shape, list): + fc_critic_input_shape = fc_critic_input_shape[-1] + if isinstance(fc_actor_input_shape, list): + fc_actor_input_shape = fc_actor_input_shape[-1] + + for key in extra_inputs_infos['critic']: + shape = extra_inputs_infos[key]['shape'] + assert len(shape) == 1 + fc_critic_input_shape += shape[-1] + for key in extra_inputs_infos['actor']: + shape = extra_inputs_infos[key]['shape'] + assert len(shape) == 1 + fc_actor_input_shape += shape[-1] + + #self.fc_action = nn.Linear(actor_body.get_feature_shape(), action_dim) + self.fc_action = nn.Linear(fc_actor_input_shape, action_dim) if layer_init_fn is not None: self.fc_action = layer_init_fn(self.fc_action, 1e-3) - self.fc_critic = nn.Linear(critic_body.get_feature_shape(), 1) + #self.fc_critic = nn.Linear(critic_body.get_feature_shape(), 1) + self.fc_critic = nn.Linear(fc_critic_input_shape, 1) if layer_init_fn is not None: self.fc_critic = layer_init_fn(self.fc_critic, 1e0) self.use_intrinsic_critic = use_intrinsic_critic self.fc_int_critic = None if self.use_intrinsic_critic: - self.fc_int_critic = nn.Linear(critic_body.get_feature_shape(), 1) + #self.fc_int_critic = nn.Linear(critic_body.get_feature_shape(), 1) + self.fc_int_critic = nn.Linear(fc_critic_input_shape, 1) if layer_init_fn is not None: self.fc_int_critic = layer_init_fn(self.fc_int_critic, 1e-3) @@ -782,6 +834,7 @@ def __init__(self, state_dim, action_dim, phi_body, actor_body, critic_body, use self.phi_params = list(self.phi_body.parameters()) print(self) + for name, param in self.named_parameters(): print(name, param.shape) @@ -849,54 +902,78 @@ def forward(self, obs, action=None, rnn_states=None): return prediction -class CategoricalActorCriticNet(nn.Module): - def __init__(self, - state_dim, - action_dim, - phi_body=None, - actor_body=None, - critic_body=None, - use_intrinsic_critic=False): - super(CategoricalActorCriticNet, self).__init__() - self.use_intrinsic_critic = use_intrinsic_critic - self.state_dim = state_dim - self.action_dim = action_dim - self.network = ActorCriticNet(state_dim, action_dim, phi_body, actor_body, critic_body,use_intrinsic_critic) +class CategoricalActorCriticNet(ActorCriticNet): + def __init__( + self, + state_dim, + action_dim, + phi_body=None, + actor_body=None, + critic_body=None, + use_intrinsic_critic=False, + extra_inputs_infos: Dict={}): + """ + :param extra_inputs_infos: Dictionnary containing the shape of the lstm-relevant extra inputs. 
+ """ + + super(CategoricalActorCriticNet, self).__init__( + state_dim=state_dim, + action_dim=action_dim, + phi_body=phi_body, + actor_body=actor_body, + critic_body=critic_body, + use_intrinsic_critic=use_intrinsic_critic, + extra_inputs_infos=extra_inputs_infos, + ) def forward(self, obs, action=None, rnn_states=None): global EPS + batch_size = obs.shape[0] + next_rnn_states = None if rnn_states is not None: next_rnn_states = {k: None for k in rnn_states} if rnn_states is not None and 'phi_body' in rnn_states: - phi, next_rnn_states['phi_body'] = self.network.phi_body( (obs, rnn_states['phi_body']) ) + phi, next_rnn_states['phi_body'] = self.phi_body( (obs, rnn_states['phi_body']) ) else: - phi = self.network.phi_body(obs) + phi = self.phi_body(obs) if rnn_states is not None and 'actor_body' in rnn_states: - phi_a, next_rnn_states['actor_body'] = self.network.actor_body( (phi, rnn_states['actor_body']) ) + phi_a, next_rnn_states['actor_body'] = self.actor_body( (phi, rnn_states['actor_body']) ) else: - phi_a = self.network.actor_body(phi) + phi_a = self.actor_body(phi) + if 'final_actor_layer' in rnn_states: + extra_inputs = extract_subtree( + in_dict=rnn_states['final_actor_layer'], + node_id='extra_inputs', + ) + + extra_inputs = [v[0].to(phi_a.dtype).to(phi_a.device) for v in extra_inputs.values()] + if len(extra_inputs): phi_a = torch.cat([phi_a]+extra_inputs, dim=-1) + if rnn_states is not None and 'critic_body' in rnn_states: - phi_v, next_rnn_states['critic_body'] = self.network.critic_body( (phi, rnn_states['critic_body']) ) + phi_v, next_rnn_states['critic_body'] = self.critic_body( (phi, rnn_states['critic_body']) ) else: - phi_v = self.network.critic_body(phi) + phi_v = self.critic_body(phi) - logits = self.network.fc_action(phi_a) - probs = F.softmax( logits, dim=-1 ) - #https://github.com/pytorch/pytorch/issues/7014 - #probs = torch.clamp(probs, -1e10, 1e10) + if 'final_critic_layer' in rnn_states: + extra_inputs = extract_subtree( + in_dict=rnn_states['final_critic_layer'], + node_id='extra_inputs', + ) + + extra_inputs = [v[0].to(phi_v.dtype).to(phi_v.device) for v in extra_inputs.values()] + if len(extra_inputs): phi_v = torch.cat([phi_v]+extra_inputs, dim=-1) + # batch x action_dim - v = self.network.fc_critic(phi_v) + v = self.fc_critic(phi_v) if self.use_intrinsic_critic: - int_v = self.network.fc_int_critic(phi_v) + int_v = self.fc_int_critic(phi_v) # batch x 1 - batch_size = logits.size(0) - ''' # RND1 # probs: @@ -914,16 +991,34 @@ def forward(self, obs, action=None, rnn_states=None): ''' ''' # NORMAL: + logits = self.fc_action(phi_a) + probs = F.softmax( logits, dim=-1 ) + #https://github.com/pytorch/pytorch/issues/7014 + #probs = torch.clamp(probs, -1e10, 1e10) #log_probs = F.log_softmax(logits, dim=-1) log_probs = torch.log(probs+EPS) entropy = -torch.sum(probs*log_probs, dim=-1)#, keepdim=True) # batch #x 1 + legal_actions = torch.ones_like(logits) + if 'head' in rnn_states \ + and 'extra_inputs' in rnn_states['head'] \ + and 'legal_actions' in rnn_states['head']['extra_inputs']: + legal_actions = rnn_states['head']['extra_inputs']['legal_actions'][0] + next_rnn_states['head'] = rnn_states['head'] + legal_actions = legal_actions.to(logits.device) + + # The following accounts for player dimension if VDN: + legal_qa = (1+logits-logits.min(dim=-1, keepdim=True)[0]) * legal_actions + + greedy_action = legal_qa.max(dim=-1, keepdim=True)[1] if action is None: #action = (probs+EPS).multinomial(num_samples=1).squeeze(1) - action = torch.multinomial( probs, 
num_samples=1).squeeze(1) + #action = torch.multinomial( probs, num_samples=1).squeeze(1) + action = torch.multinomial(legal_qa.softmax(dim=-1), num_samples=1)#.reshape((batch_size,)) # batch #x 1 - log_probs = log_probs.gather(1, action.unsqueeze(1)).squeeze(1) + #log_probs = log_probs.gather(1, action.unsqueeze(1)).squeeze(1) + log_probs = log_probs.gather(1, action).squeeze(1) # batch #x 1 ''' ''' @@ -947,17 +1042,22 @@ def forward(self, obs, action=None, rnn_states=None): # batch #x 1 ''' - prediction = {'a': action, - 'log_pi_a': log_probs, - 'action_logits': logits, - 'ent': entropy, - 'v': v} + prediction = { + 'a': action, + 'greedy_action': greedy_action, + 'log_pi_a': log_probs, + 'action_logits': logits, + 'ent': entropy, + 'v': v + } if self.use_intrinsic_critic: prediction['int_v'] = int_v - prediction.update({'rnn_states': rnn_states, - 'next_rnn_states': next_rnn_states}) + prediction.update({ + 'rnn_states': rnn_states, + 'next_rnn_states': next_rnn_states} + ) return prediction diff --git a/regym/rl_algorithms/replay_buffers/storage.py b/regym/rl_algorithms/replay_buffers/storage.py index 652db6d9..41650ffb 100755 --- a/regym/rl_algorithms/replay_buffers/storage.py +++ b/regym/rl_algorithms/replay_buffers/storage.py @@ -2,8 +2,8 @@ class Storage: def __init__(self, keys=None): if keys is None: keys = [] - keys = keys + ['s', 'a', 'r', 'succ_s', 'non_terminal', - 'v', 'q', 'pi', 'log_pi', 'ent', + keys = keys + ['s', 'a', 'r', 'succ_s', 'non_terminal', 'info', + 'v', 'q', 'pi', 'log_pi', 'ent', 'greedy_action', 'adv', 'ret', 'qa', 'log_pi_a', 'mean', 'action_logits'] self.keys = keys diff --git a/regym/rl_algorithms/utils.py b/regym/rl_algorithms/utils.py index 1c96be1a..6b069859 100644 --- a/regym/rl_algorithms/utils.py +++ b/regym/rl_algorithms/utils.py @@ -5,7 +5,7 @@ def is_leaf(node: Dict): - return all([ not isinstance(node[key], dict) for key in node.keys()]) + return any([ not isinstance(node[key], dict) for key in node.keys()]) def recursive_inplace_update( @@ -73,11 +73,22 @@ def copy_hdict(in_dict: Dict): Makes a copy of :param in_dict:. ''' if in_dict is None: return None + out_dict = {key: {} for key in in_dict} + need_reg = False + if isinstance(in_dict, list): + out_dict = {'dummy':{}} + in_dict = {'dummy':in_dict} + need_reg = True + recursive_inplace_update( in_dict=out_dict, extra_dict=in_dict, ) + + if need_reg: + out_dict = out_dict['dummy'] + return out_dict def extract_subtree(in_dict: Dict, @@ -229,30 +240,35 @@ def _concatenate_list_hdict( out_queue.insert(0, out_pointer[k]) else: for k in pointers[0]: - # Previously assigned as a dictionnary in 145 or 165... - out_pointer[k] = [] - # Since we are at a leaf then value is - # either numpy or numpy.float64 - # or list of tensors: - if isinstance(pointers[0][k], list): - for idx in range(len(pointers[0][k])): + try: + # Previously assigned as a dictionnary in 145 or 165... 
+ out_pointer[k] = [] + # Since we are at a leaf then value is + # either numpy or numpy.float64 + # or list of tensors: + if isinstance(pointers[0][k], list): + for idx in range(len(pointers[0][k])): + concat_list = [ + preprocess_fn(pointer[k][idx]) + for pointer in pointers if k in pointer + ] + out_pointer[k].append( + concat_fn(concat_list) + ) + else: concat_list = [ - preprocess_fn(pointer[k][idx]) + preprocess_fn(pointer[k]) for pointer in pointers if k in pointer ] - out_pointer[k].append( - concat_fn(concat_list) - ) - else: - concat_list = [ - preprocess_fn(pointer[k]) - for pointer in pointers if k in pointer - ] - try: - out_pointer[k] = concat_fn(concat_list) - except Exception as e: + try: + out_pointer[k] = concat_fn(concat_list) + except Exception as e: + # the concat_fn may fail, silently... + # e.g.: shape of elements are not all the same... + pass + except Exception as e: # the concat_fn may fail, silently... - # e.g.: shape of elements are not all the same... + # e.g.: neither a list nor a compatible stuff.... pass return out_hd diff --git a/regym/rl_loops/multiagent_loops/marl_loop.py b/regym/rl_loops/multiagent_loops/marl_loop.py index 1d515bf3..28f8a390 100644 --- a/regym/rl_loops/multiagent_loops/marl_loop.py +++ b/regym/rl_loops/multiagent_loops/marl_loop.py @@ -1,3 +1,4 @@ +from typing import Dict, Any, Optional, List import os import math import copy @@ -32,11 +33,20 @@ def interaction(self, *args, **kwargs): #forkedPdb = ForkedPdb() -def run_episode_parallel(env, - agents, - training, - max_episode_length=1e30, - env_configs=None): +def run_episode_parallel( + env, + agents, + training, + max_episode_length=1e30, + env_configs=None, + save_traj=False, + render_mode="rgb_array", + obs_key="observations", + succ_obs_key="succ_observations", + reward_key="reward", + done_key="done", + info_key="info", + succ_info_key="succ_info"): ''' Runs a single multi-agent rl loop until termination. The observations vector is of length n, where n is the number of agents @@ -50,8 +60,11 @@ def run_episode_parallel(env, N.B.: only care about agent 0's trajectory. 
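+    :param save_traj: if True, rendered frames (env.render(render_mode, env_indices=[...])) are stored
+                      in place of the raw observations in agent 0's trajectory.
+    :param obs_key, succ_obs_key, reward_key, done_key, info_key, succ_info_key: names of the entries
+                      under which the vectorised environment's reset/step output dictionaries are
+                      assumed to expose observations, successor observations, rewards, dones and infos.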
''' - observations, info = env.reset(env_configs=env_configs) - + #observations, info = env.reset(env_configs=env_configs) + env_reset_output_dict = env.reset(env_configs=env_configs) + observations = env_reset_output_dict[obs_key] + info = env_reset_output_dict[info_key] + nbr_actors = env.get_nbr_envs() for agent in agents: @@ -72,7 +85,12 @@ def run_episode_parallel(env, ) for agent_idx, agent in enumerate(agents) ] - succ_observations, reward, done, succ_info = env.step(actions, only_progress_non_terminated=True) + #succ_observations, reward, done, succ_info = env.step(actions, only_progress_non_terminated=True) + env_output_dict = env.step(actions, only_progress_non_terminated=True) + succ_observations = env_output_dict[succ_obs_key] + reward = env_output_dict[reward_key] + done = env_output_dict[done_key] + succ_info = env_output_dict[succ_info_key] if training: for agent_idx, agent in enumerate(agents): @@ -90,7 +108,9 @@ def run_episode_parallel(env, for actor_index in range(nbr_actors): if previous_done[actor_index]: continue - batch_index +=1 + #batch_index +=1 + # since `only_progress_non_terminated=True`: + batch_index = actor_index # Bookkeeping of the actors whose episode just ended: d = done[actor_index] @@ -103,6 +123,9 @@ def run_episode_parallel(env, # Only care about agent 0's trajectory: pa_obs = observations[0][batch_index] + pa_info = info[0][batch_index] + if save_traj: + pa_obs = env.render(render_mode, env_indices=[batch_index])[0] pa_a = actions[0][batch_index] pa_r = reward[0][batch_index] pa_succ_obs = succ_observations[0][batch_index] @@ -115,7 +138,8 @@ def run_episode_parallel(env, if callable(get_intrinsic_reward): pa_int_r = agent.get_intrinsic_reward(actor_index) """ - per_actor_trajectories[actor_index].append( (pa_obs, pa_a, pa_r, pa_int_r, pa_succ_obs, pa_done) ) + if not previous_done[actor_index]: + per_actor_trajectories[actor_index].append( (pa_obs, pa_a, pa_r, pa_int_r, pa_succ_obs, pa_done, pa_info) ) observations = copy.deepcopy(succ_observations) info = copy.deepcopy(succ_info) @@ -148,6 +172,7 @@ def run_episode_parallel(env, return per_actor_trajectories + def test_agent( env, agents, @@ -158,19 +183,54 @@ def test_agent( base_path, nbr_save_traj=1, save_traj=False, - save_traj_length_divider=1): + render_mode="rgb_array", + save_traj_length_divider=1, + obs_key="observations", + succ_obs_key="succ_observations", + reward_key="reward", + done_key="done", + info_key="info", + succ_info_key="succ_info", + requested_metrics: List[str] = [] + ) -> Optional[Dict]: + ''' + Available metrics to be requested: + - 'total_return', + - 'mean_total_return', + - 'std_ext_return', + - 'total_int_return', + - 'mean_total_int_return', + - 'std_int_return', + - 'episode_lengths', + - 'mean_episode_length', + - 'episode_lengths', + :returns: Dictionary containing values specified in :param: requested_metrics + ''' max_episode_length = 1e4 env.set_nbr_envs(nbr_episode) - trajectory = run_episode_parallel(env, - agents, - training=False, - max_episode_length=max_episode_length, - env_configs=None) + trajectory = run_episode_parallel( + env, + agents, + training=False, + max_episode_length=max_episode_length, + env_configs=None, + save_traj=save_traj, + render_mode=render_mode, + obs_key=obs_key, + succ_obs_key=succ_obs_key, + reward_key=reward_key, + done_key=done_key, + info_key=info_key, + succ_info_key=succ_info_key, + ) total_return = [ sum([ exp[2] for exp in t]) for t in trajectory] + positive_total_return = [ sum([ exp[2] if exp[2]>0 else 0.0 for exp in 
t]) for t in trajectory] mean_total_return = sum( total_return) / len(trajectory) std_ext_return = math.sqrt( sum( [math.pow( r-mean_total_return ,2) for r in total_return]) / len(total_return) ) + mean_positive_total_return = sum( positive_total_return) / len(trajectory) + std_ext_positive_return = math.sqrt( sum( [math.pow( r-mean_positive_total_return ,2) for r in positive_total_return]) / len(positive_total_return) ) total_int_return = [ sum([ exp[3] for exp in t]) for t in trajectory] mean_total_int_return = sum( total_int_return) / len(trajectory) @@ -179,27 +239,40 @@ def test_agent( #update_count = agent.get_update_count() if sum_writer is not None: - for idx, (ext_ret, int_ret) in enumerate(zip(total_return, total_int_return)): + for idx, (ext_ret, ext_pos_ret, int_ret) in enumerate(zip(total_return, positive_total_return, total_int_return)): sum_writer.add_scalar('PerObservation/Testing/TotalReturn', ext_ret, iteration*len(trajectory)+idx) + sum_writer.add_scalar('PerObservation/Testing/PositiveTotalReturn', ext_pos_ret, iteration*len(trajectory)+idx) sum_writer.add_scalar('PerObservation/Testing/TotalIntReturn', int_ret, iteration*len(trajectory)+idx) sum_writer.add_scalar('PerUpdate/Testing/TotalReturn', ext_ret, update_count) + sum_writer.add_scalar('PerUpdate/Testing/PositiveTotalReturn', ext_pos_ret, update_count) sum_writer.add_scalar('PerUpdate/Testing/TotalIntReturn', int_ret, update_count) sum_writer.add_scalar('PerObservation/Testing/StdIntReturn', std_int_return, iteration) sum_writer.add_scalar('PerObservation/Testing/StdExtReturn', std_ext_return, iteration) + sum_writer.add_scalar('PerObservation/Testing/StdExtPosReturn', std_ext_positive_return, iteration) sum_writer.add_scalar('PerUpdate/Testing/StdIntReturn', std_int_return, update_count) sum_writer.add_scalar('PerUpdate/Testing/StdExtReturn', std_ext_return, update_count) + sum_writer.add_scalar('PerUpdate/Testing/StdExtPosReturn', std_ext_positive_return, update_count) episode_lengths = [ len(t) for t in trajectory] mean_episode_length = sum( episode_lengths) / len(trajectory) std_episode_length = math.sqrt( sum( [math.pow( l-mean_episode_length ,2) for l in episode_lengths]) / len(trajectory) ) + trajectory_metrics = populate_metrics_dictionary( + total_return, mean_total_return, std_ext_return, + total_int_return, mean_total_int_return, std_int_return, + episode_lengths, mean_episode_length, std_episode_length, + requested_metrics + ) + if sum_writer is not None: sum_writer.add_scalar('PerObservation/Testing/MeanTotalReturn', mean_total_return, iteration) + sum_writer.add_scalar('PerObservation/Testing/MeanPositiveTotalReturn', mean_positive_total_return, iteration) sum_writer.add_scalar('PerObservation/Testing/MeanTotalIntReturn', mean_total_int_return, iteration) sum_writer.add_scalar('PerUpdate/Testing/MeanTotalReturn', mean_total_return, update_count) + sum_writer.add_scalar('PerUpdate/Testing/MeanPositiveTotalReturn', mean_positive_total_return, update_count) sum_writer.add_scalar('PerUpdate/Testing/MeanTotalIntReturn', mean_total_int_return, update_count) sum_writer.add_scalar('PerObservation/Testing/MeanEpisodeLength', mean_episode_length, iteration) @@ -224,6 +297,35 @@ def test_agent( end = time.time() eta = end-begin print(f'{actor_idx+1} / {nbr_save_traj} :: Time: {eta} sec.') + return trajectory_metrics + + +def populate_metrics_dictionary(total_return, mean_total_return, std_ext_return, + total_int_return, mean_total_int_return, std_int_return, + episode_lengths, mean_episode_length, 
std_episode_length, + requested_metrics: List[str] = []) -> Dict[str, Any]: + trajectory_metrics = {} + if 'total_return' in requested_metrics: + trajectory_metrics['total_return'] = total_return + if 'mean_total_return' in requested_metrics: + trajectory_metrics['mean_total_return'] = mean_total_return + if 'std_ext_return' in requested_metrics: + trajectory_metrics['std_ext_return'] = std_ext_return + + if 'total_int_return' in requested_metrics: + trajectory_metrics['total_int_return'] = total_int_return + if 'mean_total_int_return' in requested_metrics: + trajectory_metrics['mean_total_int_return'] = mean_total_int_return + if 'std_int_return' in requested_metrics: + trajectory_metrics['std_int_return'] = std_int_return + + if 'episode_lengths' in requested_metrics: + trajectory_metrics['episode_lengths'] = episode_lengths + if 'mean_episode_length' in requested_metrics: + trajectory_metrics['mean_episode_length'] = mean_episode_length + if 'episode_lengths' in requested_metrics: + trajectory_metrics['std_episode_length'] = std_episode_length + return trajectory_metrics def async_gather_experience_parallel( @@ -424,10 +526,18 @@ def gather_experience_parallel( base_path='./', benchmarking_record_episode_interval=None, save_traj_length_divider=1, + render_mode="rgb_array", step_hooks=[], sad=False, vdn=False, + otherplay=False, nbr_players=2, + obs_key="observations", + succ_obs_key="succ_observations", + reward_key="reward", + done_key="done", + info_key="info", + succ_info_key="succ_info", ): ''' Runs a self-play multi-agent rl loop until the number of observation, `max_obs_count`, is reached. @@ -451,33 +561,36 @@ def gather_experience_parallel( env = task.env if sad: - env = SADEnvWrapper(env, nbr_actions=task.action_dim) + env = SADEnvWrapper(env, nbr_actions=task.action_dim, otherplay=otherplay) if vdn: env = VDNVecEnvWrapper(env, nbr_players=nbr_players) test_env = task.test_env if sad: - test_env = SADEnvWrapper(test_env, nbr_actions=task.action_dim) + test_env = SADEnvWrapper(test_env, nbr_actions=task.action_dim, otherplay=otherplay) if vdn: test_env = VDNVecEnvWrapper(test_env, nbr_players=nbr_players) - observations, info = env.reset(env_configs=env_configs) - + #observations, info = env.reset(env_configs=env_configs) + env_reset_output_dict = env.reset(env_configs=env_configs) + observations = env_reset_output_dict[obs_key] + info = env_reset_output_dict[info_key] + nbr_actors = env.get_nbr_envs() - """ for agent in agents: agent.set_nbr_actor(nbr_actors) - """ done = [False]*nbr_actors per_actor_trajectories = [list() for i in range(nbr_actors)] trajectories = list() total_returns = list() + positive_total_returns = list() total_int_returns = list() episode_lengths = list() obs_count = agents[0].get_experience_count() if hasattr(agents[0], "get_experience_count") else 0 episode_count = 0 + episode_count_record = 0 sample_episode_count = 0 pbar = tqdm(total=max_obs_count, position=0) @@ -490,7 +603,9 @@ def gather_experience_parallel( if agent.training: agent.algorithm.summary_writer = sum_writer else: - agent.algorithm.summary_writer = None + algo = getattr(agent, "algorithm", None) + if algo is not None: + agent.algorithm.summary_writer = None while True: actions = [ @@ -501,7 +616,12 @@ def gather_experience_parallel( for agent_idx, agent in enumerate(agents) ] - succ_observations, reward, done, succ_info = env.step(actions) + #succ_observations, reward, done, succ_info = env.step(actions) + env_output_dict = env.step(actions) + succ_observations = 
env_output_dict[succ_obs_key]
+        reward = env_output_dict[reward_key]
+        done = env_output_dict[done_key]
+        succ_info = env_output_dict[succ_info_key]
         if training:
             for agent_idx, agent in enumerate(agents):
@@ -530,18 +650,43 @@
                 for agent in agents:
                     hook(env, agent, obs_count)
+            # Only care about agent 0's trajectory:
+            pa_obs = observations[0][actor_index]
+            pa_a = actions[0][actor_index]
+            pa_r = reward[0][actor_index]
+            pa_succ_obs = succ_observations[0][actor_index]
+            pa_done = done[actor_index]
+            pa_int_r = 0.0
+
+            """
+            if getattr(agent.algorithm, "use_rnd", False):
+                get_intrinsic_reward = getattr(agent, "get_intrinsic_reward", None)
+                if callable(get_intrinsic_reward):
+                    pa_int_r = agent.get_intrinsic_reward(actor_index)
+            """
+            per_actor_trajectories[actor_index].append( (pa_obs, pa_a, pa_r, pa_int_r, pa_succ_obs, pa_done) )
+
+
+            #////////////////////////////////////////////////////////////////////////////////////////
            # Bookkeeping of the actors whose episode just ended:
+            #////////////////////////////////////////////////////////////////////////////////////////
            done_condition = ('real_done' in succ_info[0][actor_index] and succ_info[0][actor_index]['real_done']) or ('real_done' not in succ_info[0][actor_index] and done[actor_index])
            if done_condition:
                update_count = agents[0].get_update_count()
                episode_count += 1
-                succ_observations, succ_info = env.reset(env_configs=env_configs, env_indices=[actor_index])
+                episode_count_record += 1
+                #succ_observations, succ_info = env.reset(env_configs=env_configs, env_indices=[actor_index])
+                env_reset_output_dict = env.reset(env_configs=env_configs, env_indices=[actor_index])
+                succ_observations = env_reset_output_dict[obs_key]
+                succ_info = env_reset_output_dict[info_key]
+
                for agent_idx, agent in enumerate(agents):
                    agent.reset_actors(indices=[actor_index])
                # Logging:
                trajectories.append(per_actor_trajectories[actor_index])
                total_returns.append(sum([ exp[2] for exp in trajectories[-1]]))
+                positive_total_returns.append(sum([ exp[2] if exp[2]>0 else 0.0 for exp in trajectories[-1]]))
                total_int_returns.append(sum([ exp[3] for exp in trajectories[-1]]))
                episode_lengths.append(len(trajectories[-1]))
@@ -549,17 +694,25 @@
                    sum_writer.add_scalar('Training/TotalReturn', total_returns[-1], episode_count)
                    sum_writer.add_scalar('PerObservation/TotalReturn', total_returns[-1], obs_count)
                    sum_writer.add_scalar('PerUpdate/TotalReturn', total_returns[-1], update_count)
+
+                    sum_writer.add_scalar('Training/PositiveTotalReturn', positive_total_returns[-1], episode_count)
+                    sum_writer.add_scalar('PerObservation/PositiveTotalReturn', positive_total_returns[-1], obs_count)
+                    sum_writer.add_scalar('PerUpdate/PositiveTotalReturn', positive_total_returns[-1], update_count)
+
                    if actor_index == 0:
                        sample_episode_count += 1
                    #sum_writer.add_scalar(f'data/reward_{actor_index}', total_returns[-1], sample_episode_count)
-                    sum_writer.add_scalar(f'PerObservation/Actor_{actor_index}_Reward', total_returns[-1], obs_count)
+                    #sum_writer.add_scalar(f'PerObservation/Actor_{actor_index}_Reward', total_returns[-1], obs_count)
+                    #sum_writer.add_scalar(f'PerObservation/Actor_{actor_index}_PositiveReward', positive_total_returns[-1], obs_count)
                    #sum_writer.add_scalar(f'PerUpdate/Actor_{actor_index}_Reward', total_returns[-1], update_count)
-                    sum_writer.add_scalar('Training/TotalIntReturn', total_int_returns[-1],
+                    #sum_writer.add_scalar('Training/TotalIntReturn', total_int_returns[-1],
episode_count) sum_writer.flush() if len(trajectories) >= nbr_actors: mean_total_return = sum( total_returns) / len(trajectories) std_ext_return = math.sqrt( sum( [math.pow( r-mean_total_return ,2) for r in total_returns]) / len(total_returns) ) + mean_positive_total_return = sum( positive_total_returns) / len(trajectories) + std_ext_positive_return = math.sqrt( sum( [math.pow( r-mean_positive_total_return ,2) for r in positive_total_returns]) / len(positive_total_returns) ) mean_total_int_return = sum( total_int_returns) / len(trajectories) std_int_return = math.sqrt( sum( [math.pow( r-mean_total_int_return ,2) for r in total_int_returns]) / len(total_int_returns) ) mean_episode_length = sum( episode_lengths) / len(trajectories) @@ -572,6 +725,9 @@ def gather_experience_parallel( sum_writer.add_scalar('Training/MeanTotalReturn', mean_total_return, episode_count // nbr_actors) sum_writer.add_scalar('PerObservation/MeanTotalReturn', mean_total_return, obs_count) sum_writer.add_scalar('PerUpdate/MeanTotalReturn', mean_total_return, update_count) + sum_writer.add_scalar('Training/MeanPositiveTotalReturn', mean_positive_total_return, episode_count // nbr_actors) + sum_writer.add_scalar('PerObservation/MeanPositiveTotalReturn', mean_positive_total_return, obs_count) + sum_writer.add_scalar('PerUpdate/MeanPositiveTotalReturn', mean_positive_total_return, update_count) sum_writer.add_scalar('Training/MeanTotalIntReturn', mean_total_int_return, episode_count // nbr_actors) sum_writer.add_scalar('Training/MeanEpisodeLength', mean_episode_length, episode_count // nbr_actors) @@ -585,32 +741,23 @@ def gather_experience_parallel( # reset : trajectories = list() total_returns = list() + positive_total_returns = list() total_int_returns = list() episode_lengths = list() per_actor_trajectories[actor_index] = list() - # Only care about agent 0's trajectory: - pa_obs = observations[0][actor_index] - pa_a = actions[0][actor_index] - pa_r = reward[0][actor_index] - pa_succ_obs = succ_observations[0][actor_index] - pa_done = done[actor_index] - pa_int_r = 0.0 - - """ - if getattr(agent.algorithm, "use_rnd", False): - get_intrinsic_reward = getattr(agent, "get_intrinsic_reward", None) - if callable(get_intrinsic_reward): - pa_int_r = agent.get_intrinsic_reward(actor_index) - """ - per_actor_trajectories[actor_index].append( (pa_obs, pa_a, pa_r, pa_int_r, pa_succ_obs, pa_done) ) - + #//////////////////////////////////////////////////////////////////////////////////////// + #//////////////////////////////////////////////////////////////////////////////////////// if test_nbr_episode != 0 and obs_count % test_obs_interval == 0: save_traj = False if (benchmarking_record_episode_interval is not None and benchmarking_record_episode_interval>0): - save_traj = (obs_count%benchmarking_record_episode_interval==0) + #save_traj = (obs_count%benchmarking_record_episode_interval==0) + save_traj = (episode_count_record // nbr_actors > benchmarking_record_episode_interval) + if save_traj: + episode_count_record = 0 + # TECHNICAL DEBT: clone_agent.get_update_count is failing because the update count param is None # haven't figured out why is the cloning function making it None... 
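+                    # Note on the recording gate above (sketch with simplified names):
+                    # a trajectory is now saved once every `benchmarking_record_episode_interval`
+                    # completed passes over the vectorised actors, i.e. roughly
+                    #     save_traj = (episode_count_record // nbr_actors) > benchmarking_record_episode_interval
+                    #     if save_traj: episode_count_record = 0
+                    # instead of the previous `obs_count % benchmarking_record_episode_interval == 0` test.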
test_agent( @@ -622,6 +769,7 @@ def gather_experience_parallel( iteration=obs_count, base_path=base_path, save_traj=save_traj, + render_mode=render_mode, save_traj_length_divider=save_traj_length_divider ) @@ -639,4 +787,4 @@ def gather_experience_parallel( return agent -gather_experience_parallel_ray = ray.remote(gather_experience_parallel) \ No newline at end of file +gather_experience_parallel_ray = ray.remote(gather_experience_parallel) diff --git a/regym/rl_loops/singleagent_loops/rl_loop.py b/regym/rl_loops/singleagent_loops/rl_loop.py index 1fb1d2eb..9ab73589 100755 --- a/regym/rl_loops/singleagent_loops/rl_loop.py +++ b/regym/rl_loops/singleagent_loops/rl_loop.py @@ -55,11 +55,20 @@ def run_episode(env, agent, training, max_episode_length=math.inf): return trajectory -def run_episode_parallel(env, - agent, - training, - max_episode_length=1e30, - env_configs=None): +def run_episode_parallel( + env, + agent, + training, + max_episode_length=1e30, + env_configs=None, + save_traj=False, + render_mode="rgb_array", + obs_key="observations", + succ_obs_key="succ_observations", + reward_key="reward", + done_key="done", + info_key="info", + succ_info_key="succ_info"): ''' Runs a single multi-agent rl loop until termination. The observations vector is of length n, where n is the number of agents @@ -71,8 +80,11 @@ def run_episode_parallel(env, :param env_configs: configuration dictionnary to use when resetting the environments. :returns: Trajectory (o,a,r,o') ''' - observations, info = env.reset(env_configs=env_configs) - + #observations, info = env.reset(env_configs=env_configs) + env_reset_output_dict = env.reset(env_configs=env_configs) + observations = env_reset_output_dict[obs_key] + info = env_reset_output_dict[info_key] + nbr_actors = env.get_nbr_envs() agent.set_nbr_actor(nbr_actors) agent.reset_actors() @@ -83,12 +95,18 @@ def run_episode_parallel(env, #generator = tqdm(range(int(max_episode_length))) if max_episode_length != math.inf else range(int(1e20)) #for step in generator: for step in range(int(max_episode_length)): + realdone = [] action = agent.take_action( state=observations, infos=info ) - succ_observations, reward, done, succ_info = env.step(action, only_progress_non_terminated=True) + #succ_observations, reward, done, succ_info = env.step(action, only_progress_non_terminated=True) + env_output_dict = env.step(actions, only_progress_non_terminated=True) + succ_observations = env_output_dict[succ_obs_key] + reward = env_output_dict[reward_key] + done = env_output_dict[done_key] + succ_info = env_output_dict[succ_info_key] if training: agent.handle_experience( @@ -105,27 +123,37 @@ def run_episode_parallel(env, for actor_index in range(nbr_actors): if previous_done[actor_index]: continue - batch_index +=1 + #batch_index +=1 + # since `only_progress_non_terminated=True`: + batch_index = actor_index # Bookkeeping of the actors whose episode just ended: d = done[actor_index] if ('real_done' in succ_info[actor_index]): d = succ_info[actor_index]['real_done'] + realdone.append(d) if d and not(previous_done[actor_index]): batch_idx_done_actors_among_not_done.append(batch_index) pa_obs = observations[batch_index] + pa_info = info[0][batch_index] + if save_traj: + pa_obs = env.render(render_mode, env_indices=[batch_index])[0] + pa_a = action[batch_index] pa_r = reward[batch_index] pa_succ_obs = succ_observations[batch_index] pa_done = done[actor_index] pa_int_r = 0.0 + if getattr(agent.algorithm, "use_rnd", False): get_intrinsic_reward = getattr(agent, "get_intrinsic_reward", 
None) if callable(get_intrinsic_reward): pa_int_r = agent.get_intrinsic_reward(actor_index) - per_actor_trajectories[actor_index].append( (pa_obs, pa_a, pa_r, pa_int_r, pa_succ_obs, pa_done) ) + + if not previous_done[actor_index]: + per_actor_trajectories[actor_index].append( (pa_obs, pa_a, pa_r, pa_int_r, pa_succ_obs, pa_done, pa_info) ) observations = copy.deepcopy(succ_observations) info = copy.deepcopy(succ_info) @@ -147,22 +175,50 @@ def run_episode_parallel(env, if info[idx] is None: del info[idx] """ - if len(info): - allrealdone = all([i['real_done'] if 'real_done' in i else False for i in info]) + if len(realdone): + allrealdone = all(realdone) if alldone or allrealdone: break return per_actor_trajectories -def test_agent(env, agent, update_count, nbr_episode, sum_writer, iteration, base_path, nbr_save_traj=1, save_traj=False): +def test_agent( + env, + agent, + update_count, + nbr_episode, + sum_writer, + iteration, + base_path, + nbr_save_traj=1, + save_traj=False, + render_mode="rgb_array", + save_traj_length_divider=1, + obs_key="observations", + succ_obs_key="succ_observations", + reward_key="reward", + done_key="done", + info_key="info", + succ_info_key="succ_info", + ): max_episode_length = 1e4 env.set_nbr_envs(nbr_episode) - trajectory = run_episode_parallel(env, - agent, - training=False, - max_episode_length=max_episode_length, - env_configs=None) + trajectory = run_episode_parallel( + env, + agent, + training=False, + max_episode_length=max_episode_length, + env_configs=None, + save_traj=save_traj, + render_mode=render_mode, + obs_key=obs_key, + succ_obs_key=succ_obs_key, + reward_key=reward_key, + done_key=done_key, + info_key=info_key, + succ_info_key=succ_info_key, + ) total_return = [ sum([ exp[2] for exp in t]) for t in trajectory] mean_total_return = sum( total_return) / len(trajectory) @@ -209,7 +265,14 @@ def test_agent(env, agent, update_count, nbr_episode, sum_writer, iteration, bas gif_traj = [ exp[0] for exp in trajectory[actor_idx]] gif_data = [np.cumsum([ exp[2] for exp in trajectory[actor_idx]])] begin = time.time() - save_traj_with_graph(gif_traj, gif_data, episode=iteration, actor_idx=actor_idx, path=base_path) + save_traj_with_graph( + gif_traj, + gif_data, + divider=save_traj_length_divider, + episode=iteration, + actor_idx=actor_idx, + path=base_path + ) end = time.time() eta = end-begin print(f'{actor_idx+1} / {nbr_save_traj} :: Time: {eta} sec.') @@ -228,6 +291,7 @@ def async_gather_experience_parallel( base_path='./', benchmarking_record_episode_interval=None, step_hooks=[]): + raise NotImplementedError('needs to be updated with dictionnary output from vec_env..') ''' Runs a single multi-agent rl loop until the number of observation, `max_obs_count`, is reached. The observations vector is of length n, where n is the number of agents. @@ -309,6 +373,7 @@ def async_gather_experience_parallel1( base_path='./', benchmarking_record_episode_interval=None, step_hooks=[]): + raise NotImplementedError('needs to be updated with dictionnary output from vec_env..') ''' Runs a single multi-agent rl loop until the number of observation, `max_obs_count`, is reached. The observations vector is of length n, where n is the number of agents. 
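Throughout these gather/test loops, the vectorised environment is now assumed to return dictionaries rather than tuples; a minimal sketch of the unpacking convention, using the default key names of the new keyword arguments (obs_key="observations", succ_obs_key="succ_observations", reward_key="reward", done_key="done", info_key="info", succ_info_key="succ_info"):

    # step: actions in, batched dictionaries out
    env_output_dict = env.step(action)
    succ_observations = env_output_dict["succ_observations"]
    reward = env_output_dict["reward"]
    done = env_output_dict["done"]
    succ_info = env_output_dict["succ_info"]

    # reset: optionally restricted to the actors whose episode just ended
    env_reset_output_dict = env.reset(env_configs=env_configs, env_indices=[actor_index])
    observations = env_reset_output_dict["observations"]
    info = env_reset_output_dict["info"]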
@@ -385,17 +450,30 @@ def learner_loop( return agent #@ray.remote(num_gpus=0.5) -def gather_experience_parallel(task, - agent, - training, - max_obs_count=1e7, - test_obs_interval=1e4, - test_nbr_episode=10, - env_configs=None, - sum_writer=None, - base_path='./', - benchmarking_record_episode_interval=None, - step_hooks=[]): +def gather_experience_parallel( + task, + agent, + training, + max_obs_count=1e7, + test_obs_interval=1e4, + test_nbr_episode=10, + env_configs=None, + sum_writer=None, + base_path='./', + benchmarking_record_episode_interval=None, + save_traj_length_divider=1, + render_mode="rgb_array", + step_hooks=[], + sad=False, + vdn=False, + nbr_players=2, + obs_key="observations", + succ_obs_key="succ_observations", + reward_key="reward", + done_key="done", + info_key="info", + succ_info_key="succ_info", + ): ''' Runs a single multi-agent rl loop until the number of observation, `max_obs_count`, is reached. The observations vector is of length n, where n is the number of agents. @@ -415,7 +493,10 @@ def gather_experience_parallel(task, env = task.env test_env = task.test_env - observations, info = env.reset(env_configs=env_configs) + #observations, info = env.reset(env_configs=env_configs) + env_reset_output_dict = env.reset(env_configs=env_configs) + observations = env_reset_output_dict[obs_key] + info = env_reset_output_dict[info_key] nbr_actors = env.get_nbr_envs() agent.set_nbr_actor(nbr_actors) @@ -438,15 +519,26 @@ def gather_experience_parallel(task, if isinstance(sum_writer, str): sum_writer_path = os.path.join(sum_writer, 'actor.log') sum_writer = SummaryWriter(sum_writer_path, flush_secs=1) - agent.algorithm.summary_writer = sum_writer - + #agent.algorithm.summary_writer = sum_writer + if agent.training: + agent.algorithm.unwrapped.summary_writer = sum_writer + else: + algo = getattr(agent, "algorithm", None) + if algo is not None: + agent.algorithm.unwrapped.summary_writer = None + while True: action = agent.take_action( state=observations, infos=info ) - succ_observations, reward, done, succ_info = env.step(action) + #succ_observations, reward, done, succ_info = env.step(action) + env_output_dict = env.step(action) + succ_observations = env_output_dict[succ_obs_key] + reward = env_output_dict[reward_key] + done = env_output_dict[done_key] + succ_info = env_output_dict[succ_info_key] if training: agent.handle_experience( @@ -465,7 +557,23 @@ def gather_experience_parallel(task, for hook in step_hooks: hook(env, agent, obs_count) + pa_obs = observations[actor_index] + pa_a = action[actor_index] + pa_r = reward[actor_index] + pa_succ_obs = succ_observations[actor_index] + pa_done = done[actor_index] + pa_int_r = 0.0 + + if getattr(agent.algorithm, "use_rnd", False): + get_intrinsic_reward = getattr(agent, "get_intrinsic_reward", None) + if callable(get_intrinsic_reward): + pa_int_r = agent.get_intrinsic_reward(actor_index) + + per_actor_trajectories[actor_index].append( (pa_obs, pa_a, pa_r, pa_int_r, pa_succ_obs, pa_done) ) + + #//////////////////////////////////////////////////////////////////////////////////////// # Bookkeeping of the actors whose episode just ended: + #//////////////////////////////////////////////////////////////////////////////////////// if done[actor_index]: agent.reset_actors(indices=[actor_index]) @@ -473,17 +581,11 @@ def gather_experience_parallel(task, if done_condition: update_count = agent.get_update_count() episode_count += 1 - ################# - # Previously: - ################# - #succ_observations[actor_index], 
succ_info[actor_index] = env.reset(env_configs=env_configs, env_indices=[actor_index]) - # account for list formatting of infos: - #succ_info[actor_index] = succ_info[actor_index][0] - #agent.reset_actors(indices=[actor_index]) - ################# - # New: - ################# - succ_observations, succ_info = env.reset(env_configs=env_configs, env_indices=[actor_index]) + #succ_observations, succ_info = env.reset(env_configs=env_configs, env_indices=[actor_index]) + env_reset_output_dict = env.reset(env_configs=env_configs, env_indices=[actor_index]) + succ_observations = env_reset_output_dict[obs_key] + succ_info = env_reset_output_dict[info_key] + agent.reset_actors(indices=[actor_index]) ################# @@ -499,10 +601,10 @@ def gather_experience_parallel(task, sum_writer.add_scalar('PerUpdate/TotalReturn', total_returns[-1], update_count) if actor_index == 0: sample_episode_count += 1 - sum_writer.add_scalar(f'data/reward_{actor_index}', total_returns[-1], sample_episode_count) - sum_writer.add_scalar(f'PerObservation/Actor_{actor_index}_Reward', total_returns[-1], obs_count) - sum_writer.add_scalar(f'PerUpdate/Actor_{actor_index}_Reward', total_returns[-1], update_count) - sum_writer.add_scalar('Training/TotalIntReturn', total_int_returns[-1], episode_count) + ##sum_writer.add_scalar(f'data/reward_{actor_index}', total_returns[-1], sample_episode_count) + #sum_writer.add_scalar(f'PerObservation/Actor_{actor_index}_Reward', total_returns[-1], obs_count) + #sum_writer.add_scalar(f'PerUpdate/Actor_{actor_index}_Reward', total_returns[-1], update_count) + #sum_writer.add_scalar('Training/TotalIntReturn', total_int_returns[-1], episode_count) sum_writer.flush() if len(trajectories) >= nbr_actors: @@ -538,19 +640,8 @@ def gather_experience_parallel(task, per_actor_trajectories[actor_index] = list() - pa_obs = observations[actor_index] - pa_a = action[actor_index] - pa_r = reward[actor_index] - pa_succ_obs = succ_observations[actor_index] - pa_done = done[actor_index] - pa_int_r = 0.0 - - if getattr(agent.algorithm, "use_rnd", False): - get_intrinsic_reward = getattr(agent, "get_intrinsic_reward", None) - if callable(get_intrinsic_reward): - pa_int_r = agent.get_intrinsic_reward(actor_index) - per_actor_trajectories[actor_index].append( (pa_obs, pa_a, pa_r, pa_int_r, pa_succ_obs, pa_done) ) - + #//////////////////////////////////////////////////////////////////////////////////////// + #//////////////////////////////////////////////////////////////////////////////////////// if test_nbr_episode != 0 and obs_count % test_obs_interval == 0: save_traj = False @@ -558,14 +649,18 @@ def gather_experience_parallel(task, save_traj = (obs_count%benchmarking_record_episode_interval==0) # TECHNICAL DEBT: clone_agent.get_update_count is failing because the update count param is None # haven't figured out why is the cloning function making it None... 
- test_agent(env=test_env, - agent=agent.clone(training=False), - update_count=agent.get_update_count(), - nbr_episode=test_nbr_episode, - sum_writer=sum_writer, - iteration=obs_count, - base_path=base_path, - save_traj=save_traj) + test_agent( + env=test_env, + agent=agent.clone(training=False), + update_count=agent.get_update_count(), + nbr_episode=test_nbr_episode, + sum_writer=sum_writer, + iteration=obs_count, + base_path=base_path, + save_traj=save_traj, + render_mode=render_mode, + save_traj_length_divider=save_traj_length_divider + ) observations = copy.deepcopy(succ_observations) info = copy.deepcopy(succ_info) diff --git a/regym/thirdparty/ReferentialGym b/regym/thirdparty/ReferentialGym new file mode 160000 index 00000000..b38650ce --- /dev/null +++ b/regym/thirdparty/ReferentialGym @@ -0,0 +1 @@ +Subproject commit b38650ce2a170cde3159358cd192c95fdfc148ed diff --git a/regym/util/__init__.py b/regym/util/__init__.py index 0eb7cfa9..e46f0abc 100755 --- a/regym/util/__init__.py +++ b/regym/util/__init__.py @@ -1,5 +1,5 @@ from .play_matches import play_single_match, play_multiple_matches from .play_matches import extract_winner from .utils import save_traj_with_graph -from .minerl import * +#from .minerl import * from .wrappers import * diff --git a/regym/util/experiment_parsing.py b/regym/util/experiment_parsing.py index 3bd29910..6dd2e705 100755 --- a/regym/util/experiment_parsing.py +++ b/regym/util/experiment_parsing.py @@ -9,7 +9,9 @@ from regym.training_schemes import DeltaDistributionalSelfPlay from regym.rl_algorithms import build_DQN_Agent +from regym.rl_algorithms import build_DQN_HER_Agent from regym.rl_algorithms import build_THER_Agent +from regym.rl_algorithms import build_THER2_Agent from regym.rl_algorithms import build_R2D2_Agent from regym.rl_algorithms import build_R2D3_Agent from regym.rl_algorithms import build_TabularQ_Agent @@ -62,7 +64,9 @@ def initialize_agents(task, agent_configurations): ''' def partial_match_build_function(agent_name, task, config): if 'tabularqlearning' in agent_name.lower(): return build_TabularQ_Agent(task, config, agent_name) + if 'dqnher' in agent_name.lower(): return build_DQN_HER_Agent(task, config, agent_name) if 'dqn' in agent_name.lower(): return build_DQN_Agent(task, config, agent_name) + if 'ther2' in agent_name.lower(): return build_THER2_Agent(task, config, agent_name) if 'ther' in agent_name.lower(): return build_THER_Agent(task, config, agent_name) if 'r2d2' in agent_name.lower(): return build_R2D2_Agent(task, config, agent_name) if 'r2d3' in agent_name.lower(): return build_R2D3_Agent(task, config, agent_name) diff --git a/regym/util/minerl/action_discretisation.py b/regym/util/minerl/action_discretisation.py index 9a05a02a..ff3f0588 100755 --- a/regym/util/minerl/action_discretisation.py +++ b/regym/util/minerl/action_discretisation.py @@ -1,7 +1,5 @@ from typing import Callable, List, Dict, Union -from sklearn.cluster import KMeans -from sklearn.metrics import pairwise_distances import numpy as np import minerl @@ -93,6 +91,7 @@ def get_kmeans_actions(env:str,path:str,trajectory_names:np.ndarray,n_clusters:i :returns: - kmeans_actions: Numpy array of actions found by kmeans ''' + from sklearn.cluster import KMeans data = minerl.data.make(env,path) actions = [] @@ -119,7 +118,8 @@ def get_action_set(env:str,path:str,n_clusters:int,score_percent:float=0.9,agree :returns: - actions_set: Numpy array of actions found by kmeans and inventory actions ''' - + from sklearn.metrics import pairwise_distances + good_demos = 
get_good_demo_names(env,path,score_percent) inventory_actions = get_inventory_actions(env,path,good_demos,agreement_percent) kmeans_actions = get_kmeans_actions(env,path,good_demos,n_clusters) diff --git a/regym/util/utils.py b/regym/util/utils.py index a8200c0a..430c6b23 100644 --- a/regym/util/utils.py +++ b/regym/util/utils.py @@ -5,7 +5,10 @@ import matplotlib.animation as anim import os -def save_traj_with_graph(trajectory, data, episode=0, actor_idx=0, path='./', divider=10, colors=['blue', 'green', 'red', 'yellow', 'orange', 'black', 'grey'], markers=['o', 's', 'p', 'P', '*', 'h', 'H']): +from celluloid import Camera + + +def save_traj_with_graph_depr(trajectory, data, episode=0, actor_idx=0, path='./', divider=10, colors=['blue', 'green', 'red', 'yellow', 'orange', 'black', 'grey'], markers=['o', 's', 'p', 'P', '*', 'h', 'H']): path = './'+path fig = plt.figure() imgs = [] @@ -60,3 +63,58 @@ def save_traj_with_graph(trajectory, data, episode=0, actor_idx=0, path='./', di print(f"Issue while saving trajectory: {e}") plt.close(fig) + + +def save_traj_with_graph(trajectory, data, episode=0, actor_idx=0, path='./', divider=10, colors=['blue', 'green', 'red', 'yellow', 'orange', 'black', 'grey'], markers=['o', 's', 'p', 'P', '*', 'h', 'H']): + path = './'+path + fig, axes = plt.subplots(2) + camera = Camera(fig) + + imgs = [] + gd = [[] for _ in data] # one independent list per data series + for idx, state in enumerate(trajectory): + if state.shape[-1] != 3: + # handled Stacked images... + img_ch = 3 + if state.shape[-1] % 3: img_ch = 1 + per_image_first_channel_indices = range(0,state.shape[-1]+1,img_ch) + ims = [ state[...,idx_begin:idx_end] for idx_begin, idx_end in zip(per_image_first_channel_indices,per_image_first_channel_indices[1:])] + for img in ims: + imgs.append(img.squeeze()) + for didx, d in enumerate(data): + gd[didx].append(d[idx]) + else: + imgs.append(state) + for didx, d in enumerate(data): + gd[didx].append(d[idx]) + gifimgs = [] + for idx, img in enumerate(imgs): + if idx%divider: continue + + axes[0].imshow(img) + + axes[1].set_xlim(left=0,right=idx+10) + for didx, d in enumerate(gd): + x = np.arange(0,idx,1) + y = np.asarray(d[:idx]) + axes[1].plot( + x, + y, + color=colors[didx%len(colors)], + marker=markers[didx%len(markers)], + linestyle='dashed', + linewidth=1, + markersize=2 + ) + + camera.snap() + + gif = camera.animate() + #gif = anim.ArtistAnimation(fig, gifimgs, interval=200, blit=True, repeat_delay=None) + path = os.path.join(path, f'./traj_ep{episode}_actor{actor_idx}.mp4') + try: + gif.save(path, dpi=None, writer='imagemagick') + except Exception as e: + print(f"Issue while saving trajectory: {e}") + + plt.close(fig) diff --git a/regym/util/wrappers.py b/regym/util/wrappers.py index a90a8a1e..19d7bef9 100644 --- a/regym/util/wrappers.py +++ b/regym/util/wrappers.py @@ -100,17 +100,25 @@ def get_nbr_envs(self): def set_nbr_envs(self, nbr_envs): self.env.set_nbr_envs(nbr_envs) + def render(self, render_mode="rgb_array", env_indices=None) : + return self.env.render(render_mode=render_mode, env_indices=env_indices) + + def close(self): + return self.env.close() + def reset(self, **kwargs): - next_obs, next_infos = self.env.reset(**kwargs) - + input_dict = self.env.reset(**kwargs) + nvdn_next_obs = input_dict["observations"] + nvdn_next_infos = input_dict["info"] + vdn_obs = np.concatenate( - next_obs, + nvdn_next_obs, axis=0 ) next_obs = [vdn_obs] list_infos = [] - for li in next_infos: + for li in nvdn_next_infos: for k in range(len(li)): list_infos.append(li[k]) """ @@ -124,7 +132,15 @@ def
reset(self, **kwargs): """ next_infos = [list_infos] - return next_obs, next_infos + output_dict = { + "observations":nvdn_next_obs, + "info":nvdn_next_infos, + + "vdn_observations":next_obs, + "vdn_info":next_infos, + } + + return output_dict def step(self, action, **kwargs): assert isinstance(action, list) and len(action)==1, "action argument must be a singleton list of dictionnary (SAD) or tensor." @@ -150,27 +166,34 @@ def step(self, action, **kwargs): a = action[0][pidx*nbr_env:(pidx+1)*nbr_env, ...] env_action.append(a) - next_obs, reward, done, next_infos = self.env.step(env_action, **kwargs) - + nonvdn_action = env_action + env_output_dict = self.env.step(env_action, **kwargs) + if "actions" in env_output_dict: + nonvdn_action = env_output_dict["actions"] + nvdn_next_obs = env_output_dict["succ_observations"] + nvdn_reward = env_output_dict["reward"] + nvdn_done = env_output_dict["done"] + nvdn_next_infos = env_output_dict["succ_info"] + next_obs = [ np.concatenate( - next_obs, + nvdn_next_obs, axis=0 ) ] # 1 x (batch_size*num_player, ...) - reward_shape = reward[0].shape + reward_shape = nvdn_reward[0].shape reward = [ np.concatenate( - reward, + nvdn_reward, axis=0 ) ] # 1 x (batch_size*num_player, ...) list_infos = [] - for li in next_infos: + for li in nvdn_next_infos: for k in range(len(li)): list_infos.append(li[k]) @@ -185,7 +208,21 @@ def step(self, action, **kwargs): # 1 x key x (batch_size*num_player, ...) """ - return next_obs, reward, done, next_infos + output_dict = { + "actions":nonvdn_action, + + "succ_observations":nvdn_next_obs, + "reward":nvdn_reward, + "done":nvdn_done, + "succ_info":nvdn_next_infos, + + "vdn_succ_observations":next_obs, + "vdn_reward":reward, + "vdn_done":nvdn_done, + "vdn_succ_info":next_infos + } + + return output_dict # # Wrappers: @@ -1687,6 +1724,12 @@ def get_nbr_envs(self): def set_nbr_envs(self, nbr_envs): self.env.set_nbr_envs(nbr_envs) + def render(self, render_mode="rgb_array", env_indices=None): + return self.env.render(render_mode=render_mode, env_indices=env_indices) + + def close(self): + return self.env.close() + def reset(self, **kwargs): next_obs, next_infos = self.env.reset(**kwargs) @@ -1725,7 +1768,7 @@ def step(self, action, **kwargs): return next_obs, reward, done, next_infos class SADVecEnvWrapper(object): - def __init__(self, env, nbr_actions): + def __init__(self, env, nbr_actions, otherplay=False): """ Simplified Action Decoder wrapper expects the action argument for the step method to be a list of dictionnary containing the following keys: @@ -1738,6 +1781,7 @@ def __init__(self, env, nbr_actions): an extra player_offset tensor. 
""" self.env = env + self.otherplay=otherplay self.nbr_actions = nbr_actions self.nbr_players = None self.current_player_idx = None @@ -1748,11 +1792,21 @@ def get_nbr_envs(self): def set_nbr_envs(self, nbr_envs): self.env.set_nbr_envs(nbr_envs) - def reset(self, **kwargs): - next_obs, next_infos = self.env.reset(**kwargs) + def render(self, render_mode="rgb_array", env_indices=None): + return self.env.render(render_mode=render_mode, env_indices=env_indices) + + def close(self): + return self.env.close() + def reset(self, **kwargs): + input_dict = self.env.reset(**kwargs) + next_obs = input_dict["observations"] + next_infos = input_dict["info"] + self.nbr_players = len(next_obs) - self.current_player_idx = [i["current_player"].item() for i in next_infos[0]] + self.current_player_idx = None + if 'current_player' in next_infos[0][0]: + self.current_player_idx = [i["current_player"].item() for i in next_infos[0]] # (nbr_env, ) for player_idx in range(2): @@ -1764,7 +1818,12 @@ def reset(self, **kwargs): axis=-1, ) - return next_obs, next_infos + output_dict = { + "observations":next_obs, + "info":next_infos, + } + + return output_dict def step(self, action, **kwargs): assert isinstance(action, list), "action argument must be a list of dictionnary (or tensors if test-time...)." @@ -1776,15 +1835,46 @@ def step(self, action, **kwargs): else: env_action = action - next_obs, reward, done, next_infos = self.env.step(env_action, **kwargs) - + #next_obs, reward, done, next_infos = self.env.step(env_action, **kwargs) + env_output_dict = self.env.step(env_action, **kwargs) + next_obs = env_output_dict["succ_observations"] + reward = env_output_dict["reward"] + done = env_output_dict["done"] + next_infos = env_output_dict["succ_info"] + for player_idx in range(self.nbr_players): for env_idx in range(len(next_infos[player_idx])): - current_player = self.current_player_idx[env_idx] + current_player = None + if self.current_player_idx is not None: + current_player = self.current_player_idx[env_idx] + else: + # assuming self.nbr_players==2... 
+ current_player = self.nbr_players-(player_idx+1) relative_current_player_idx = (current_player-player_idx) % self.nbr_players if isinstance(action[0], dict): #ga = action[other_idx]["greedy_action"][env_idx] ga = action[current_player]["greedy_action"][env_idx] + if self.otherplay: + # expects env to be wrapped with DiscreteCombinedActionWrapper: + dcaw_env = self.env.env_processes[env_idx] + while not hasattr(dcaw_env, "_decode_action"): + dcaw_env = dcaw_env.env + # expects other play wrapper: + ow_env = dcaw_env.env + # decode current player's action in the original env: + decoded_ga = ow_env._decode_action( + action=dcaw_env._decode_action(ga), + player_id=current_player, + ) + # encode current player's action into other player's env: + otherplayer_encoded_ga = dcaw_env._encode_action( + action_dict=ow_env._encode_action( + action=decoded_ga, + player_id=player_idx, # other player's view point + ) + ) + # int + ga = otherplayer_encoded_ga else: #ga = action[other_idx][env_idx] ga = action[current_player][env_idx] @@ -1798,10 +1888,22 @@ def step(self, action, **kwargs): ) # update: - self.current_player_idx = [i["current_player"].item() for i in next_infos[0]] + if 'current_player' in next_infos[0][0]: + self.current_player_idx = [i["current_player"].item() for i in next_infos[0]] + else: + self.current_player_idx = None # (nbr_env, ) - return next_obs, reward, done, next_infos + output_dict = { + "actions":env_action, #non-sad actions + + "succ_observations":next_obs, + "reward":reward, + "done":done, + "succ_info":next_infos, + } + + return output_dict diff --git a/setup.py b/setup.py index 8109c860..200ce887 100755 --- a/setup.py +++ b/setup.py @@ -14,10 +14,10 @@ setup( name='regym', version='0.0.1', - description='Framework to carry out (Multi Agent) Reinforcement Learning experiments. Developed by PhD heros at the University of York.', + description='Framework to carry out (Multi-Agent) Deep Reinforcement Learning experiments.', long_description=long_description, long_description_content_type='text/markdown', - url='https://github.com/Danielhp95/Generalized-RL-Self-Play-Framework', + url='https://github.com/Danielhp95/Regym', author='IGGI PhD Programme', author_email='danielhp95@gmail.com', @@ -30,20 +30,24 @@ packages=find_packages(), zip_safe=False, - install_requires=['gym', - 'matplotlib', - 'docopt', - 'pyyaml', - 'pip', - 'tensorboardx', - 'opencv-python', - 'torch', - 'torchvision', - 'cvxopt', - 'scipy', - 'minerl', - 'sklearn', - 'seaborn'] + test_requirements, + install_requires=[ + 'gym', + 'ray', + 'matplotlib', + 'docopt', + 'pyyaml', + 'pip', + 'tensorboardx', + 'opencv-python', + 'torch==1.8.1', + 'torchvision', + 'cvxopt', + 'scipy', + #'minerl', + 'celluloid', + 'sklearn', + 'seaborn' + ] + test_requirements, python_requires=">=3.6", )
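For the otherplay branch added to SADVecEnvWrapper above, the intent is to decode the acting player's greedy action with that player's own mapping and then re-encode it from the other player's viewpoint before it is written into that player's SAD info. Below is a toy sketch of that round trip under assumed semantics: the permutation table and the decode_action/encode_action helpers are hypothetical illustrations, not the wrapped envs' real _decode_action/_encode_action.

# Toy "other play" setup: each player indexes the same underlying action space
# through a player-specific permutation (hypothetical, for illustration only).
PERMUTATIONS = {
    0: [0, 1, 2, 3, 4],  # player 0: identity mapping
    1: [2, 0, 1, 4, 3],  # player 1: some fixed permutation
}

def decode_action(encoded_action, player_id):
    # Map a player-relative action index back to the underlying env action.
    return PERMUTATIONS[player_id][encoded_action]

def encode_action(env_action, player_id):
    # Map an underlying env action into a given player's relative indexing.
    return PERMUTATIONS[player_id].index(env_action)

def reencode_for_other_player(greedy_action, current_player, other_player):
    # Decode with the acting player's mapping, then re-encode for the other
    # player, mirroring the decode-then-re-encode shape of the otherplay branch.
    env_action = decode_action(greedy_action, player_id=current_player)
    return encode_action(env_action, player_id=other_player)

# Player 1's relative action 0 corresponds to env action 2, which player 0
# also indexes as 2 under the identity mapping.
assert reencode_for_other_player(0, current_player=1, other_player=0) == 2

The wrapper achieves the same effect by walking down to the DiscreteCombinedActionWrapper and the other-play wrapper it encloses, but the round trip shown here is the underlying idea.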