Hindsight PRIOR #12

Open · wants to merge 5 commits into master
105 changes: 105 additions & 0 deletions scripts/configs/hprior_awac/adroit.yaml
@@ -0,0 +1,105 @@
algorithm:
  class: Hindsight_PRIOR_AWAC
  beta: 0.3333
  max_exp_clip: 100.0
  reward_reg: 0.0
  max_seq_len: 100
  world_steps: 10000
  prior_coef: 1000
  rm_label: true

checkpoint: null
seed: 0
name: default
debug: false
device: null
wandb:
  activate: false
  entity: null
  project: null

env: pen-cloned-v1
env_kwargs:
env_wrapper:
env_wrapper_kwargs:

optim:
  default:
    class: Adam
    lr: 0.0003

network:
  world:
    embed_dim: 256
    num_layers: 3
    num_heads: 1
  reward:
    class: EnsembleMLP
    ensemble_size: 1
    hidden_dims: [256, 256, 256]
    reward_act: identity
  actor:
    class: SquashedGaussianActor
    hidden_dims: [256, 256, 256]
    reparameterize: false
    conditioned_logstd: false
    logstd_min: -5
    logstd_max: 2
  critic:
    class: Critic
    ensemble_size: 2
    hidden_dims: [256, 256, 256]

rm_dataset:
  - class: D4RLOfflineDataset
    env: pen-cloned-v1
    batch_size: 64
    mode: trajectory
    segment_length: 100
    padding_mode: none
  - class: IPLComparisonOfflineDataset
    env: pen-cloned-v1
    batch_size: 8
    mode: human
rm_dataloader:
  num_workers: 2
  batch_size: null

rl_dataset:
  - class: D4RLOfflineDataset
    env: pen-cloned-v1
    batch_size: 256
    mode: transition
    reward_normalize: true
rl_dataloader:
  num_workers: 2
  batch_size: null

trainer:
  env_freq: null
  rm_label: true
  rm_steps: 60000
  rl_steps: 500000
  log_freq: 500
  profile_freq: 500
  eval_freq: 5000

rm_eval:
  function: eval_world_model_and_reward_model
  eval_dataset_kwargs:
    class: IPLComparisonOfflineDataset
    env: pen-cloned-v1
    batch_size: 32
    mode: human
    eval: false
rl_eval:
  function: eval_offline
  num_ep: 10
  deterministic: true

schedulers:
  actor:
    class: CosineAnnealingLR
    T_max: 500000

processor: null
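All three configs added in this PR follow the same top-level layout (algorithm, wandb, optim, network, rm_dataset/rl_dataset, trainer, rm_eval/rl_eval, schedulers) and differ mainly in environment names, hidden sizes, and step counts. As a minimal sketch, assuming the indentation shown above and that the files are plain YAML (the repo presumably routes configs through its own parser, which this PR does not show), a field can be inspected directly:

```python
import yaml

# Illustrative only: read one of the configs added in this PR and inspect a few fields.
with open("scripts/configs/hprior_awac/adroit.yaml") as f:
    config = yaml.safe_load(f)

print(config["algorithm"]["class"])       # Hindsight_PRIOR_AWAC
print(config["algorithm"]["prior_coef"])  # 1000
print(config["network"]["reward"])        # dict of reward-head settings (EnsembleMLP, ...)
```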
106 changes: 106 additions & 0 deletions scripts/configs/hprior_awac/gym.yaml
@@ -0,0 +1,106 @@
algorithm:
  class: Hindsight_PRIOR_AWAC
  beta: 0.3333
  max_exp_clip: 100.0
  reward_reg: 0.0
  max_seq_len: 100
  world_steps: 10000
  prior_coef: 1000
Owner: This value seems very large. Could you record the scale of rm_loss and prior_loss and check whether this term actually dominates reward learning? By the way, what is the default value of prior_coef in the authors' implementation?

Collaborator (Author): The original paper uses 1000 (MetaWorld) and 5 (DMC).

Record:

| prior_coef | rm_loss | prior_loss |
| ---------- | ------- | ---------- |
| 1          | 0.3     | 0.015      |
| 100        | 0.5     | 1e-4       |
| 1000       | 0.7     | 5e-6       |
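For context on the numbers above, the sketch below shows one plausible way prior_coef could weight the hindsight-prior term against the preference (Bradley-Terry) loss when training the reward model. This is an assumption for illustration, not the code in this PR; the function and argument names are hypothetical.

```python
import torch
import torch.nn.functional as F

# Hypothetical sketch (not this PR's implementation): how prior_coef might
# combine the preference loss with the hindsight-prior regression loss.
def reward_model_loss(r_pred_1, r_pred_2, labels, r_pred_seq, prior_target, prior_coef=1000.0):
    """
    r_pred_1, r_pred_2: (B,) summed predicted returns of the two segments in a comparison pair
    labels:             (B,) preference labels in {0, 1}, 1 meaning segment 2 is preferred
    r_pred_seq:         (B, T) per-step predicted rewards on trajectory segments
    prior_target:       (B, T) per-step credit-assignment targets from the world model
    """
    # Bradley-Terry preference loss (the rm_loss column above)
    logits = r_pred_2 - r_pred_1
    rm_loss = F.binary_cross_entropy_with_logits(logits, labels.float())

    # Hindsight-prior loss: regress per-step rewards toward the world-model targets
    prior_loss = F.mse_loss(r_pred_seq, prior_target)

    # prior_coef trades off the two terms
    return rm_loss + prior_coef * prior_loss, rm_loss.detach(), prior_loss.detach()
```

Under this reading, a larger prior_coef drives prior_loss toward zero at the cost of a somewhat higher rm_loss, which matches the trend in the recorded table.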

  rm_label: true

checkpoint: null
seed: 0
name: default
debug: false
device: null
wandb:
  activate: false
  entity: null
  project: null

env: hopper-medium-replay-v2
env_kwargs:
env_wrapper:
env_wrapper_kwargs:

optim:
  default:
    class: Adam
    lr: 0.0003

network:
  world:
    embed_dim: 256
    num_layers: 3
    num_heads: 1
  reward:
    class: EnsembleMLP
    ensemble_size: 1
    hidden_dims: [256, 256]
    reward_act: sigmoid
  actor:
    class: SquashedGaussianActor
    hidden_dims: [256, 256]
    reparameterize: false
    conditioned_logstd: false
    logstd_min: -5
    logstd_max: 2
  critic:
    class: Critic
    ensemble_size: 2
    hidden_dims: [256, 256]

rm_dataset:
  - class: D4RLOfflineDataset
    env: hopper-medium-replay-v2
    batch_size: 64
    mode: trajectory
    segment_length: 100
    padding_mode: none
  - class: IPLComparisonOfflineDataset
    env: hopper-medium-replay-v2
    batch_size: 8
    segment_length: null
    mode: human
rm_dataloader:
  num_workers: 2
  batch_size: null

rl_dataset:
  - class: D4RLOfflineDataset
    env: hopper-medium-replay-v2
    batch_size: 256
    mode: transition
    reward_normalize: true
rl_dataloader:
  num_workers: 2
  batch_size: null

trainer:
  env_freq: null
  rm_label: true
  rm_steps: 60000
  rl_steps: 1000000
  log_freq: 500
  profile_freq: 500
  eval_freq: 5000

rm_eval:
  function: eval_world_model_and_reward_model
  eval_dataset_kwargs:
    class: IPLComparisonOfflineDataset
    env: hopper-medium-replay-v2
    batch_size: 32
    mode: human
    eval: false
rl_eval:
  function: eval_offline
  num_ep: 10
  deterministic: true

schedulers:
  actor:
    class: CosineAnnealingLR
    T_max: 1000000

processor: null
104 changes: 104 additions & 0 deletions scripts/configs/hprior_awac/metaworld.yaml
@@ -0,0 +1,104 @@
algorithm:
  class: Hindsight_PRIOR_AWAC
  beta: 0.3333
  max_exp_clip: 100.0
  reward_reg: 0.0
  max_seq_len: 25
  world_steps: 10000
  prior_coef: 1000
  rm_label: true

checkpoint: null
seed: 0
name: default
debug: false
device: null
wandb:
  activate: false
  entity: null
  project: null

env: button-press-v2
env_kwargs:
env_wrapper:
env_wrapper_kwargs:

optim:
  default:
    class: Adam
    lr: 0.0003

network:
  world:
    embed_dim: 256
    num_layers: 3
    num_heads: 1
  reward:
    class: EnsembleMLP
    ensemble_size: 1
    hidden_dims: [256, 256, 256]
    reward_act: identity
    ortho_init: true
  actor:
    class: SquashedGaussianActor
    hidden_dims: [256, 256, 256]
    reparameterize: false
    conditioned_logstd: false
    logstd_min: -5
    logstd_max: 2
  critic:
    class: Critic
    ensemble_size: 2
    hidden_dims: [256, 256, 256]

rm_dataset:
  - class: MetaworldOfflineDataset
    env: button-press-v2
    batch_size: 16
    capacity: 5000
  - class: MetaworldComparisonDataset
    env: button-press-v2
    batch_size: 16
    segment_length: null
    capacity: 500
rm_dataloader:
  num_workers: 2
  batch_size: null

rl_dataset:
  - class: MetaworldOfflineDataset
    env: button-press-v2
    batch_size: 16
    capacity: 5000
rl_dataloader:
  num_workers: 2
  batch_size: null

trainer:
  env_freq: null
  rm_label: true
  rm_steps: 60000
  rl_steps: 500000
  log_freq: 500
  profile_freq: 500
  eval_freq: 10000

rm_eval:
  function: eval_world_model_and_reward_model
  eval_dataset_kwargs:
    class: MetaworldComparisonDataset
    env: button-press-v2
    batch_size: 32
    segment_length: null
    capacity: 500
rl_eval:
  function: eval_offline
  num_ep: 20
  deterministic: true

schedulers:
  actor:
    class: CosineAnnealingLR
    T_max: 500000

processor: null