[RLlib] Benchmark and regression test yaml cleanup and restructuring. (#8414)

Sven Mika 2020-05-26 11:10:27 +02:00 committed by GitHub
parent ae2e1f0883
commit baa053496a
89 changed files with 614 additions and 584 deletions

View file

@ -37,70 +37,120 @@
# Tag: learning_tests
#
# This will test all yaml files (via `rllib train`)
# inside rllib/tuned_examples/regression_tests for actual learning success.
# inside rllib/tuned_examples/[algo-name] for actual learning success.
# --------------------------------------------------------------------
# A2C/A3C
py_test(
name = "run_regression_tests_cartpole_pg_a3c_tf",
name = "regression_test_a2c_cartpole_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-pg-tf.yaml",
"tuned_examples/regression_tests/cartpole-a3c-tf.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/a3c/cartpole-a2c.yaml"],
args = ["--yaml-dir=tuned_examples/a3c"]
)
py_test(
name = "regression_test_a2c_cartpole_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/a3c/cartpole-a2c.yaml"],
args = ["--yaml-dir=tuned_examples/a3c", "--torch"]
)
py_test(
name = "regression_test_a3c_cartpole_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/a3c/cartpole-a3c.yaml"],
args = ["--yaml-dir=tuned_examples/a3c"]
)
py_test(
name = "regression_test_a3c_cartpole_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/a3c/cartpole-a3c.yaml"],
args = ["--yaml-dir=tuned_examples/a3c", "--torch"]
)
# APPO
py_test(
name = "run_regression_tests_cartpole_appo_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-appo-tf.yaml",
"tuned_examples/ppo/cartpole-appo.yaml",
"tuned_examples/ppo/cartpole-appo-vtrace.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
args = ["--yaml-dir=tuned_examples/ppo"]
)
py_test(
name = "run_regression_tests_cartpole_appo_vtrace_tf",
name = "run_regression_tests_cartpole_appo_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-appo-vtrace-tf.yaml",
"tuned_examples/ppo/cartpole-appo.yaml",
"tuned_examples/ppo/cartpole-appo-vtrace.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_es_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-es-tf.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
args = ["--yaml-dir=tuned_examples/ppo", "--torch"]
)
# ARS
py_test(
name = "run_regression_tests_cartpole_ars_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-ars-tf.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/ars/cartpole-ars.yaml"],
args = ["--yaml-dir=tuned_examples/ars"]
)
py_test(
name = "run_regression_tests_cartpole_ars_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/ars/cartpole-ars.yaml"],
args = ["--yaml-dir=tuned_examples/ars", "--torch"]
)
# DDPG
py_test(
name = "run_regression_tests_pendulum_ddpg_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = glob(["tuned_examples/ddpg/pendulum-ddpg.yaml"]),
args = ["--yaml-dir=tuned_examples/ddpg"]
)
py_test(
name = "run_regression_tests_pendulum_ddpg_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = glob(["tuned_examples/ddpg/pendulum-ddpg.yaml"]),
args = ["--torch", "--yaml-dir=tuned_examples/ddpg"]
)
# DQN/Simple-Q
py_test(
name = "run_regression_tests_cartpole_dqn_tf",
main = "tests/run_regression_tests.py",
@ -108,95 +158,11 @@ py_test(
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-simpleq-tf.yaml",
"tuned_examples/regression_tests/cartpole-dqn-tf.yaml",
"tuned_examples/regression_tests/cartpole-dqn-param-noise-tf.yaml",
"tuned_examples/dqn/cartpole-simpleq.yaml",
"tuned_examples/dqn/cartpole-dqn.yaml",
"tuned_examples/dqn/cartpole-dqn-param-noise.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_impala_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-impala-tf.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_sac_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-sac-tf.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_ppo_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-ppo-tf.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_a2c_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-a2c-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_appo_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-appo-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_appo_vtrace_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-appo-vtrace-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
)
py_test(
name = "run_regression_tests_cartpole_ars_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-ars-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
args = ["--yaml-dir=tuned_examples/dqn"]
)
py_test(
@ -206,91 +172,177 @@ py_test(
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-dqn-param-noise-torch.yaml"
"tuned_examples/dqn/cartpole-simpleq.yaml",
"tuned_examples/dqn/cartpole-dqn.yaml",
"tuned_examples/dqn/cartpole-dqn-param-noise.yaml",
],
args = ["BAZEL", "tuned_examples/regression_tests"]
args = ["--yaml-dir=tuned_examples/dqn", "--torch"]
)
# ES
py_test(
name = "run_regression_tests_cartpole_es_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/es/cartpole-es.yaml"],
args = ["--yaml-dir=tuned_examples/es"]
)
py_test(
name = "run_regression_tests_cartpole_es_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-es-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/es/cartpole-es.yaml"],
args = ["--yaml-dir=tuned_examples/es", "--torch"]
)
# IMPALA
py_test(
name = "run_regression_tests_cartpole_impala_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/impala/cartpole-impala.yaml"],
args = ["--yaml-dir=tuned_examples/impala"]
)
py_test(
name = "run_regression_tests_cartpole_impala_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-impala-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/impala/cartpole-impala.yaml"],
args = ["--yaml-dir=tuned_examples/impala", "--torch"]
)
# PG
py_test(
name = "run_regression_tests_cartpole_pg_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/pg/cartpole-pg.yaml"],
args = ["--yaml-dir=tuned_examples/pg"]
)
py_test(
name = "run_regression_tests_cartpole_pg_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-pg-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/pg/cartpole-pg.yaml"],
args = ["--yaml-dir=tuned_examples/pg", "--torch"]
)
# PPO
py_test(
name = "run_regression_tests_cartpole_ppo_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/ppo/cartpole-ppo.yaml"],
args = ["--yaml-dir=tuned_examples/ppo"]
)
py_test(
name = "run_regression_tests_cartpole_ppo_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/ppo/cartpole-ppo.yaml"],
args = ["--yaml-dir=tuned_examples/ppo", "--torch"]
)
py_test(
name = "run_regression_tests_pendulum_ppo_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-ppo-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/ppo/pendulum-ppo.yaml"],
args = ["--yaml-dir=tuned_examples/ppo"]
)
py_test(
name = "run_regression_tests_pendulum_ppo_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/ppo/pendulum-ppo.yaml"],
args = ["--torch", "--yaml-dir=tuned_examples/ppo"]
)
# SAC
py_test(
name = "run_regression_tests_cartpole_sac_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_cartpole"],
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/sac/cartpole-sac.yaml"],
args = ["--yaml-dir=tuned_examples/sac"]
)
py_test(
name = "run_regression_tests_cartpole_sac_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_cartpole"],
size = "large",
size = "medium",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/regression_tests/cartpole-sac-torch.yaml"
],
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/sac/cartpole-sac.yaml"],
args = ["--yaml-dir=tuned_examples/sac", "--torch"]
)
py_test(
name = "run_regression_tests_pendulum_tf",
name = "run_regression_tests_pendulum_sac_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_pendulum"],
size = "enormous", # = 60min timeout
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = glob(["tuned_examples/regression_tests/pendulum-*-tf.yaml"]),
# Pass `BAZEL` option and the path to look for yaml regression files.
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/sac/pendulum-sac.yaml"],
args = ["--yaml-dir=tuned_examples/sac"]
)
py_test(
name = "run_regression_tests_pendulum_torch",
name = "run_regression_tests_pendulum_sac_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_pendulum"],
size = "enormous", # = 60min timeout
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = glob(["tuned_examples/regression_tests/pendulum-*-torch.yaml"]),
# Pass `BAZEL` option and the path to look for yaml regression files.
args = ["BAZEL", "tuned_examples/regression_tests"]
data = ["tuned_examples/sac/pendulum-sac.yaml"],
args = ["--yaml-dir=tuned_examples/sac", "--torch"]
)
# TD3
py_test(
name = "run_regression_tests_pendulum_td3_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/ddpg/pendulum-td3.yaml"],
args = ["--yaml-dir=tuned_examples/ddpg"]
)
py_test(
name = "run_regression_tests_pendulum_td3_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/ddpg/pendulum-td3.yaml"],
args = ["--yaml-dir=tuned_examples/ddpg", "--torch"]
)
# --------------------------------------------------------------------

View file

@ -200,7 +200,8 @@ def build_ddpg_stats(policy, batch):
"mean_q": torch.mean(policy.q_t),
"max_q": torch.max(policy.q_t),
"min_q": torch.min(policy.q_t),
"td_error": policy.td_error
"mean_td_error": torch.mean(policy.td_error),
"td_error": policy.td_error,
}
return stats

View file

@ -23,7 +23,9 @@ class TestDDPG(unittest.TestCase):
"""Test whether a DDPGTrainer can be built with both frameworks."""
config = ddpg.DEFAULT_CONFIG.copy()
config["num_workers"] = 0 # Run locally.
config["num_envs_per_worker"] = 2 # Run locally.
config["num_envs_per_worker"] = 2
config["learning_starts"] = 0
config["exploration_config"]["random_timesteps"] = 100
num_iterations = 2

View file

@ -9,13 +9,15 @@
# name = "run_regression_tests",
# main = "tests/run_regression_tests.py",
# tags = ["learning_tests"],
# size = "enormous", # = 60min timeout
# size = "medium", # 5min timeout
# srcs = ["tests/run_regression_tests.py"],
# data = glob(["tuned_examples/regression_tests/*.yaml"]),
# Pass `BAZEL` option and the path to look for yaml regression files.
# # Pass `BAZEL` option and the path to look for yaml regression files.
# args = ["BAZEL", "tuned_examples/regression_tests"]
# )
import argparse
import os
from pathlib import Path
import sys
import yaml
@ -24,30 +26,51 @@ import ray
from ray.tune import run_experiments
from ray.rllib import _register_all
if __name__ == "__main__":
# Bazel regression test mode: Get path to look for yaml files from argv[2].
if sys.argv[1] == "BAZEL":
# Get the path to use.
rllib_dir = Path(__file__).parent.parent
print("rllib dir={}".format(rllib_dir))
yaml_files = rllib_dir.rglob(sys.argv[2] + "/*.yaml")
yaml_files = sorted(
map(lambda path: str(path.absolute()), yaml_files), reverse=True)
# Normal mode: Get yaml files to run from command line.
else:
yaml_files = sys.argv[1:]
parser = argparse.ArgumentParser()
parser.add_argument(
"--torch",
action="store_true",
help="Runs all tests with PyTorch enabled.")
parser.add_argument(
"--yaml-dir",
type=str,
help="The directory in which to find all yamls to test.")
print("Will run the following regression files:")
if __name__ == "__main__":
args = parser.parse_args()
# Bazel regression test mode: Get path to look for yaml files from argv[2].
# Get the path or single file to use.
rllib_dir = Path(__file__).parent.parent
print("rllib dir={}".format(rllib_dir))
if not os.path.isdir(os.path.join(rllib_dir, args.yaml_dir)):
raise ValueError("yaml-dir ({}) not found!".format(args.yaml_dir))
yaml_files = rllib_dir.rglob(args.yaml_dir + "/*.yaml")
yaml_files = sorted(
map(lambda path: str(path.absolute()), yaml_files), reverse=True)
print("Will run the following regression tests:")
for yaml_file in yaml_files:
print("->", yaml_file)
# Loop through all collected files.
for yaml_file in yaml_files:
experiments = yaml.load(open(yaml_file).read())
assert len(experiments) == 1,\
"Error, can only run a single experiment per yaml file!"
print("== Test config ==")
print(yaml.dump(experiments))
# Add torch option to exp configs.
for exp in experiments.values():
if args.torch:
exp["config"]["use_pytorch"] = True
# Try running each test 3 times and make sure it reaches the given
# reward.
passed = False
for i in range(3):
try:

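The hunk above ends inside the retry loop. As a minimal sketch only (reusing the names already defined above: `experiments`, `run_experiments`, `_register_all`, `passed`, and assuming each returned trial exposes `last_result["episode_reward_mean"]`; not necessarily the verbatim committed code), the 3-attempt pass/fail check can be completed like this:

    # Illustration only: complete the retry loop sketched above.
    stop_reward = list(experiments.values())[0]["stop"]["episode_reward_mean"]
    for i in range(3):
        try:
            trials = run_experiments(experiments)
        finally:
            _register_all()  # Re-register algos/envs between attempts.
        # Pass as soon as any trial reaches the yaml's stopping reward.
        if any(t.last_result["episode_reward_mean"] >= stop_reward
               for t in trials):
            passed = True
            break
    if not passed:
        raise ValueError("Regression test did not reach the target reward in 3 runs!")
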
View file

@ -9,6 +9,7 @@ atari-a2c:
- SpaceInvadersNoFrameskip-v4
run: A2C
config:
use_pytorch: false # <- switch on/off torch
rollout_fragment_length: 20
clip_rewards: True
num_workers: 5

View file

@ -1,10 +1,11 @@
cartpole-a2c-microbatch-tf:
cartpole-a2c-microbatch:
env: CartPole-v0
run: A2C
stop:
episode_reward_mean: 100
episode_reward_mean: 150
timesteps_total: 100000
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 1
gamma: 0.95

View file

@ -0,0 +1,11 @@
cartpole-a2c:
env: CartPole-v0
run: A2C
stop:
episode_reward_mean: 150
timesteps_total: 500000
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 0
lr: 0.001

View file

@ -1,10 +1,11 @@
cartpole-a3c-tf:
cartpole-a3c:
env: CartPole-v0
run: A3C
stop:
episode_reward_mean: 100
timesteps_total: 100000
episode_reward_mean: 150
timesteps_total: 200000
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 1
gamma: 0.95

View file

@ -4,9 +4,10 @@ pong-a3c:
env: PongDeterministic-v4
run: A3C
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 16
rollout_fragment_length: 20
use_pytorch: false
vf_loss_coeff: 0.5
entropy_coeff: 0.01
gamma: 0.99

View file

@ -1,11 +1,12 @@
cartpole-ars-torch:
cartpole-ars:
env: CartPole-v0
run: ARS
stop:
episode_reward_mean: 150
timesteps_total: 500000
config:
use_pytorch: true
# Works for both torch and tf.
use_pytorch: false
noise_stdev: 0.02
num_rollouts: 50
rollouts_used: 25
@ -13,5 +14,3 @@ cartpole-ars-torch:
sgd_stepsize: 0.01
noise_size: 25000000
eval_prob: 0.5
model:
fcnet_hiddens: [64, 64]

View file

@ -3,6 +3,8 @@ swimmer-ars:
env: Swimmer-v2
run: ARS
config:
# Works for both torch and tf.
use_pytorch: false
noise_stdev: 0.01
num_rollouts: 1
rollouts_used: 1

View file

@ -1,13 +0,0 @@
# To generate training data, first run:
# $ ./train.py --run=PPO --env=CartPole-v0 \
# --stop='{"timesteps_total": 50000}' \
# --config='{"use_pytorch": true, "output": "/tmp/out", "batch_mode": "complete_episodes"}'
cartpole-marwil-torch:
env: CartPole-v0
run: MARWIL
stop:
timesteps_total: 500000
config:
beta:
grid_search: [0, 1] # compare IL (beta=0) vs MARWIL
input: /tmp/out

View file

@ -0,0 +1,169 @@
"""
This script automates cleaning up a benchmark/experiment run of some algo
against some config (with possibly more than one tune trial,
e.g. torch=grid_search([True, False])).
Run `python cleanup_experiment.py --help` for more information.
Use on an input directory with trial contents e.g.:
..
IMPALA_BreakoutNoFrameskip-v4_0_use_pytorch=False_2020-05-11_10-17-54topr3h9k
IMPALA_BreakoutNoFrameskip-v4_0_use_pytorch=False_2020-05-11_13-59-35dqaetxnf
IMPALA_BreakoutNoFrameskip-v4_0_use_pytorch=False_2020-05-11_17-21-28tbhedw72
IMPALA_BreakoutNoFrameskip-v4_2_use_pytorch=True_2020-05-11_10-17-54lv20cgn_
IMPALA_BreakoutNoFrameskip-v4_2_use_pytorch=True_2020-05-11_13-59-35kwzhax_y
IMPALA_BreakoutNoFrameskip-v4_2_use_pytorch=True_2020-05-11_17-21-28a5j0s7za
Then run:
>> python cleanup_experiment.py --experiment-dir [parent dir w/ trial sub-dirs]
>> --output-dir [your out dir] --results-filter dumb_col_2,superfluous_col3
>> --results-max-size [max results file size in kb before(!) zipping]
The script will create one output sub-dir for each trial and only copy
the configuration and the csv results (filtered and every nth row removed
based on the given args).
"""
import argparse
import json
import os
import re
import shutil
import yaml
parser = argparse.ArgumentParser()
parser.add_argument(
"--experiment-dir",
type=str,
help="Experiment dir in which all sub-runs (seeds) are "
"located (as sub-dirs). Each sub0-run dir must contain the files: "
"params.json and progress.csv.")
parser.add_argument(
"--output-dir",
type=str,
help="The output dir, in which the cleaned up output will be placed.")
parser.add_argument(
"--results-filter",
type=str,
help="comma-separated list of csv fields to exclude.",
default="experiment_id,pid,hostname,node_ip,trial_id,hist_stats/episode_"
"reward,hist_stats/episode_lengths,experiment_tag")
parser.add_argument(
"--results-max-size",
type=int,
help="the max. size of the final results.csv file (in kb). Will erase "
"every nth line in the original input to reach that goal. "
"Use 0 for no limit (default=100).",
default=100)
def process_single_run(in_dir, out_dir):
exp_dir = os.listdir(in_dir)
# Make sure trials dir is ok.
assert "params.json" in exp_dir and "progress.csv" in exp_dir, \
"params.json or progress.csv not found in {}!".format(in_dir)
os.makedirs(out_dir, exist_ok=True)
for file in exp_dir:
absfile = os.path.join(in_dir, file)
# Config file -> Convert to yaml and move to output dir.
if file == "params.json":
assert os.path.isfile(absfile), "{} not a file!".format(file)
with open(absfile) as fp:
contents = json.load(fp)
with open(os.path.join(out_dir, "config.yaml"), "w") as fp:
yaml.dump(contents, fp)
# Progress csv file -> Filter out some columns, cut, and write to
# output_dir.
elif file == "progress.csv":
assert os.path.isfile(absfile), "{} not a file!".format(file)
col_idx_to_filter = []
with open(absfile) as fp:
# Get column names.
col_names_orig = fp.readline().strip().split(",")
# Split by comma (abiding to quotes), filter out
# unwanted columns, then write to disk.
cols_to_filter = args.results_filter.split(",")
for i, c in enumerate(col_names_orig):
if c in cols_to_filter:
col_idx_to_filter.insert(0, i)
col_names = col_names_orig.copy()
for idx in col_idx_to_filter:
col_names.pop(idx)
absfile_out = os.path.join(out_dir, "progress.csv")
with open(absfile_out, "w") as out_fp:
print(",".join(col_names), file=out_fp)
while True:
line = fp.readline().strip()
if not line:
break
line = re.sub(
"(,{2,})",
lambda m: ",None" * (len(m.group()) - 1) + ",",
line)
cols = re.findall('".+?"|[^,]+', line)
if len(cols) != len(col_names_orig):
continue
for idx in col_idx_to_filter:
cols.pop(idx)
print(",".join(cols), file=out_fp)
# Reduce the size of the output file if necessary.
out_size = os.path.getsize(absfile_out)
max_size = args.results_max_size * 1024
if 0 < max_size < out_size:
# Figure out roughly every which line we have to drop.
ratio = out_size / max_size
# If ratio > 2.0, we'll have to keep only every nth line.
if ratio > 2.0:
nth = out_size // max_size
os.system("awk 'NR==1||NR%{}==0' {} > {}.new".format(
nth, absfile_out, absfile_out))
# If ratio < 2.0 (>1.0), we'll have to drop every nth line.
else:
nth = out_size // (out_size - max_size)
os.system("awk 'NR==1||NR%{}!=0' {} > {}.new".format(
nth, absfile_out, absfile_out))
os.remove(absfile_out)
os.rename(absfile_out + ".new", absfile_out)
# Zip progress.csv into results.zip.
zip_file = os.path.join(out_dir, "results.zip")
try:
os.remove(zip_file)
except FileNotFoundError:
pass
os.system("zip -j {} {}".format(
zip_file, os.path.join(out_dir, "progress.csv")))
os.remove(os.path.join(out_dir, "progress.csv"))
# TBX events file -> Move as is.
elif re.search("^(events\\.out\\.|params\\.pkl)", file):
assert os.path.isfile(absfile), "{} not a file!".format(file)
shutil.copyfile(absfile, os.path.join(out_dir, file))
if __name__ == "__main__":
args = parser.parse_args()
exp_dir = os.listdir(args.experiment_dir)
# Loop through all sub-directories.
for i, sub_run in enumerate(sorted(exp_dir)):
abspath = os.path.join(args.experiment_dir, sub_run)
# This is a seed run.
if os.path.isdir(abspath) and \
re.search("^(\\w+?)_(\\w+?-v\\d+)(_\\d+)", sub_run):
# Create meaningful output dir name:
# [algo]_[env]_[trial #]_[trial-config]_[date YYYY-MM-DD].
cleaned_up_out = re.sub(
"^(\\w+?)_(\\w+?-v\\d+)(_\\d+)(_.+)?(_\\d{4}-\\d{2}-\\d{2})"
"_\\d{2}-\\d{2}-\\w+", "{:02}_\\1_\\2\\4\\5".format(i),
sub_run)
            # Remove superfluous `env=` specifier (env is always included in the name).
cleaned_up_out = re.sub("^(.+)env=\\w+?-v\\d+,?(.+)", "\\1\\2",
cleaned_up_out)
out_path = os.path.join(args.output_dir, cleaned_up_out)
process_single_run(abspath, out_path)
# Done.
print("done")

View file

@ -0,0 +1,5 @@
# TODO(sven):
# Add a simple script that takes n csv input files and generates plot(s)
# from these with: x-axis=ts OR wall-time; y-axis=any metric(s) (up to 2).
# ability to merge any m csv files (e.g. tf vs torch; or n seeds) together
# in one plot.
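
The file above is only a TODO placeholder. Purely as an illustration of what it describes (not part of this commit; all flags and column names below are assumptions), a minimal version of such a plotting helper could look like:

    # Hypothetical sketch of the CSV plotting helper described in the TODO above.
    import argparse
    import matplotlib.pyplot as plt
    import pandas as pd

    parser = argparse.ArgumentParser()
    parser.add_argument("--input-files", nargs="+",
                        help="progress.csv files to merge into one plot.")
    parser.add_argument("--x", default="timesteps_total",
                        help="x-axis column (timesteps or wall-time).")
    parser.add_argument("--y", nargs="+", default=["episode_reward_mean"],
                        help="metric column(s) to plot (up to 2).")

    if __name__ == "__main__":
        args = parser.parse_args()
        # One curve per (input file, metric) combination, all in a single figure.
        for csv_file in args.input_files:
            df = pd.read_csv(csv_file)
            for metric in args.y:
                plt.plot(df[args.x], df[metric],
                         label="{}: {}".format(csv_file, metric))
        plt.xlabel(args.x)
        plt.legend()
        plt.show()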

View file

@ -6,6 +6,7 @@ halfcheetah-ddpg:
episode_reward_mean: 2000
time_total_s: 5400 # 90 minutes
config:
use_pytorch: false # <- switch on/off torch
# === Model ===
actor_hiddens: [64, 64]
critic_hiddens: [64, 64]

View file

@ -9,6 +9,8 @@ invertedpendulum-td3:
time_total_s: 900 # 15 minutes
timesteps_total: 1000000
config:
# Works for both torch and tf.
use_pytorch: false
# === Model ===
actor_hiddens: [32, 32]
critic_hiddens: [32, 32]

View file

@ -5,6 +5,8 @@ mountaincarcontinuous-apex-ddpg:
stop:
episode_reward_mean: 90
config:
# Works for both torch and tf.
use_pytorch: false
clip_rewards: False
num_workers: 16
exploration_config:

View file

@ -6,6 +6,8 @@ mountaincarcontinuous-ddpg:
episode_reward_mean: 90
time_total_s: 600 # 10 minutes
config:
# Works for both torch and tf.
use_pytorch: false
# === Model ===
actor_hiddens: [32, 64]
critic_hiddens: [64, 64]

View file

@ -15,6 +15,8 @@ mujoco-td3:
stop:
timesteps_total: 1000000
config:
# Works for both torch and tf.
use_pytorch: false
# === Exploration ===
learning_starts: 10000
exploration_config:

View file

@ -5,6 +5,8 @@ pendulum-apex-ddpg:
stop:
episode_reward_mean: -160
config:
# Works for both torch and tf.
use_pytorch: false
use_huber: True
clip_rewards: False
num_workers: 16

View file

@ -1,11 +1,13 @@
# This configuration can expect to reach -160 reward in 10k-20k timesteps
# This configuration can expect to reach -160 reward in 10k-20k timesteps.
pendulum-ddpg:
env: Pendulum-v0
run: DDPG
stop:
episode_reward_mean: -160
timesteps_total: 100000
episode_reward_mean: -900
timesteps_total: 20000
config:
# Works for both torch and tf.
use_pytorch: false
# === Model ===
actor_hiddens: [64, 64]
critic_hiddens: [64, 64]
@ -18,7 +20,7 @@ pendulum-ddpg:
exploration_config:
type: "OrnsteinUhlenbeckNoise"
scale_timesteps: 10000
initial_scale: 1.0,
initial_scale: 1.0
final_scale: 0.02
ou_base_scale: 0.1
ou_theta: 0.15

View file

@ -1,20 +1,20 @@
# This configuration can expect to reach -160 reward in 10k-20k timesteps
pendulum-ddpg:
pendulum-td3:
env: Pendulum-v0
run: TD3
stop:
episode_reward_mean: -130
time_total_s: 900 # 10 minutes
episode_reward_mean: -900
timesteps_total: 100000
config:
# Works for both torch and tf.
use_pytorch: false
# === Model ===
actor_hiddens: [64, 64]
critic_hiddens: [64, 64]
# === Exploration ===
learning_starts: 5000
exploration_config:
random_timesteps: 5000
# === Evaluation ===
evaluation_interval: 1
evaluation_num_episodes: 5

View file

@ -8,6 +8,7 @@ apex:
- SpaceInvadersNoFrameskip-v4
run: APEX
config:
use_pytorch: false # <- switch on/off torch
double_q: false
dueling: false
num_atoms: 1

View file

@ -9,6 +9,7 @@ atari-basic-dqn:
- SpaceInvadersNoFrameskip-v4
run: DQN
config:
use_pytorch: false # <- switch on/off torch
double_q: false
dueling: false
num_atoms: 1

View file

@ -9,6 +9,7 @@ dueling-ddqn:
- SpaceInvadersNoFrameskip-v4
run: DQN
config:
use_pytorch: false # <- switch on/off torch
double_q: true
dueling: true
num_atoms: 1

View file

@ -1,10 +1,11 @@
cartpole-dqn-tf-w-param-noise:
cartpole-dqn-w-param-noise:
env: CartPole-v0
run: DQN
stop:
episode_reward_mean: 150
timesteps_total: 300000
config:
# Works for both torch and tf.
use_pytorch: false
exploration_config:
type: ParameterNoise

View file

@ -1,10 +1,11 @@
cartpole-dqn-tf:
cartpole-dqn:
env: CartPole-v0
run: DQN
stop:
episode_reward_mean: 150
timesteps_total: 50000
config:
# Works for both torch and tf.
use_pytorch: false
n_step: 3
gamma: 0.95

View file

@ -1,8 +1,9 @@
cartpole-dqn-tf:
cartpole-dqn:
env: CartPole-v0
run: SimpleQ
stop:
episode_reward_mean: 150
timesteps_total: 50000
config:
# Works for both torch and tf.
use_pytorch: false

View file

@ -6,6 +6,7 @@ pong-apex:
env: PongNoFrameskip-v4
run: APEX
config:
use_pytorch: false
target_network_update_freq: 20000
num_workers: 4
num_envs_per_worker: 8

View file

@ -6,6 +6,8 @@ pong-deterministic-dqn:
episode_reward_mean: 20
time_total_s: 7200
config:
# Works for both torch and tf.
use_pytorch: false
num_gpus: 1
gamma: 0.99
lr: .0001

View file

@ -1,10 +1,11 @@
cartpole-es-tf:
cartpole-es:
env: CartPole-v0
run: ES
stop:
episode_reward_mean: 150
timesteps_total: 500000
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 2
noise_size: 25000000

View file

@ -0,0 +1,9 @@
humanoid-v2-es:
env: Humanoid-v2
run: ES
stop:
episode_reward_mean: 6000
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 100

View file

@ -1,7 +0,0 @@
humanoid-es:
env: Humanoid-v1
run: ES
stop:
episode_reward_mean: 6000
config:
num_workers: 100

View file

@ -5,5 +5,6 @@ cartpole-impala-tf:
episode_reward_mean: 150
timesteps_total: 500000
config:
# Works for both torch and tf.
use_pytorch: false
num_gpus: 0

View file

@ -0,0 +1,6 @@
pendulum-impala-tf:
env: Pendulum-v0
run: IMPALA
stop:
episode_reward_mean: -700
timesteps_total: 500000

View file

@ -2,12 +2,13 @@
# $ ./train.py --run=PPO --env=CartPole-v0 \
# --stop='{"timesteps_total": 50000}' \
# --config='{"output": "/tmp/out", "batch_mode": "complete_episodes"}'
cartpole-marwil-tf:
cartpole-marwil:
env: CartPole-v0
run: MARWIL
stop:
timesteps_total: 500000
config:
use_pytorch: false # <- switch on/off torch
beta:
grid_search: [0, 1] # compare IL (beta=0) vs MARWIL
input: /tmp/out

View file

@ -1,21 +0,0 @@
pendulum-appo-vtrace-torch:
env: Pendulum-v0
run: APPO
stop:
episode_reward_mean: -1000 # just check it learns a bit
timesteps_total: 500000
config:
use_pytorch: true
vtrace: true
num_gpus: 0
num_workers: 1
lambda: 0.1
gamma: 0.95
lr: 0.0003
train_batch_size: 100
minibatch_buffer_size: 16
num_sgd_iter: 10
model:
fcnet_hiddens: [256, 256]
batch_mode: truncate_episodes
observation_filter: MeanStdFilter

View file

@ -1,18 +0,0 @@
# can expect improvement to -140 reward in ~300-500k timesteps
pendulum-ppo:
env: Pendulum-v0
run: PPO
config:
train_batch_size: 2048
vf_clip_param: 10.0
num_workers: 0
num_envs_per_worker: 10
lambda: 0.1
gamma: 0.95
lr: 0.0003
sgd_minibatch_size: 64
num_sgd_iter: 10
model:
fcnet_hiddens: [64, 64]
batch_mode: complete_episodes
observation_filter: MeanStdFilter

View file

@ -1,9 +1,10 @@
cartpole-pg-torch:
cartpole-pg:
env: CartPole-v0
run: PG
stop:
episode_reward_mean: 150
timesteps_total: 100000
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 0
use_pytorch: true

View file

@ -1,21 +0,0 @@
pong-a3c-pytorch-cnn:
env: PongDeterministic-v4
run: A3C
config:
num_workers: 16
rollout_fragment_length: 20
use_pytorch: true
vf_loss_coeff: 0.5
entropy_coeff: 0.01
gamma: 0.99
grad_clip: 40.0
lambda: 1.0
lr: 0.0001
observation_filter: NoFilter
model:
use_lstm: false
dim: 84
grayscale: true
zero_mean: false
optimizer:
grads_per_step: 1000

View file

@ -7,6 +7,7 @@ atari-ddppo:
- BreakoutNoFrameskip-v4
run: DDPPO
config:
use_pytorch: true # DDPPO only supports PyTorch so far
# Worker config: 10 workers, each of which requires a GPU.
num_workers: 10
num_gpus_per_worker: 1

View file

@ -9,6 +9,7 @@ atari-ppo:
- SpaceInvadersNoFrameskip-v4
run: PPO
config:
use_pytorch: false # <- switch on/off torch
lambda: 0.95
kl_coeff: 0.5
clip_rewards: True

View file

@ -1,10 +1,11 @@
cartpole-appo-vtrace-tf:
cartpole-appo-vtrace:
env: CartPole-v0
run: APPO
stop:
episode_reward_mean: 150
timesteps_total: 200000
config:
# Works for both torch and tf.
use_pytorch: false
rollout_fragment_length: 10
train_batch_size: 10

View file

@ -1,10 +1,11 @@
cartpole-appo-tf:
cartpole-appo:
env: CartPole-v0
run: APPO
stop:
episode_reward_mean: 150
timesteps_total: 200000
config:
# Works for both torch and tf.
use_pytorch: false
rollout_fragment_length: 10
train_batch_size: 10

View file

@ -1,9 +1,8 @@
cartpole-ddppo-torch:
cartpole-ddppo:
env: CartPole-v0
run: DDPPO
stop:
episode_reward_mean: 100
episode_reward_mean: 150
timesteps_total: 100000
config:
use_pytorch: true
num_gpus_per_worker: 0

View file

@ -5,6 +5,7 @@ cartpole-ppo:
episode_reward_mean: 200
time_total_s: 180
config:
use_pytorch: false # <- switch on/off torch
num_workers: 2
num_sgd_iter:
grid_search: [1, 4]

View file

@ -6,6 +6,8 @@ cartpole-ppo:
episode_reward_mean: 200
time_total_s: 180
config:
# Works for both torch and tf.
use_pytorch: false
num_workers: 1
num_sgd_iter:
grid_search: [1, 4]

View file

@ -1,11 +1,12 @@
cartpole-ppo-torch:
cartpole-ppo:
env: CartPole-v0
run: PPO
stop:
episode_reward_mean: 150
timesteps_total: 100000
config:
use_pytorch: true
# Works for both torch and tf.
use_pytorch: false
gamma: 0.99
lr: 0.0003
num_workers: 1

View file

@ -1,11 +1,12 @@
# This can reach 9k reward in 2 hours on a Titan XP GPU
# This can reach 9k reward in 2 hours on a Titan XP GPU
# with 16 workers and 8 envs per worker.
halfcheetah-appo:
env: HalfCheetah-v2
run: APPO
stop:
time_total_s: 10800
time_total_s: 10800
config:
use_pytorch: false # <- switch on/off torch
vtrace: True
gamma: 0.99
lambda: 0.95
@ -30,6 +31,6 @@ halfcheetah-appo:
batch_mode: truncate_episodes
use_kl_loss: True
kl_coeff: 1.0
kl_target: 0.04
kl_target: 0.04
observation_filter: MeanStdFilter

View file

@ -1,23 +1,24 @@
halfcheetah-ppo:
env: HalfCheetah-v2
run: PPO
stop:
episode_reward_mean: 9800
time_total_s: 10800
config:
gamma: 0.99
lambda: 0.95
kl_coeff: 1.0
num_sgd_iter: 32
lr: .0003
vf_loss_coeff: 0.5
clip_param: 0.2
sgd_minibatch_size: 4096
train_batch_size: 65536
num_workers: 16
num_gpus: 1
grad_clip: 0.5
num_envs_per_worker:
grid_search: [16, 32]
batch_mode: truncate_episodes
observation_filter: MeanStdFilter
halfcheetah-ppo:
env: HalfCheetah-v2
run: PPO
stop:
episode_reward_mean: 9800
time_total_s: 10800
config:
use_pytorch: false # <- switch on/off torch
gamma: 0.99
lambda: 0.95
kl_coeff: 1.0
num_sgd_iter: 32
lr: .0003
vf_loss_coeff: 0.5
clip_param: 0.2
sgd_minibatch_size: 4096
train_batch_size: 65536
num_workers: 16
num_gpus: 1
grad_clip: 0.5
num_envs_per_worker:
grid_search: [16, 32]
batch_mode: truncate_episodes
observation_filter: MeanStdFilter

View file

@ -2,6 +2,8 @@ hopper-ppo:
env: Hopper-v1
run: PPO
config:
# Works for both torch and tf.
use_pytorch: false
gamma: 0.995
kl_coeff: 1.0
num_sgd_iter: 20

View file

@ -4,6 +4,8 @@ humanoid-ppo-gae:
stop:
episode_reward_mean: 6000
config:
# Works for both torch and tf.
use_pytorch: false
gamma: 0.995
lambda: 0.95
clip_param: 0.2

View file

@ -4,6 +4,8 @@ humanoid-ppo:
stop:
episode_reward_mean: 6000
config:
# Works for both torch and tf.
use_pytorch: false
gamma: 0.995
kl_coeff: 1.0
num_sgd_iter: 20

View file

@ -1,10 +1,11 @@
pendulum-appo-vtrace-tf:
pendulum-appo-vtrace:
env: Pendulum-v0
run: APPO
stop:
episode_reward_mean: -1000 # just check it learns a bit
timesteps_total: 500000
config:
# Works for both torch and tf.
use_pytorch: false
vtrace: true
num_gpus: 0

View file

@ -1,10 +1,12 @@
pendulum-ppo-tf:
# Can expect improvement to -140 reward in ~300-500k timesteps.
pendulum-ppo:
env: Pendulum-v0
run: PPO
stop:
episode_reward_mean: -500
timesteps_total: 400000
config:
# Works for both torch and tf.
use_pytorch: false
train_batch_size: 2048
vf_clip_param: 10.0

View file

@ -1,29 +1,31 @@
# This can reach 18-19 reward in ~5-7 minutes on a Titan XP GPU
# with 32 workers and 8 envs per worker. IMPALA, when run with
# similar configurations, solved Pong in 10-12 minutes.
# APPO can also solve Pong in 2.5 million timesteps, which is
# 2x more efficient than that of IMPALA.
pong-appo:
env: PongNoFrameskip-v4
run: APPO
stop:
episode_reward_mean: 18.0
timesteps_total: 5000000
config:
vtrace: True
use_kl_loss: False
rollout_fragment_length: 50
train_batch_size: 750
num_workers: 32
broadcast_interval: 1
max_sample_requests_in_flight_per_worker: 1
num_data_loader_buffers: 1
num_envs_per_worker: 8
minibatch_buffer_size: 4
num_sgd_iter: 2
vf_loss_coeff: 1.0
clip_param: 0.3
num_gpus: 1
grad_clip: 10
model:
dim: 42
# This can reach 18-19 reward in ~5-7 minutes on a Titan XP GPU
# with 32 workers and 8 envs per worker. IMPALA, when run with
# similar configurations, solved Pong in 10-12 minutes.
# APPO can also solve Pong in 2.5 million timesteps, which is
# 2x more efficient than that of IMPALA.
pong-appo:
env: PongNoFrameskip-v4
run: APPO
stop:
episode_reward_mean: 18.0
timesteps_total: 5000000
config:
# Works for both torch and tf.
use_pytorch: false
vtrace: True
use_kl_loss: False
rollout_fragment_length: 50
train_batch_size: 750
num_workers: 32
broadcast_interval: 1
max_sample_requests_in_flight_per_worker: 1
num_data_loader_buffers: 1
num_envs_per_worker: 8
minibatch_buffer_size: 4
num_sgd_iter: 2
vf_loss_coeff: 1.0
clip_param: 0.3
num_gpus: 1
grad_clip: 10
model:
dim: 42

View file

@ -1,11 +1,13 @@
# On a single GPU, this achieves maximum reward in ~15-20 minutes.
#
# $ python train.py -f tuned_examples/pong-ppo.yaml
# $ python train.py -f tuned_configs/pong-ppo.yaml
#
pong-ppo:
env: PongNoFrameskip-v4
run: PPO
config:
# Works for both torch and tf.
use_pytorch: false
lambda: 0.95
kl_coeff: 0.5
clip_rewards: True

View file

@ -2,6 +2,8 @@ walker2d-v1-ppo:
env: Walker2d-v1
run: PPO
config:
# Works for both torch and tf.
use_pytorch: false
kl_coeff: 1.0
num_sgd_iter: 20
lr: .0001

View file

@ -1,9 +0,0 @@
cartpole-a2c-torch:
env: CartPole-v0
run: A2C
stop:
episode_reward_mean: 100
timesteps_total: 100000
config:
num_workers: 0
use_pytorch: true

View file

@ -1,14 +0,0 @@
cartpole-appo-torch:
env: CartPole-v0
run: APPO
stop:
episode_reward_mean: 150
timesteps_total: 200000
config:
use_pytorch: true
rollout_fragment_length: 10
train_batch_size: 10
num_envs_per_worker: 5
num_workers: 1
num_gpus: 0
vtrace: false

View file

@ -1,14 +0,0 @@
cartpole-appo-vtrace-torch:
env: CartPole-v0
run: APPO
stop:
episode_reward_mean: 150
timesteps_total: 200000
config:
use_pytorch: true
rollout_fragment_length: 10
train_batch_size: 10
num_envs_per_worker: 5
num_workers: 1
num_gpus: 0
vtrace: true

View file

@ -1,17 +0,0 @@
cartpole-ars-tf:
env: CartPole-v0
run: ARS
stop:
episode_reward_mean: 50
timesteps_total: 500000
config:
use_pytorch: false
noise_stdev: 0.02
num_rollouts: 50
rollouts_used: 25
num_workers: 2
sgd_stepsize: 0.01
noise_size: 25000000
eval_prob: 0.5
model:
fcnet_hiddens: [] # a linear policy

View file

@ -1,18 +0,0 @@
cartpole-dqn-torch-w-param-noise:
env: CartPole-v0
run: DQN
stop:
episode_reward_mean: 150
timesteps_total: 300000
config:
use_pytorch: true
exploration_config:
type: ParameterNoise
random_timesteps: 10000
initial_stddev: 1.0
batch_mode: complete_episodes
lr: 0.0008
num_workers: 0
model:
fcnet_hiddens: [32, 32]
fcnet_activation: tanh

View file

@ -1,10 +0,0 @@
cartpole-dqn-torch:
env: CartPole-v0
run: DQN
stop:
episode_reward_mean: 150
timesteps_total: 50000
config:
use_pytorch: true
n_step: 3
gamma: 0.95

View file

@ -1,11 +0,0 @@
cartpole-es-torch:
env: CartPole-v0
run: ES
stop:
episode_reward_mean: 150
timesteps_total: 500000
config:
use_pytorch: true
num_workers: 2
noise_size: 25000000
episodes_per_batch: 50

View file

@ -1,9 +0,0 @@
cartpole-impala-torch:
env: CartPole-v0
run: IMPALA
stop:
episode_reward_mean: 150
timesteps_total: 500000
config:
use_pytorch: true
num_gpus: 0

View file

@ -1,8 +0,0 @@
cartpole-pg-tf:
env: CartPole-v0
run: PG
stop:
episode_reward_mean: 100
timesteps_total: 100000
config:
num_workers: 0

View file

@ -1,17 +0,0 @@
cartpole-ppo-tf:
env: CartPole-v0
run: PPO
stop:
episode_reward_mean: 150
timesteps_total: 100000
config:
gamma: 0.99
lr: 0.0003
num_workers: 1
observation_filter: MeanStdFilter
num_sgd_iter: 6
vf_share_layers: true
vf_loss_coeff: 0.01
model:
fcnet_hiddens: [32]
fcnet_activation: linear

View file

@ -1,17 +0,0 @@
cartpole-sac-torch:
env: CartPole-v0
run: SAC
stop:
episode_reward_mean: 150
timesteps_total: 50000
config:
use_pytorch: true
gamma: 0.95
no_done_at_end: false
target_network_update_freq: 32
tau: 1.0
train_batch_size: 32
optimization:
actor_learning_rate: 0.005
critic_learning_rate: 0.005
entropy_learning_rate: 0.0001

View file

@ -1,8 +0,0 @@
cartpole-dqn-torch:
env: CartPole-v0
run: SimpleQ
stop:
episode_reward_mean: 150
timesteps_total: 50000
config:
use_pytorch: true

View file

@ -1,10 +0,0 @@
pendulum-ddpg-tf:
env: Pendulum-v0
run: DDPG
stop:
episode_reward_mean: -700
timesteps_total: 100000
config:
use_pytorch: false
use_huber: true
clip_rewards: false

View file

@ -1,10 +0,0 @@
pendulum-ddpg-torch:
env: Pendulum-v0
run: DDPG
stop:
episode_reward_mean: -700
timesteps_total: 100000
config:
use_pytorch: true
use_huber: true
clip_rewards: false

View file

@ -1,21 +0,0 @@
pendulum-ppo-torch:
env: Pendulum-v0
run: PPO
stop:
episode_reward_mean: -500
timesteps_total: 400000
config:
use_pytorch: true
train_batch_size: 2048
vf_clip_param: 10.0
num_workers: 0
num_envs_per_worker: 10
lambda: 0.1
gamma: 0.95
lr: 0.0003
sgd_minibatch_size: 64
num_sgd_iter: 10
model:
fcnet_hiddens: [64, 64]
batch_mode: complete_episodes
observation_filter: MeanStdFilter

View file

@ -1,13 +0,0 @@
pendulum-sac-tf:
env: Pendulum-v0
run: SAC
stop:
episode_reward_mean: -300 # note that evaluation perf is higher
timesteps_total: 10000
config:
use_pytorch: false
soft_horizon: true
clip_actions: false
normalize_actions: true
metrics_smoothing_episodes: 5
no_done_at_end: true

View file

@ -1,13 +0,0 @@
pendulum-sac-torch:
env: Pendulum-v0
run: SAC
stop:
episode_reward_mean: -300 # note that evaluation perf is higher
timesteps_total: 10000
config:
use_pytorch: true
soft_horizon: true
clip_actions: false
normalize_actions: true
metrics_smoothing_episodes: 5
no_done_at_end: true

View file

@ -1,8 +0,0 @@
pendulum-td3-tf:
env: Pendulum-v0
run: TD3
config:
use_pytorch: false
stop:
episode_reward_mean: -900
timesteps_total: 100000

View file

@ -10,9 +10,7 @@ atari-sac-tf-and-torch:
stop:
timesteps_total: 20000000
config:
# Works for both torch and tf.
use_pytorch:
grid_search: [false, true]
use_pytorch: false # <- switch on/off torch
gamma: 0.99
# state-preprocessor=Our default Atari Conv2D-net.
use_state_preprocessor: true

View file

@ -1,10 +1,11 @@
cartpole-sac-tf:
cartpole-sac:
env: CartPole-v0
run: SAC
stop:
episode_reward_mean: 150
timesteps_total: 50000
timesteps_total: 100000
config:
# Works for both torch and tf.
use_pytorch: false
gamma: 0.95
no_done_at_end: false

View file

@ -5,6 +5,7 @@ halfcheetah_sac:
stop:
episode_reward_mean: 9000
config:
use_pytorch: false # <- switch on/off torch
horizon: 1000
soft_horizon: false
Q_model:

View file

@ -8,6 +8,7 @@ mspacman-sac-tf:
episode_reward_mean: 800
timesteps_total: 100000
config:
# Works for both torch and tf.
use_pytorch: false
gamma: 0.99
# state-preprocessor=Our default Atari Conv2D-net.

View file

@ -1,13 +1,16 @@
# Pendulum SAC can attain -150+ reward in 6-7k
# Configurations are similar to the original softlearning/sac codebase.
pendulum_sac:
pendulum-sac:
env: Pendulum-v0
run: SAC
stop:
episode_reward_mean: -150
episode_reward_mean: -300
timesteps_total: 10000
config:
# Works for both torch and tf.
use_pytorch: false
horizon: 200
soft_horizon: False
soft_horizon: true
Q_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256]
@ -16,10 +19,10 @@ pendulum_sac:
fcnet_hiddens: [256, 256]
tau: 0.005
target_entropy: auto
no_done_at_end: True
no_done_at_end: true
n_step: 1
rollout_fragment_length: 1
prioritized_replay: False
prioritized_replay: true
train_batch_size: 256
target_network_update_freq: 1
timesteps_per_iteration: 1000
@ -31,6 +34,6 @@ pendulum_sac:
num_workers: 0
num_gpus: 0
clip_actions: False
normalize_actions: True
normalize_actions: true
evaluation_interval: 1
metrics_smoothing_episodes: 5

View file

@ -43,7 +43,7 @@ class PiecewiseSchedule(Schedule):
assert idxes == sorted(idxes)
self.interpolation = interpolation
self.outside_value = outside_value
self.endpoints = endpoints
self.endpoints = [(int(e[0]), float(e[1])) for e in endpoints]
@override(Schedule)
def _value(self, t):
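
For context, the new list comprehension normalizes whatever endpoint pairs are passed in (e.g. lists parsed from a yaml config or numpy scalars) into plain `(int, float)` tuples before they are stored; a small illustration of the effect (example values assumed):

    endpoints = [[0, 1.0], [20000, 0.1]]   # e.g. as read from a tuned_examples yaml
    endpoints = [(int(t), float(v)) for t, v in endpoints]
    # -> [(0, 1.0), (20000, 0.1)]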