mirror of https://github.com/vale981/ray, synced 2025-03-04 17:41:43 -05:00
[RLlib] Benchmark and regression test yaml cleanup and restructuring. (#8414)
parent: ae2e1f0883
commit: baa053496a

89 changed files with 614 additions and 584 deletions
rllib/BUILD (366 changed lines)
|
@ -37,70 +37,120 @@
|
|||
# Tag: learning_tests
|
||||
#
|
||||
# This will test all yaml files (via `rllib train`)
|
||||
# inside rllib/tuned_examples/regression_tests for actual learning success.
|
||||
# inside rllib/tuned_examples/[algo-name] for actual learning success.
|
||||
# --------------------------------------------------------------------
|
||||
|
||||
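For orientation (not part of the BUILD file itself): each target below points the shared regression-test runner at a single tuned_examples/[algo-name] directory. A hedged sketch of the equivalent manual invocation, with the script path and flags taken from the BUILD entries that follow; this is not an official command:

import subprocess

# Hedged sketch: roughly what one "learning test" target executes, run from
# the rllib/ directory. The --torch flag is present only on the *_torch targets.
subprocess.run(
    [
        "python", "tests/run_regression_tests.py",
        "--yaml-dir=tuned_examples/ppo",
        "--torch",
    ],
    check=True,
)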
# A2C/A3C
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_pg_a3c_tf",
|
||||
name = "regression_test_a2c_cartpole_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-pg-tf.yaml",
|
||||
"tuned_examples/regression_tests/cartpole-a3c-tf.yaml",
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
data = ["tuned_examples/a3c/cartpole-a2c.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/a3c"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "regression_test_a2c_cartpole_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_cartpole"],
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = ["tuned_examples/a3c/cartpole-a2c.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/a3c", "--torch"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "regression_test_a3c_cartpole_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_cartpole"],
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = ["tuned_examples/a3c/cartpole-a3c.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/a3c"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "regression_test_a3c_cartpole_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_cartpole"],
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = ["tuned_examples/a3c/cartpole-a3c.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/a3c", "--torch"]
|
||||
)
|
||||
|
||||
# APPO
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_appo_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-appo-tf.yaml",
|
||||
"tuned_examples/ppo/cartpole-appo.yaml",
|
||||
"tuned_examples/ppo/cartpole-appo-vtrace.yaml"
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
args = ["--yaml-dir=tuned_examples/ppo"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_appo_vtrace_tf",
|
||||
name = "run_regression_tests_cartpole_appo_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
tags = ["learning_tests_torch", "learning_tests_cartpole"],
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-appo-vtrace-tf.yaml",
|
||||
"tuned_examples/ppo/cartpole-appo.yaml",
|
||||
"tuned_examples/ppo/cartpole-appo-vtrace.yaml"
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_es_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-es-tf.yaml",
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
args = ["--yaml-dir=tuned_examples/ppo", "--torch"]
|
||||
)
|
||||
|
||||
# ARS
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_ars_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-ars-tf.yaml",
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
data = ["tuned_examples/ars/cartpole-ars.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/ars"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_ars_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_cartpole"],
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = ["tuned_examples/ars/cartpole-ars.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/ars", "--torch"]
|
||||
)
|
||||
|
||||
# DDPG
|
||||
py_test(
|
||||
name = "run_regression_tests_pendulum_ddpg_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_pendulum"],
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = glob(["tuned_examples/ddpg/pendulum-ddpg.yaml"]),
|
||||
args = ["--yaml-dir=tuned_examples/ddpg"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_pendulum_ddpg_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_pendulum"],
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = glob(["tuned_examples/ddpg/pendulum-ddpg.yaml"]),
|
||||
args = ["--torch", "--yaml-dir=tuned_examples/ddpg"]
|
||||
)
|
||||
|
||||
# DQN/Simple-Q
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_dqn_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
|
@ -108,95 +158,11 @@ py_test(
|
|||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-simpleq-tf.yaml",
|
||||
"tuned_examples/regression_tests/cartpole-dqn-tf.yaml",
|
||||
"tuned_examples/regression_tests/cartpole-dqn-param-noise-tf.yaml",
|
||||
"tuned_examples/dqn/cartpole-simpleq.yaml",
|
||||
"tuned_examples/dqn/cartpole-dqn.yaml",
|
||||
"tuned_examples/dqn/cartpole-dqn-param-noise.yaml",
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_impala_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-impala-tf.yaml",
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_sac_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-sac-tf.yaml",
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_ppo_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-ppo-tf.yaml",
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_a2c_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-a2c-torch.yaml"
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_appo_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-appo-torch.yaml"
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_appo_vtrace_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-appo-vtrace-torch.yaml"
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_ars_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-ars-torch.yaml"
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
args = ["--yaml-dir=tuned_examples/dqn"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
|
@ -206,91 +172,177 @@ py_test(
|
|||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-dqn-param-noise-torch.yaml"
|
||||
"tuned_examples/dqn/cartpole-simpleq.yaml",
|
||||
"tuned_examples/dqn/cartpole-dqn.yaml",
|
||||
"tuned_examples/dqn/cartpole-dqn-param-noise.yaml",
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
args = ["--yaml-dir=tuned_examples/dqn", "--torch"]
|
||||
)
|
||||
|
||||
# ES
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_es_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_cartpole"],
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = ["tuned_examples/es/cartpole-es.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/es"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_es_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-es-torch.yaml"
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
data = ["tuned_examples/es/cartpole-es.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/es", "--torch"]
|
||||
)
|
||||
|
||||
# IMPALA
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_impala_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_cartpole"],
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = ["tuned_examples/impala/cartpole-impala.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/impala"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_impala_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-impala-torch.yaml"
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
data = ["tuned_examples/impala/cartpole-impala.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/impala", "--torch"]
|
||||
)
|
||||
|
||||
# PG
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_pg_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_cartpole"],
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = ["tuned_examples/pg/cartpole-pg.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/pg"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_pg_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-pg-torch.yaml"
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
data = ["tuned_examples/pg/cartpole-pg.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/pg", "--torch"]
|
||||
)
|
||||
|
||||
# PPO
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_ppo_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_cartpole"],
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = ["tuned_examples/ppo/cartpole-ppo.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/ppo"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_ppo_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_cartpole"],
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = ["tuned_examples/ppo/cartpole-ppo.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/ppo", "--torch"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_pendulum_ppo_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_pendulum"],
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-ppo-torch.yaml"
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
data = ["tuned_examples/ppo/pendulum-ppo.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/ppo"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_pendulum_ppo_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_pendulum"],
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = ["tuned_examples/ppo/pendulum-ppo.yaml"],
|
||||
args = ["--torch", "--yaml-dir=tuned_examples/ppo"]
|
||||
)
|
||||
|
||||
# SAC
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_sac_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_cartpole"],
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = ["tuned_examples/sac/cartpole-sac.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/sac"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_cartpole_sac_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_cartpole"],
|
||||
size = "large",
|
||||
size = "medium",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = [
|
||||
"tuned_examples/regression_tests/cartpole-sac-torch.yaml"
|
||||
],
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
data = ["tuned_examples/sac/cartpole-sac.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/sac", "--torch"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_pendulum_tf",
|
||||
name = "run_regression_tests_pendulum_sac_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_pendulum"],
|
||||
size = "enormous", # = 60min timeout
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = glob(["tuned_examples/regression_tests/pendulum-*-tf.yaml"]),
|
||||
# Pass `BAZEL` option and the path to look for yaml regression files.
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
data = ["tuned_examples/sac/pendulum-sac.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/sac"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_pendulum_torch",
|
||||
name = "run_regression_tests_pendulum_sac_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_pendulum"],
|
||||
size = "enormous", # = 60min timeout
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = glob(["tuned_examples/regression_tests/pendulum-*-torch.yaml"]),
|
||||
# Pass `BAZEL` option and the path to look for yaml regression files.
|
||||
args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
data = ["tuned_examples/sac/pendulum-sac.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/sac", "--torch"]
|
||||
)
|
||||
|
||||
# TD3
|
||||
py_test(
|
||||
name = "run_regression_tests_pendulum_td3_tf",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_tf", "learning_tests_pendulum"],
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = ["tuned_examples/ddpg/pendulum-td3.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/ddpg"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "run_regression_tests_pendulum_td3_torch",
|
||||
main = "tests/run_regression_tests.py",
|
||||
tags = ["learning_tests_torch", "learning_tests_pendulum"],
|
||||
size = "large",
|
||||
srcs = ["tests/run_regression_tests.py"],
|
||||
data = ["tuned_examples/ddpg/pendulum-td3.yaml"],
|
||||
args = ["--yaml-dir=tuned_examples/ddpg", "--torch"]
|
||||
)
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
|
|
|
@ -200,7 +200,8 @@ def build_ddpg_stats(policy, batch):
|
|||
"mean_q": torch.mean(policy.q_t),
|
||||
"max_q": torch.max(policy.q_t),
|
||||
"min_q": torch.min(policy.q_t),
|
||||
"td_error": policy.td_error
|
||||
"mean_td_error": torch.mean(policy.td_error),
|
||||
"td_error": policy.td_error,
|
||||
}
|
||||
return stats
|
||||
|
||||
|
|
|
@ -23,7 +23,9 @@ class TestDDPG(unittest.TestCase):
|
|||
"""Test whether a DDPGTrainer can be built with both frameworks."""
|
||||
config = ddpg.DEFAULT_CONFIG.copy()
|
||||
config["num_workers"] = 0 # Run locally.
|
||||
config["num_envs_per_worker"] = 2 # Run locally.
|
||||
config["num_envs_per_worker"] = 2
|
||||
config["learning_starts"] = 0
|
||||
config["exploration_config"]["random_timesteps"] = 100
|
||||
|
||||
num_iterations = 2
|
||||
|
||||
|
|
|
@ -9,13 +9,15 @@
|
|||
# name = "run_regression_tests",
|
||||
# main = "tests/run_regression_tests.py",
|
||||
# tags = ["learning_tests"],
|
||||
# size = "enormous", # = 60min timeout
|
||||
# size = "medium", # 5min timeout
|
||||
# srcs = ["tests/run_regression_tests.py"],
|
||||
# data = glob(["tuned_examples/regression_tests/*.yaml"]),
|
||||
# Pass `BAZEL` option and the path to look for yaml regression files.
|
||||
# # Pass `BAZEL` option and the path to look for yaml regression files.
|
||||
# args = ["BAZEL", "tuned_examples/regression_tests"]
|
||||
# )
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import yaml
|
||||
|
@ -24,30 +26,51 @@ import ray
|
|||
from ray.tune import run_experiments
|
||||
from ray.rllib import _register_all
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Bazel regression test mode: Get path to look for yaml files from argv[2].
|
||||
if sys.argv[1] == "BAZEL":
|
||||
# Get the path to use.
|
||||
rllib_dir = Path(__file__).parent.parent
|
||||
print("rllib dir={}".format(rllib_dir))
|
||||
yaml_files = rllib_dir.rglob(sys.argv[2] + "/*.yaml")
|
||||
yaml_files = sorted(
|
||||
map(lambda path: str(path.absolute()), yaml_files), reverse=True)
|
||||
# Normal mode: Get yaml files to run from command line.
|
||||
else:
|
||||
yaml_files = sys.argv[1:]
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--torch",
|
||||
action="store_true",
|
||||
help="Runs all tests with PyTorch enabled.")
|
||||
parser.add_argument(
|
||||
"--yaml-dir",
|
||||
type=str,
|
||||
help="The directory in which to find all yamls to test.")
|
||||
|
||||
print("Will run the following regression files:")
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
# Bazel regression test mode: Get path to look for yaml files from argv[2].
|
||||
# Get the path or single file to use.
|
||||
rllib_dir = Path(__file__).parent.parent
|
||||
print("rllib dir={}".format(rllib_dir))
|
||||
|
||||
if not os.path.isdir(os.path.join(rllib_dir, args.yaml_dir)):
|
||||
raise ValueError("yaml-dir ({}) not found!".format(args.yaml_dir))
|
||||
|
||||
yaml_files = rllib_dir.rglob(args.yaml_dir + "/*.yaml")
|
||||
yaml_files = sorted(
|
||||
map(lambda path: str(path.absolute()), yaml_files), reverse=True)
|
||||
|
||||
print("Will run the following regression tests:")
|
||||
for yaml_file in yaml_files:
|
||||
print("->", yaml_file)
|
||||
|
||||
# Loop through all collected files.
|
||||
for yaml_file in yaml_files:
|
||||
experiments = yaml.load(open(yaml_file).read())
|
||||
assert len(experiments) == 1,\
|
||||
"Error, can only run a single experiment per yaml file!"
|
||||
|
||||
print("== Test config ==")
|
||||
print(yaml.dump(experiments))
|
||||
|
||||
# Add torch option to exp configs.
|
||||
for exp in experiments.values():
|
||||
if args.torch:
|
||||
exp["config"]["use_pytorch"] = True
|
||||
|
||||
# Try running each test 3 times and make sure it reaches the given
|
||||
# reward.
|
||||
passed = False
|
||||
for i in range(3):
|
||||
try:
|
||||
|
|
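The hunk is cut off at the `try:` above, so the rest of the 3-attempt loop is not shown. A hedged sketch of how it presumably continues; the `last_result` access and the use of the yaml's `stop` value as the pass criterion are assumptions, not taken from the truncated code:

# Hedged sketch only -- the actual code is truncated above. Assumes
# `experiments` and `run_experiments` from the surrounding script.
target = list(experiments.values())[0]["stop"]["episode_reward_mean"]
passed = False
for i in range(3):
    try:
        trials = run_experiments(experiments)
    except Exception:
        continue
    # Pass as soon as any trial's mean episode reward reaches the stop value.
    if any(t.last_result.get("episode_reward_mean", float("-inf")) >= target
           for t in trials):
        passed = True
        break
if not passed:
    raise ValueError("Learning regression test did not reach the target reward.")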
|
@ -9,6 +9,7 @@ atari-a2c:
|
|||
- SpaceInvadersNoFrameskip-v4
|
||||
run: A2C
|
||||
config:
|
||||
use_pytorch: false # <- switch on/off torch
|
||||
rollout_fragment_length: 20
|
||||
clip_rewards: True
|
||||
num_workers: 5
|
|
@ -1,10 +1,11 @@
|
|||
cartpole-a2c-microbatch-tf:
|
||||
cartpole-a2c-microbatch:
|
||||
env: CartPole-v0
|
||||
run: A2C
|
||||
stop:
|
||||
episode_reward_mean: 100
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 100000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
num_workers: 1
|
||||
gamma: 0.95
|
rllib/tuned_examples/a3c/cartpole-a2c.yaml (new file, 11 lines)
|
@ -0,0 +1,11 @@
|
|||
cartpole-a2c:
|
||||
env: CartPole-v0
|
||||
run: A2C
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 500000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
num_workers: 0
|
||||
lr: 0.001
|
|
@ -1,10 +1,11 @@
|
|||
cartpole-a3c-tf:
|
||||
cartpole-a3c:
|
||||
env: CartPole-v0
|
||||
run: A3C
|
||||
stop:
|
||||
episode_reward_mean: 100
|
||||
timesteps_total: 100000
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 200000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
num_workers: 1
|
||||
gamma: 0.95
|
|
@ -4,9 +4,10 @@ pong-a3c:
|
|||
env: PongDeterministic-v4
|
||||
run: A3C
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
num_workers: 16
|
||||
rollout_fragment_length: 20
|
||||
use_pytorch: false
|
||||
vf_loss_coeff: 0.5
|
||||
entropy_coeff: 0.01
|
||||
gamma: 0.99
|
|
@ -1,11 +1,12 @@
|
|||
cartpole-ars-torch:
|
||||
cartpole-ars:
|
||||
env: CartPole-v0
|
||||
run: ARS
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 500000
|
||||
config:
|
||||
use_pytorch: true
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
noise_stdev: 0.02
|
||||
num_rollouts: 50
|
||||
rollouts_used: 25
|
||||
|
@ -13,5 +14,3 @@ cartpole-ars-torch:
|
|||
sgd_stepsize: 0.01
|
||||
noise_size: 25000000
|
||||
eval_prob: 0.5
|
||||
model:
|
||||
fcnet_hiddens: [64, 64]
|
|
@ -3,6 +3,8 @@ swimmer-ars:
|
|||
env: Swimmer-v2
|
||||
run: ARS
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
noise_stdev: 0.01
|
||||
num_rollouts: 1
|
||||
rollouts_used: 1
|
|
@ -1,13 +0,0 @@
|
|||
# To generate training data, first run:
|
||||
# $ ./train.py --run=PPO --env=CartPole-v0 \
|
||||
# --stop='{"timesteps_total": 50000}' \
|
||||
# --config='{"use_pytorch": true, "output": "/tmp/out", "batch_mode": "complete_episodes"}'
|
||||
cartpole-marwil-torch:
|
||||
env: CartPole-v0
|
||||
run: MARWIL
|
||||
stop:
|
||||
timesteps_total: 500000
|
||||
config:
|
||||
beta:
|
||||
grid_search: [0, 1] # compare IL (beta=0) vs MARWIL
|
||||
input: /tmp/out
|
rllib/tuned_examples/cleanup_experiment.py (new file, 169 lines)
|
@ -0,0 +1,169 @@
|
|||
"""
|
||||
This script automates cleaning up a benchmark/experiment run of some algo
|
||||
against some config (with possibly more than one tune trial,
|
||||
e.g. torch=grid_search([True, False])).
|
||||
|
||||
Run `python cleanup_experiment.py --help` for more information.
|
||||
|
||||
Use on an input directory with trial contents e.g.:
|
||||
..
|
||||
IMPALA_BreakoutNoFrameskip-v4_0_use_pytorch=False_2020-05-11_10-17-54topr3h9k
|
||||
IMPALA_BreakoutNoFrameskip-v4_0_use_pytorch=False_2020-05-11_13-59-35dqaetxnf
|
||||
IMPALA_BreakoutNoFrameskip-v4_0_use_pytorch=False_2020-05-11_17-21-28tbhedw72
|
||||
IMPALA_BreakoutNoFrameskip-v4_2_use_pytorch=True_2020-05-11_10-17-54lv20cgn_
|
||||
IMPALA_BreakoutNoFrameskip-v4_2_use_pytorch=True_2020-05-11_13-59-35kwzhax_y
|
||||
IMPALA_BreakoutNoFrameskip-v4_2_use_pytorch=True_2020-05-11_17-21-28a5j0s7za
|
||||
|
||||
Then run:
|
||||
>> python cleanup_experiment.py --experiment-dir [parent dir w/ trial sub-dirs]
|
||||
>> --output-dir [your out dir] --results-filter dumb_col_2,superfluous_col3
|
||||
>> --results-max-size [max results file size in kb before(!) zipping]
|
||||
|
||||
The script will create one output sub-dir for each trial and only copy
|
||||
the configuration and the csv results (filtered and every nth row removed
|
||||
based on the given args).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import yaml
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--experiment-dir",
|
||||
type=str,
|
||||
help="Experiment dir in which all sub-runs (seeds) are "
|
||||
"located (as sub-dirs). Each sub0-run dir must contain the files: "
|
||||
"params.json and progress.csv.")
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=str,
|
||||
help="The output dir, in which the cleaned up output will be placed.")
|
||||
parser.add_argument(
|
||||
"--results-filter",
|
||||
type=str,
|
||||
help="comma-separated list of csv fields to exclude.",
|
||||
default="experiment_id,pid,hostname,node_ip,trial_id,hist_stats/episode_"
|
||||
"reward,hist_stats/episode_lengths,experiment_tag")
|
||||
parser.add_argument(
|
||||
"--results-max-size",
|
||||
type=int,
|
||||
help="the max. size of the final results.csv file (in kb). Will erase "
|
||||
"every nth line in the original input to reach that goal. "
|
||||
"Use 0 for no limit (default=100).",
|
||||
default=100)
|
||||
|
||||
|
||||
def process_single_run(in_dir, out_dir):
|
||||
exp_dir = os.listdir(in_dir)
|
||||
|
||||
# Make sure trials dir is ok.
|
||||
assert "params.json" in exp_dir and "progress.csv" in exp_dir, \
|
||||
"params.json or progress.csv not found in {}!".format(in_dir)
|
||||
|
||||
os.makedirs(out_dir, exist_ok=True)
|
||||
|
||||
for file in exp_dir:
|
||||
absfile = os.path.join(in_dir, file)
|
||||
# Config file -> Convert to yaml and move to output dir.
|
||||
if file == "params.json":
|
||||
assert os.path.isfile(absfile), "{} not a file!".format(file)
|
||||
with open(absfile) as fp:
|
||||
contents = json.load(fp)
|
||||
with open(os.path.join(out_dir, "config.yaml"), "w") as fp:
|
||||
yaml.dump(contents, fp)
|
||||
# Progress csv file -> Filter out some columns, cut, and write to
|
||||
# output_dir.
|
||||
elif file == "progress.csv":
|
||||
assert os.path.isfile(absfile), "{} not a file!".format(file)
|
||||
col_idx_to_filter = []
|
||||
with open(absfile) as fp:
|
||||
# Get column names.
|
||||
col_names_orig = fp.readline().strip().split(",")
|
||||
# Split by comma (respecting quotes), filter out
|
||||
# unwanted columns, then write to disk.
|
||||
cols_to_filter = args.results_filter.split(",")
|
||||
for i, c in enumerate(col_names_orig):
|
||||
if c in cols_to_filter:
|
||||
col_idx_to_filter.insert(0, i)
|
||||
col_names = col_names_orig.copy()
|
||||
for idx in col_idx_to_filter:
|
||||
col_names.pop(idx)
|
||||
absfile_out = os.path.join(out_dir, "progress.csv")
|
||||
with open(absfile_out, "w") as out_fp:
|
||||
print(",".join(col_names), file=out_fp)
|
||||
while True:
|
||||
line = fp.readline().strip()
|
||||
if not line:
|
||||
break
|
||||
line = re.sub(
|
||||
"(,{2,})",
|
||||
lambda m: ",None" * (len(m.group()) - 1) + ",",
|
||||
line)
|
||||
cols = re.findall('".+?"|[^,]+', line)
|
||||
if len(cols) != len(col_names_orig):
|
||||
continue
|
||||
for idx in col_idx_to_filter:
|
||||
cols.pop(idx)
|
||||
print(",".join(cols), file=out_fp)
|
||||
|
||||
# Reduce the size of the output file if necessary.
|
||||
out_size = os.path.getsize(absfile_out)
|
||||
max_size = args.results_max_size * 1024
|
||||
if 0 < max_size < out_size:
|
||||
# Figure out roughly every which line we have to drop.
|
||||
ratio = out_size / max_size
|
||||
# If ratio > 2.0, we'll have to keep only every nth line.
|
||||
if ratio > 2.0:
|
||||
nth = out_size // max_size
|
||||
os.system("awk 'NR==1||NR%{}==0' {} > {}.new".format(
|
||||
nth, absfile_out, absfile_out))
|
||||
# If ratio < 2.0 (>1.0), we'll have to drop every nth line.
|
||||
else:
|
||||
nth = out_size // (out_size - max_size)
|
||||
os.system("awk 'NR==1||NR%{}!=0' {} > {}.new".format(
|
||||
nth, absfile_out, absfile_out))
|
||||
os.remove(absfile_out)
|
||||
os.rename(absfile_out + ".new", absfile_out)
|
||||
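# For intuition, the thinning arithmetic above works out as follows
# (hypothetical file sizes for illustration, not values from this PR):
out_size, max_size = 300 * 1024, 100 * 1024  # 300kb results file, 100kb limit
ratio = out_size / max_size                  # 3.0 > 2.0 -> keep only every nth line
nth = out_size // max_size                   # 3: awk keeps the header + every 3rd line

out_size, max_size = 150 * 1024, 100 * 1024  # 150kb results file, 100kb limit
ratio = out_size / max_size                  # 1.5 (between 1.0 and 2.0) -> drop every nth line
nth = out_size // (out_size - max_size)      # 3: awk drops every 3rd line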
|
||||
# Zip progress.csv into results.zip.
|
||||
zip_file = os.path.join(out_dir, "results.zip")
|
||||
try:
|
||||
os.remove(zip_file)
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
os.system("zip -j {} {}".format(
|
||||
zip_file, os.path.join(out_dir, "progress.csv")))
|
||||
os.remove(os.path.join(out_dir, "progress.csv"))
|
||||
|
||||
# TBX events file -> Move as is.
|
||||
elif re.search("^(events\\.out\\.|params\\.pkl)", file):
|
||||
assert os.path.isfile(absfile), "{} not a file!".format(file)
|
||||
shutil.copyfile(absfile, os.path.join(out_dir, file))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
exp_dir = os.listdir(args.experiment_dir)
|
||||
# Loop through all sub-directories.
|
||||
for i, sub_run in enumerate(sorted(exp_dir)):
|
||||
abspath = os.path.join(args.experiment_dir, sub_run)
|
||||
# This is a seed run.
|
||||
if os.path.isdir(abspath) and \
|
||||
re.search("^(\\w+?)_(\\w+?-v\\d+)(_\\d+)", sub_run):
|
||||
# Create meaningful output dir name:
|
||||
# [algo]_[env]_[trial #]_[trial-config]_[date YYYY-MM-DD].
|
||||
cleaned_up_out = re.sub(
|
||||
"^(\\w+?)_(\\w+?-v\\d+)(_\\d+)(_.+)?(_\\d{4}-\\d{2}-\\d{2})"
|
||||
"_\\d{2}-\\d{2}-\\w+", "{:02}_\\1_\\2\\4\\5".format(i),
|
||||
sub_run)
|
||||
# Remove superfluous `env=` specifier (env is always included in the name).
|
||||
cleaned_up_out = re.sub("^(.+)env=\\w+?-v\\d+,?(.+)", "\\1\\2",
|
||||
cleaned_up_out)
|
||||
out_path = os.path.join(args.output_dir, cleaned_up_out)
|
||||
process_single_run(abspath, out_path)
|
||||
# Done.
|
||||
print("done")
|
rllib/tuned_examples/create_plots.py (new file, 5 lines)
|
@ -0,0 +1,5 @@
|
|||
# TODO(sven):
|
||||
# Add a simple script that takes n csv input files and generates plot(s)
|
||||
# from these with: x-axis=ts OR wall-time; y-axis=any metric(s) (up to 2).
|
||||
# ability to merge any m csv files (e.g. tf vs torch; or n seeds) together
|
||||
# in one plot.
|
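A minimal sketch of what such a script could look like, assuming pandas and matplotlib; the default column names (`timesteps_total`, `episode_reward_mean`) are assumptions based on the progress.csv files produced by cleanup_experiment.py, not decisions made in this PR:

# Hedged sketch only -- the real script is still the TODO above.
import argparse

import matplotlib.pyplot as plt
import pandas as pd

parser = argparse.ArgumentParser()
parser.add_argument("csv_files", nargs="+", help="progress.csv files to merge into one plot.")
parser.add_argument("--x", default="timesteps_total", help="x-axis column (or e.g. time_total_s).")
parser.add_argument("--y", default="episode_reward_mean", help="y-axis metric column.")
args = parser.parse_args()

# Overlay one line per input csv (e.g. tf vs torch, or several seeds).
for csv_file in args.csv_files:
    df = pd.read_csv(csv_file)
    plt.plot(df[args.x], df[args.y], label=csv_file)
plt.xlabel(args.x)
plt.ylabel(args.y)
plt.legend()
plt.savefig("plot.png")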
|
@ -6,6 +6,7 @@ halfcheetah-ddpg:
|
|||
episode_reward_mean: 2000
|
||||
time_total_s: 5400 # 90 minutes
|
||||
config:
|
||||
use_pytorch: false # <- switch on/off torch
|
||||
# === Model ===
|
||||
actor_hiddens: [64, 64]
|
||||
critic_hiddens: [64, 64]
|
|
@ -9,6 +9,8 @@ invertedpendulum-td3:
|
|||
time_total_s: 900 # 15 minutes
|
||||
timesteps_total: 1000000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
# === Model ===
|
||||
actor_hiddens: [32, 32]
|
||||
critic_hiddens: [32, 32]
|
|
@ -5,6 +5,8 @@ mountaincarcontinuous-apex-ddpg:
|
|||
stop:
|
||||
episode_reward_mean: 90
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
clip_rewards: False
|
||||
num_workers: 16
|
||||
exploration_config:
|
|
@ -6,6 +6,8 @@ mountaincarcontinuous-ddpg:
|
|||
episode_reward_mean: 90
|
||||
time_total_s: 600 # 10 minutes
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
# === Model ===
|
||||
actor_hiddens: [32, 64]
|
||||
critic_hiddens: [64, 64]
|
|
@ -15,6 +15,8 @@ mujoco-td3:
|
|||
stop:
|
||||
timesteps_total: 1000000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
# === Exploration ===
|
||||
learning_starts: 10000
|
||||
exploration_config:
|
|
@ -5,6 +5,8 @@ pendulum-apex-ddpg:
|
|||
stop:
|
||||
episode_reward_mean: -160
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
use_huber: True
|
||||
clip_rewards: False
|
||||
num_workers: 16
|
|
@ -1,11 +1,13 @@
|
|||
# This configuration can expect to reach -160 reward in 10k-20k timesteps
|
||||
# This configuration can expect to reach -160 reward in 10k-20k timesteps.
|
||||
pendulum-ddpg:
|
||||
env: Pendulum-v0
|
||||
run: DDPG
|
||||
stop:
|
||||
episode_reward_mean: -160
|
||||
timesteps_total: 100000
|
||||
episode_reward_mean: -900
|
||||
timesteps_total: 20000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
# === Model ===
|
||||
actor_hiddens: [64, 64]
|
||||
critic_hiddens: [64, 64]
|
||||
|
@ -18,7 +20,7 @@ pendulum-ddpg:
|
|||
exploration_config:
|
||||
type: "OrnsteinUhlenbeckNoise"
|
||||
scale_timesteps: 10000
|
||||
initial_scale: 1.0,
|
||||
initial_scale: 1.0
|
||||
final_scale: 0.02
|
||||
ou_base_scale: 0.1
|
||||
ou_theta: 0.15
|
|
@ -1,20 +1,20 @@
|
|||
# This configuration can expect to reach -160 reward in 10k-20k timesteps
|
||||
pendulum-ddpg:
|
||||
pendulum-td3:
|
||||
env: Pendulum-v0
|
||||
run: TD3
|
||||
stop:
|
||||
episode_reward_mean: -130
|
||||
time_total_s: 900 # 10 minutes
|
||||
episode_reward_mean: -900
|
||||
timesteps_total: 100000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
# === Model ===
|
||||
actor_hiddens: [64, 64]
|
||||
critic_hiddens: [64, 64]
|
||||
|
||||
# === Exploration ===
|
||||
learning_starts: 5000
|
||||
exploration_config:
|
||||
random_timesteps: 5000
|
||||
|
||||
# === Evaluation ===
|
||||
evaluation_interval: 1
|
||||
evaluation_num_episodes: 5
|
|
@ -8,6 +8,7 @@ apex:
|
|||
- SpaceInvadersNoFrameskip-v4
|
||||
run: APEX
|
||||
config:
|
||||
use_pytorch: false # <- switch on/off torch
|
||||
double_q: false
|
||||
dueling: false
|
||||
num_atoms: 1
|
|
@ -9,6 +9,7 @@ atari-basic-dqn:
|
|||
- SpaceInvadersNoFrameskip-v4
|
||||
run: DQN
|
||||
config:
|
||||
use_pytorch: false # <- switch on/off torch
|
||||
double_q: false
|
||||
dueling: false
|
||||
num_atoms: 1
|
|
@ -9,6 +9,7 @@ dueling-ddqn:
|
|||
- SpaceInvadersNoFrameskip-v4
|
||||
run: DQN
|
||||
config:
|
||||
use_pytorch: false # <- switch on/off torch
|
||||
double_q: true
|
||||
dueling: true
|
||||
num_atoms: 1
|
|
@ -1,10 +1,11 @@
|
|||
cartpole-dqn-tf-w-param-noise:
|
||||
cartpole-dqn-w-param-noise:
|
||||
env: CartPole-v0
|
||||
run: DQN
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 300000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
exploration_config:
|
||||
type: ParameterNoise
|
|
@ -1,10 +1,11 @@
|
|||
cartpole-dqn-tf:
|
||||
cartpole-dqn:
|
||||
env: CartPole-v0
|
||||
run: DQN
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 50000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
n_step: 3
|
||||
gamma: 0.95
|
|
@ -1,8 +1,9 @@
|
|||
cartpole-dqn-tf:
|
||||
cartpole-dqn:
|
||||
env: CartPole-v0
|
||||
run: SimpleQ
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 50000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
|
@ -6,6 +6,7 @@ pong-apex:
|
|||
env: PongNoFrameskip-v4
|
||||
run: APEX
|
||||
config:
|
||||
use_pytorch: false
|
||||
target_network_update_freq: 20000
|
||||
num_workers: 4
|
||||
num_envs_per_worker: 8
|
|
@ -6,6 +6,8 @@ pong-deterministic-dqn:
|
|||
episode_reward_mean: 20
|
||||
time_total_s: 7200
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
num_gpus: 1
|
||||
gamma: 0.99
|
||||
lr: .0001
|
|
@ -1,10 +1,11 @@
|
|||
cartpole-es-tf:
|
||||
cartpole-es:
|
||||
env: CartPole-v0
|
||||
run: ES
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 500000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
num_workers: 2
|
||||
noise_size: 25000000
|
rllib/tuned_examples/es/humanoid-es.yaml (new file, 9 lines)
|
@ -0,0 +1,9 @@
|
|||
humanoid-v2-es:
|
||||
env: Humanoid-v2
|
||||
run: ES
|
||||
stop:
|
||||
episode_reward_mean: 6000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
num_workers: 100
|
|
@ -1,7 +0,0 @@
|
|||
humanoid-es:
|
||||
env: Humanoid-v1
|
||||
run: ES
|
||||
stop:
|
||||
episode_reward_mean: 6000
|
||||
config:
|
||||
num_workers: 100
|
|
@ -5,5 +5,6 @@ cartpole-impala-tf:
|
|||
episode_reward_mean: 150
|
||||
timesteps_total: 500000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
num_gpus: 0
|
rllib/tuned_examples/impala/pendulum-impala.yaml (new file, 6 lines)
|
@ -0,0 +1,6 @@
|
|||
pendulum-impala-tf:
|
||||
env: Pendulum-v0
|
||||
run: IMPALA
|
||||
stop:
|
||||
episode_reward_mean: -700
|
||||
timesteps_total: 500000
|
|
@ -2,12 +2,13 @@
|
|||
# $ ./train.py --run=PPO --env=CartPole-v0 \
|
||||
# --stop='{"timesteps_total": 50000}' \
|
||||
# --config='{"output": "/tmp/out", "batch_mode": "complete_episodes"}'
|
||||
cartpole-marwil-tf:
|
||||
cartpole-marwil:
|
||||
env: CartPole-v0
|
||||
run: MARWIL
|
||||
stop:
|
||||
timesteps_total: 500000
|
||||
config:
|
||||
use_pytorch: false # <- switch on/off torch
|
||||
beta:
|
||||
grid_search: [0, 1] # compare IL (beta=0) vs MARWIL
|
||||
input: /tmp/out
|
|
@ -1,21 +0,0 @@
|
|||
pendulum-appo-vtrace-torch:
|
||||
env: Pendulum-v0
|
||||
run: APPO
|
||||
stop:
|
||||
episode_reward_mean: -1000 # just check it learns a bit
|
||||
timesteps_total: 500000
|
||||
config:
|
||||
use_pytorch: true
|
||||
vtrace: true
|
||||
num_gpus: 0
|
||||
num_workers: 1
|
||||
lambda: 0.1
|
||||
gamma: 0.95
|
||||
lr: 0.0003
|
||||
train_batch_size: 100
|
||||
minibatch_buffer_size: 16
|
||||
num_sgd_iter: 10
|
||||
model:
|
||||
fcnet_hiddens: [256, 256]
|
||||
batch_mode: truncate_episodes
|
||||
observation_filter: MeanStdFilter
|
|
@ -1,18 +0,0 @@
|
|||
# can expect improvement to -140 reward in ~300-500k timesteps
|
||||
pendulum-ppo:
|
||||
env: Pendulum-v0
|
||||
run: PPO
|
||||
config:
|
||||
train_batch_size: 2048
|
||||
vf_clip_param: 10.0
|
||||
num_workers: 0
|
||||
num_envs_per_worker: 10
|
||||
lambda: 0.1
|
||||
gamma: 0.95
|
||||
lr: 0.0003
|
||||
sgd_minibatch_size: 64
|
||||
num_sgd_iter: 10
|
||||
model:
|
||||
fcnet_hiddens: [64, 64]
|
||||
batch_mode: complete_episodes
|
||||
observation_filter: MeanStdFilter
|
|
@ -1,9 +1,10 @@
|
|||
cartpole-pg-torch:
|
||||
cartpole-pg:
|
||||
env: CartPole-v0
|
||||
run: PG
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 100000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
num_workers: 0
|
||||
use_pytorch: true
|
|
@ -1,21 +0,0 @@
|
|||
pong-a3c-pytorch-cnn:
|
||||
env: PongDeterministic-v4
|
||||
run: A3C
|
||||
config:
|
||||
num_workers: 16
|
||||
rollout_fragment_length: 20
|
||||
use_pytorch: true
|
||||
vf_loss_coeff: 0.5
|
||||
entropy_coeff: 0.01
|
||||
gamma: 0.99
|
||||
grad_clip: 40.0
|
||||
lambda: 1.0
|
||||
lr: 0.0001
|
||||
observation_filter: NoFilter
|
||||
model:
|
||||
use_lstm: false
|
||||
dim: 84
|
||||
grayscale: true
|
||||
zero_mean: false
|
||||
optimizer:
|
||||
grads_per_step: 1000
|
|
@ -7,6 +7,7 @@ atari-ddppo:
|
|||
- BreakoutNoFrameskip-v4
|
||||
run: DDPPO
|
||||
config:
|
||||
use_pytorch: true # DDPPO only supports PyTorch so far
|
||||
# Worker config: 10 workers, each of which requires a GPU.
|
||||
num_workers: 10
|
||||
num_gpus_per_worker: 1
|
|
@ -9,6 +9,7 @@ atari-ppo:
|
|||
- SpaceInvadersNoFrameskip-v4
|
||||
run: PPO
|
||||
config:
|
||||
use_pytorch: false # <- switch on/off torch
|
||||
lambda: 0.95
|
||||
kl_coeff: 0.5
|
||||
clip_rewards: True
|
|
@ -1,10 +1,11 @@
|
|||
cartpole-appo-vtrace-tf:
|
||||
cartpole-appo-vtrace:
|
||||
env: CartPole-v0
|
||||
run: APPO
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 200000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
rollout_fragment_length: 10
|
||||
train_batch_size: 10
|
|
@ -1,10 +1,11 @@
|
|||
cartpole-appo-tf:
|
||||
cartpole-appo:
|
||||
env: CartPole-v0
|
||||
run: APPO
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 200000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
rollout_fragment_length: 10
|
||||
train_batch_size: 10
|
|
@ -1,9 +1,8 @@
|
|||
cartpole-ddppo-torch:
|
||||
cartpole-ddppo:
|
||||
env: CartPole-v0
|
||||
run: DDPPO
|
||||
stop:
|
||||
episode_reward_mean: 100
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 100000
|
||||
config:
|
||||
use_pytorch: true
|
||||
num_gpus_per_worker: 0
|
|
@ -5,6 +5,7 @@ cartpole-ppo:
|
|||
episode_reward_mean: 200
|
||||
time_total_s: 180
|
||||
config:
|
||||
use_pytorch: false # <- switch on/off torch
|
||||
num_workers: 2
|
||||
num_sgd_iter:
|
||||
grid_search: [1, 4]
|
|
@ -6,6 +6,8 @@ cartpole-ppo:
|
|||
episode_reward_mean: 200
|
||||
time_total_s: 180
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
num_workers: 1
|
||||
num_sgd_iter:
|
||||
grid_search: [1, 4]
|
|
@ -1,11 +1,12 @@
|
|||
cartpole-ppo-torch:
|
||||
cartpole-ppo:
|
||||
env: CartPole-v0
|
||||
run: PPO
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 100000
|
||||
config:
|
||||
use_pytorch: true
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
gamma: 0.99
|
||||
lr: 0.0003
|
||||
num_workers: 1
|
|
@ -1,11 +1,12 @@
|
|||
# This can reach 9k reward in 2 hours on a Titan XP GPU
|
||||
# This can reach 9k reward in 2 hours on a Titan XP GPU
|
||||
# with 16 workers and 8 envs per worker.
|
||||
halfcheetah-appo:
|
||||
env: HalfCheetah-v2
|
||||
run: APPO
|
||||
stop:
|
||||
time_total_s: 10800
|
||||
time_total_s: 10800
|
||||
config:
|
||||
use_pytorch: false # <- switch on/off torch
|
||||
vtrace: True
|
||||
gamma: 0.99
|
||||
lambda: 0.95
|
||||
|
@ -30,6 +31,6 @@ halfcheetah-appo:
|
|||
batch_mode: truncate_episodes
|
||||
use_kl_loss: True
|
||||
kl_coeff: 1.0
|
||||
kl_target: 0.04
|
||||
kl_target: 0.04
|
||||
observation_filter: MeanStdFilter
|
||||
|
|
@ -1,23 +1,24 @@
|
|||
halfcheetah-ppo:
|
||||
env: HalfCheetah-v2
|
||||
run: PPO
|
||||
stop:
|
||||
episode_reward_mean: 9800
|
||||
time_total_s: 10800
|
||||
config:
|
||||
gamma: 0.99
|
||||
lambda: 0.95
|
||||
kl_coeff: 1.0
|
||||
num_sgd_iter: 32
|
||||
lr: .0003
|
||||
vf_loss_coeff: 0.5
|
||||
clip_param: 0.2
|
||||
sgd_minibatch_size: 4096
|
||||
train_batch_size: 65536
|
||||
num_workers: 16
|
||||
num_gpus: 1
|
||||
grad_clip: 0.5
|
||||
num_envs_per_worker:
|
||||
grid_search: [16, 32]
|
||||
batch_mode: truncate_episodes
|
||||
observation_filter: MeanStdFilter
|
||||
halfcheetah-ppo:
|
||||
env: HalfCheetah-v2
|
||||
run: PPO
|
||||
stop:
|
||||
episode_reward_mean: 9800
|
||||
time_total_s: 10800
|
||||
config:
|
||||
use_pytorch: false # <- switch on/off torch
|
||||
gamma: 0.99
|
||||
lambda: 0.95
|
||||
kl_coeff: 1.0
|
||||
num_sgd_iter: 32
|
||||
lr: .0003
|
||||
vf_loss_coeff: 0.5
|
||||
clip_param: 0.2
|
||||
sgd_minibatch_size: 4096
|
||||
train_batch_size: 65536
|
||||
num_workers: 16
|
||||
num_gpus: 1
|
||||
grad_clip: 0.5
|
||||
num_envs_per_worker:
|
||||
grid_search: [16, 32]
|
||||
batch_mode: truncate_episodes
|
||||
observation_filter: MeanStdFilter
|
|
@ -2,6 +2,8 @@ hopper-ppo:
|
|||
env: Hopper-v1
|
||||
run: PPO
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
gamma: 0.995
|
||||
kl_coeff: 1.0
|
||||
num_sgd_iter: 20
|
|
@ -4,6 +4,8 @@ humanoid-ppo-gae:
|
|||
stop:
|
||||
episode_reward_mean: 6000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
gamma: 0.995
|
||||
lambda: 0.95
|
||||
clip_param: 0.2
|
|
@ -4,6 +4,8 @@ humanoid-ppo:
|
|||
stop:
|
||||
episode_reward_mean: 6000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
gamma: 0.995
|
||||
kl_coeff: 1.0
|
||||
num_sgd_iter: 20
|
|
@ -1,10 +1,11 @@
|
|||
pendulum-appo-vtrace-tf:
|
||||
pendulum-appo-vtrace:
|
||||
env: Pendulum-v0
|
||||
run: APPO
|
||||
stop:
|
||||
episode_reward_mean: -1000 # just check it learns a bit
|
||||
timesteps_total: 500000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
vtrace: true
|
||||
num_gpus: 0
|
|
@ -1,10 +1,12 @@
|
|||
pendulum-ppo-tf:
|
||||
# Can expect improvement to -140 reward in ~300-500k timesteps.
|
||||
pendulum-ppo:
|
||||
env: Pendulum-v0
|
||||
run: PPO
|
||||
stop:
|
||||
episode_reward_mean: -500
|
||||
timesteps_total: 400000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
train_batch_size: 2048
|
||||
vf_clip_param: 10.0
|
|
@ -1,29 +1,31 @@
|
|||
# This can reach 18-19 reward in ~5-7 minutes on a Titan XP GPU
|
||||
# with 32 workers and 8 envs per worker. IMPALA, when run with
|
||||
# similar configurations, solved Pong in 10-12 minutes.
|
||||
# APPO can also solve Pong in 2.5 million timesteps, which is
|
||||
# 2x more efficient than that of IMPALA.
|
||||
pong-appo:
|
||||
env: PongNoFrameskip-v4
|
||||
run: APPO
|
||||
stop:
|
||||
episode_reward_mean: 18.0
|
||||
timesteps_total: 5000000
|
||||
config:
|
||||
vtrace: True
|
||||
use_kl_loss: False
|
||||
rollout_fragment_length: 50
|
||||
train_batch_size: 750
|
||||
num_workers: 32
|
||||
broadcast_interval: 1
|
||||
max_sample_requests_in_flight_per_worker: 1
|
||||
num_data_loader_buffers: 1
|
||||
num_envs_per_worker: 8
|
||||
minibatch_buffer_size: 4
|
||||
num_sgd_iter: 2
|
||||
vf_loss_coeff: 1.0
|
||||
clip_param: 0.3
|
||||
num_gpus: 1
|
||||
grad_clip: 10
|
||||
model:
|
||||
dim: 42
|
||||
# This can reach 18-19 reward in ~5-7 minutes on a Titan XP GPU
|
||||
# with 32 workers and 8 envs per worker. IMPALA, when run with
|
||||
# similar configurations, solved Pong in 10-12 minutes.
|
||||
# APPO can also solve Pong in 2.5 million timesteps, which is
|
||||
# 2x more efficient than that of IMPALA.
|
||||
pong-appo:
|
||||
env: PongNoFrameskip-v4
|
||||
run: APPO
|
||||
stop:
|
||||
episode_reward_mean: 18.0
|
||||
timesteps_total: 5000000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
vtrace: True
|
||||
use_kl_loss: False
|
||||
rollout_fragment_length: 50
|
||||
train_batch_size: 750
|
||||
num_workers: 32
|
||||
broadcast_interval: 1
|
||||
max_sample_requests_in_flight_per_worker: 1
|
||||
num_data_loader_buffers: 1
|
||||
num_envs_per_worker: 8
|
||||
minibatch_buffer_size: 4
|
||||
num_sgd_iter: 2
|
||||
vf_loss_coeff: 1.0
|
||||
clip_param: 0.3
|
||||
num_gpus: 1
|
||||
grad_clip: 10
|
||||
model:
|
||||
dim: 42
|
|
@ -1,11 +1,13 @@
|
|||
# On a single GPU, this achieves maximum reward in ~15-20 minutes.
|
||||
#
|
||||
# $ python train.py -f tuned_examples/pong-ppo.yaml
|
||||
# $ python train.py -f tuned_configs/pong-ppo.yaml
|
||||
#
|
||||
pong-ppo:
|
||||
env: PongNoFrameskip-v4
|
||||
run: PPO
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
lambda: 0.95
|
||||
kl_coeff: 0.5
|
||||
clip_rewards: True
|
|
@ -2,6 +2,8 @@ walker2d-v1-ppo:
|
|||
env: Walker2d-v1
|
||||
run: PPO
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
kl_coeff: 1.0
|
||||
num_sgd_iter: 20
|
||||
lr: .0001
|
|
@ -1,9 +0,0 @@
|
|||
cartpole-a2c-torch:
|
||||
env: CartPole-v0
|
||||
run: A2C
|
||||
stop:
|
||||
episode_reward_mean: 100
|
||||
timesteps_total: 100000
|
||||
config:
|
||||
num_workers: 0
|
||||
use_pytorch: true
|
|
@ -1,14 +0,0 @@
|
|||
cartpole-appo-torch:
|
||||
env: CartPole-v0
|
||||
run: APPO
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 200000
|
||||
config:
|
||||
use_pytorch: true
|
||||
rollout_fragment_length: 10
|
||||
train_batch_size: 10
|
||||
num_envs_per_worker: 5
|
||||
num_workers: 1
|
||||
num_gpus: 0
|
||||
vtrace: false
|
|
@ -1,14 +0,0 @@
|
|||
cartpole-appo-vtrace-torch:
|
||||
env: CartPole-v0
|
||||
run: APPO
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 200000
|
||||
config:
|
||||
use_pytorch: true
|
||||
rollout_fragment_length: 10
|
||||
train_batch_size: 10
|
||||
num_envs_per_worker: 5
|
||||
num_workers: 1
|
||||
num_gpus: 0
|
||||
vtrace: true
|
|
@ -1,17 +0,0 @@
|
|||
cartpole-ars-tf:
|
||||
env: CartPole-v0
|
||||
run: ARS
|
||||
stop:
|
||||
episode_reward_mean: 50
|
||||
timesteps_total: 500000
|
||||
config:
|
||||
use_pytorch: false
|
||||
noise_stdev: 0.02
|
||||
num_rollouts: 50
|
||||
rollouts_used: 25
|
||||
num_workers: 2
|
||||
sgd_stepsize: 0.01
|
||||
noise_size: 25000000
|
||||
eval_prob: 0.5
|
||||
model:
|
||||
fcnet_hiddens: [] # a linear policy
|
|
@ -1,18 +0,0 @@
|
|||
cartpole-dqn-torch-w-param-noise:
|
||||
env: CartPole-v0
|
||||
run: DQN
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 300000
|
||||
config:
|
||||
use_pytorch: true
|
||||
exploration_config:
|
||||
type: ParameterNoise
|
||||
random_timesteps: 10000
|
||||
initial_stddev: 1.0
|
||||
batch_mode: complete_episodes
|
||||
lr: 0.0008
|
||||
num_workers: 0
|
||||
model:
|
||||
fcnet_hiddens: [32, 32]
|
||||
fcnet_activation: tanh
|
|
@ -1,10 +0,0 @@
|
|||
cartpole-dqn-torch:
|
||||
env: CartPole-v0
|
||||
run: DQN
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 50000
|
||||
config:
|
||||
use_pytorch: true
|
||||
n_step: 3
|
||||
gamma: 0.95
|
|
@ -1,11 +0,0 @@
|
|||
cartpole-es-torch:
|
||||
env: CartPole-v0
|
||||
run: ES
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 500000
|
||||
config:
|
||||
use_pytorch: true
|
||||
num_workers: 2
|
||||
noise_size: 25000000
|
||||
episodes_per_batch: 50
|
|
@ -1,9 +0,0 @@
|
|||
cartpole-impala-torch:
|
||||
env: CartPole-v0
|
||||
run: IMPALA
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 500000
|
||||
config:
|
||||
use_pytorch: true
|
||||
num_gpus: 0
|
|
@ -1,8 +0,0 @@
|
|||
cartpole-pg-tf:
|
||||
env: CartPole-v0
|
||||
run: PG
|
||||
stop:
|
||||
episode_reward_mean: 100
|
||||
timesteps_total: 100000
|
||||
config:
|
||||
num_workers: 0
|
|
@ -1,17 +0,0 @@
|
|||
cartpole-ppo-tf:
|
||||
env: CartPole-v0
|
||||
run: PPO
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 100000
|
||||
config:
|
||||
gamma: 0.99
|
||||
lr: 0.0003
|
||||
num_workers: 1
|
||||
observation_filter: MeanStdFilter
|
||||
num_sgd_iter: 6
|
||||
vf_share_layers: true
|
||||
vf_loss_coeff: 0.01
|
||||
model:
|
||||
fcnet_hiddens: [32]
|
||||
fcnet_activation: linear
|
|
@ -1,17 +0,0 @@
|
|||
cartpole-sac-torch:
|
||||
env: CartPole-v0
|
||||
run: SAC
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 50000
|
||||
config:
|
||||
use_pytorch: true
|
||||
gamma: 0.95
|
||||
no_done_at_end: false
|
||||
target_network_update_freq: 32
|
||||
tau: 1.0
|
||||
train_batch_size: 32
|
||||
optimization:
|
||||
actor_learning_rate: 0.005
|
||||
critic_learning_rate: 0.005
|
||||
entropy_learning_rate: 0.0001
|
|
@ -1,8 +0,0 @@
|
|||
cartpole-dqn-torch:
|
||||
env: CartPole-v0
|
||||
run: SimpleQ
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 50000
|
||||
config:
|
||||
use_pytorch: true
|
|
@ -1,10 +0,0 @@
|
|||
pendulum-ddpg-tf:
|
||||
env: Pendulum-v0
|
||||
run: DDPG
|
||||
stop:
|
||||
episode_reward_mean: -700
|
||||
timesteps_total: 100000
|
||||
config:
|
||||
use_pytorch: false
|
||||
use_huber: true
|
||||
clip_rewards: false
|
|
@ -1,10 +0,0 @@
|
|||
pendulum-ddpg-torch:
|
||||
env: Pendulum-v0
|
||||
run: DDPG
|
||||
stop:
|
||||
episode_reward_mean: -700
|
||||
timesteps_total: 100000
|
||||
config:
|
||||
use_pytorch: true
|
||||
use_huber: true
|
||||
clip_rewards: false
|
|
@ -1,21 +0,0 @@
|
|||
pendulum-ppo-torch:
|
||||
env: Pendulum-v0
|
||||
run: PPO
|
||||
stop:
|
||||
episode_reward_mean: -500
|
||||
timesteps_total: 400000
|
||||
config:
|
||||
use_pytorch: true
|
||||
train_batch_size: 2048
|
||||
vf_clip_param: 10.0
|
||||
num_workers: 0
|
||||
num_envs_per_worker: 10
|
||||
lambda: 0.1
|
||||
gamma: 0.95
|
||||
lr: 0.0003
|
||||
sgd_minibatch_size: 64
|
||||
num_sgd_iter: 10
|
||||
model:
|
||||
fcnet_hiddens: [64, 64]
|
||||
batch_mode: complete_episodes
|
||||
observation_filter: MeanStdFilter
|
|
@ -1,13 +0,0 @@
|
|||
pendulum-sac-tf:
|
||||
env: Pendulum-v0
|
||||
run: SAC
|
||||
stop:
|
||||
episode_reward_mean: -300 # note that evaluation perf is higher
|
||||
timesteps_total: 10000
|
||||
config:
|
||||
use_pytorch: false
|
||||
soft_horizon: true
|
||||
clip_actions: false
|
||||
normalize_actions: true
|
||||
metrics_smoothing_episodes: 5
|
||||
no_done_at_end: true
|
|
@ -1,13 +0,0 @@
|
|||
pendulum-sac-torch:
|
||||
env: Pendulum-v0
|
||||
run: SAC
|
||||
stop:
|
||||
episode_reward_mean: -300 # note that evaluation perf is higher
|
||||
timesteps_total: 10000
|
||||
config:
|
||||
use_pytorch: true
|
||||
soft_horizon: true
|
||||
clip_actions: false
|
||||
normalize_actions: true
|
||||
metrics_smoothing_episodes: 5
|
||||
no_done_at_end: true
|
|
@ -1,8 +0,0 @@
|
|||
pendulum-td3-tf:
|
||||
env: Pendulum-v0
|
||||
run: TD3
|
||||
config:
|
||||
use_pytorch: false
|
||||
stop:
|
||||
episode_reward_mean: -900
|
||||
timesteps_total: 100000
|
|
@ -10,9 +10,7 @@ atari-sac-tf-and-torch:
|
|||
stop:
|
||||
timesteps_total: 20000000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch:
|
||||
grid_search: [false, true]
|
||||
use_pytorch: false # <- switch on/off torch
|
||||
gamma: 0.99
|
||||
# state-preprocessor=Our default Atari Conv2D-net.
|
||||
use_state_preprocessor: true
|
|
@ -1,10 +1,11 @@
|
|||
cartpole-sac-tf:
|
||||
cartpole-sac:
|
||||
env: CartPole-v0
|
||||
run: SAC
|
||||
stop:
|
||||
episode_reward_mean: 150
|
||||
timesteps_total: 50000
|
||||
timesteps_total: 100000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
gamma: 0.95
|
||||
no_done_at_end: false
|
|
@ -5,6 +5,7 @@ halfcheetah_sac:
|
|||
stop:
|
||||
episode_reward_mean: 9000
|
||||
config:
|
||||
use_pytorch: false # <- switch on/off torch
|
||||
horizon: 1000
|
||||
soft_horizon: false
|
||||
Q_model:
|
|
@ -8,6 +8,7 @@ mspacman-sac-tf:
|
|||
episode_reward_mean: 800
|
||||
timesteps_total: 100000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
gamma: 0.99
|
||||
# state-preprocessor=Our default Atari Conv2D-net.
|
|
@ -1,13 +1,16 @@
|
|||
# Pendulum SAC can attain -150+ reward in 6-7k
|
||||
# Configurations are similar to the original softlearning/sac codebase
|
||||
pendulum_sac:
|
||||
pendulum-sac:
|
||||
env: Pendulum-v0
|
||||
run: SAC
|
||||
stop:
|
||||
episode_reward_mean: -150
|
||||
episode_reward_mean: -300
|
||||
timesteps_total: 10000
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
use_pytorch: false
|
||||
horizon: 200
|
||||
soft_horizon: False
|
||||
soft_horizon: true
|
||||
Q_model:
|
||||
fcnet_activation: relu
|
||||
fcnet_hiddens: [256, 256]
|
||||
|
@ -16,10 +19,10 @@ pendulum_sac:
|
|||
fcnet_hiddens: [256, 256]
|
||||
tau: 0.005
|
||||
target_entropy: auto
|
||||
no_done_at_end: True
|
||||
no_done_at_end: true
|
||||
n_step: 1
|
||||
rollout_fragment_length: 1
|
||||
prioritized_replay: False
|
||||
prioritized_replay: true
|
||||
train_batch_size: 256
|
||||
target_network_update_freq: 1
|
||||
timesteps_per_iteration: 1000
|
||||
|
@ -31,6 +34,6 @@ pendulum_sac:
|
|||
num_workers: 0
|
||||
num_gpus: 0
|
||||
clip_actions: False
|
||||
normalize_actions: True
|
||||
normalize_actions: true
|
||||
evaluation_interval: 1
|
||||
metrics_smoothing_episodes: 5
|
|
@ -43,7 +43,7 @@ class PiecewiseSchedule(Schedule):
|
|||
assert idxes == sorted(idxes)
|
||||
self.interpolation = interpolation
|
||||
self.outside_value = outside_value
|
||||
self.endpoints = endpoints
|
||||
self.endpoints = [(int(e[0]), float(e[1])) for e in endpoints]
|
||||
|
||||
@override(Schedule)
|
||||
def _value(self, t):
|
||||
|
|
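The change above coerces every endpoint to an (int, float) pair before storing it. For reference, a standalone illustration of the piecewise-linear lookup that `_value` performs; this is a reimplementation for illustration, not the RLlib code itself:

# Illustration only -- shows linear interpolation between sorted
# (timestep, value) endpoints, as described by the class above.
def piecewise_value(endpoints, t, outside_value=None):
    for (l_t, l_v), (r_t, r_v) in zip(endpoints[:-1], endpoints[1:]):
        if l_t <= t < r_t:
            alpha = (t - l_t) / (r_t - l_t)
            return l_v + alpha * (r_v - l_v)
    return outside_value  # t falls outside all segments

# Example: decay 1.0 -> 0.1 over 10k timesteps; halfway gives 0.55.
print(piecewise_value([(0, 1.0), (10000, 0.1)], 5000))  # 0.55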