[RLlib] DDPG refactor and Exploration API action noise classes. (#7314)

* WIP.

* WIP.

* WIP.

* WIP.

* WIP.

* Fix

* WIP.

* Add TD3 quick Pendulum regression.

* Cleanup.

* Fix.

* LINT.

* Fix.

* Sort quick_learning test cases, add TD3.

* Sort quick_learning test cases, add TD3.

* Revert test_checkpoint_restore.py (debugging) changes.

* Fix old soft_q settings in documentation and test configs.

* More doc fixes.

* Fix test case.

* Fix test case.

* Lower test load.

* WIP.
Sven Mika 2020-03-01 20:53:35 +01:00 committed by GitHub
parent 3c6b94f3f5
commit 83e06cd30a
41 changed files with 1294 additions and 777 deletions

View file

@ -45,8 +45,7 @@ Then, we can tell DQN to train using these previously generated experiences with
--config='{
"input": "/tmp/cartpole-out",
"input_evaluation": [],
"exploration_final_eps": 0,
"exploration_fraction": 0}'
"explore": false}'
**Off-policy estimation:** Since the input experiences are not from running simulations, RLlib cannot report the true policy performance during training. However, you can use ``tensorboard --logdir=~/ray_results`` to monitor training progress via other metrics such as estimated Q-value. Alternatively, `off-policy estimation <https://arxiv.org/pdf/1511.03722.pdf>`__ can be used, which requires both the source and target action probabilities to be available (i.e., the ``action_prob`` batch key). For DQN, this means enabling soft Q learning so that actions are sampled from a probability distribution:
@ -58,8 +57,10 @@ Then, we can tell DQN to train using these previously generated experiences with
--config='{
"input": "/tmp/cartpole-out",
"input_evaluation": ["is", "wis"],
"soft_q": true,
"softmax_temp": 1.0}'
"exploration_config": {
"type": "SoftQ",
"temperature": 1.0,
}'
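
The same settings can also be passed through RLlib's Python API instead of the command line; below is a minimal sketch, assuming the offline data was written to ``/tmp/cartpole-out`` as above (the single training call is only illustrative):

    import ray
    from ray.rllib.agents.dqn import DQNTrainer

    ray.init()
    trainer = DQNTrainer(
        env="CartPole-v0",
        config={
            "input": "/tmp/cartpole-out",
            "input_evaluation": ["is", "wis"],
            # "exploration_config" replaces the removed "soft_q"/"softmax_temp" keys.
            "exploration_config": {
                "type": "SoftQ",
                "temperature": 1.0,
            },
        })
    print(trainer.train())
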
This example plot shows the Q-value metric in addition to importance sampling (IS) and weighted importance sampling (WIS) gain estimates (>1.0 means there is an estimated improvement over the original policy):
@ -121,8 +122,7 @@ RLlib supports multiplexing inputs from multiple input sources, including simula
"hdfs:/archive/cartpole": 0.3,
"sampler": 0.3,
},
"exploration_final_eps": 0,
"exploration_fraction": 0}'
"explore": false}'
Scaling I/O throughput
~~~~~~~~~~~~~~~~~~~~~~

View file

@ -58,6 +58,22 @@ py_test(
# Tag: agents_dir
# --------------------------------------------------------------------
# A2CTrainer
py_test(
name = "test_a2c",
tags = ["agents_dir"],
size = "small",
srcs = ["agents/a3c/tests/test_a2c.py"]
)
# DDPGTrainer
py_test(
name = "test_ddpg",
tags = ["agents_dir"],
size = "medium",
srcs = ["agents/ddpg/tests/test_ddpg.py"]
)
# DQNTrainer
py_test(
name = "test_dqn",
@ -66,12 +82,12 @@ py_test(
srcs = ["agents/dqn/tests/test_dqn.py"]
)
# A2CTrainer
# IMPALA
py_test(
name = "test_a2c",
name = "test_vtrace",
tags = ["agents_dir"],
size = "small",
srcs = ["agents/a3c/tests/test_a2c.py"]
srcs = ["agents/impala/tests/test_vtrace.py"]
)
# PGTrainer
@ -91,12 +107,12 @@ py_test(
"agents/ppo/tests/test.py"] # TODO(sven): Move down once PR 6889 merged
)
# IMPALA
# TD3Trainer
py_test(
name = "test_vtrace",
name = "test_td3",
tags = ["agents_dir"],
size = "small",
srcs = ["agents/impala/tests/test_vtrace.py"]
size = "medium",
srcs = ["agents/ddpg/tests/test_td3.py"]
)
# --------------------------------------------------------------------
@ -255,6 +271,390 @@ py_test(
]
)
# DDPG/APEX-DDPG/TD3
py_test(
name = "test_ddpg_pendulum_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 1}'"
]
)
py_test(
name = "test_ddpg_mountaincar_continuous_v0_num_workers_0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "MountainCarContinuous-v0",
"--run", "DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 0}'"
]
)
py_test(
name = "test_ddpg_mountaincar_continuous_v0_num_workers_1",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "MountainCarContinuous-v0",
"--run", "DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 1}'"
]
)
py_test(
name = "test_apex_ddpg_pendulum_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "APEX_DDPG",
"--ray-num-cpus", "8",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "4"
]
)
py_test(
name = "test_apex_ddpg_pendulum_v0_complete_episode_batches",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "APEX_DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1, \"batch_mode\": \"complete_episodes\", \"parameter_noise\": false}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_td3_pendulum_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "TD3",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 1}'"
]
)
# DQN/APEX
py_test(
name = "test_dqn_frozenlake_v0",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "FrozenLake-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'"
]
)
py_test(
name = "test_dqn_cartpole_v0_no_dueling",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"lr\": 1e-3, \"exploration_config\": {\"epsilon_timesteps\": 10000, \"final_epsilon\": 0.02}, \"dueling\": false, \"hiddens\": [], \"model\": {\"fcnet_hiddens\": [64], \"fcnet_activation\": \"relu\"}}'"
]
)
py_test(
name = "test_dqn_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2}'",
"--ray-num-cpus", "4"
]
)
py_test(
name = "test_dqn_cartpole_v0_with_offline_input_and_softq",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train", "external_files"],
size = "small",
# Include the json data file.
data = glob(["tests/data/cartpole_small/**"]),
args = [
"--env", "CartPole-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"input\": \"tests/data/cartpole_small\", \"learning_starts\": 0, \"input_evaluation\": [\"wis\", \"is\"], \"exploration_config\": {\"type\": \"SoftQ\"}}'"
]
)
py_test(
name = "test_dqn_pong_deterministic_v4",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "PongDeterministic-v4",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"lr\": 1e-4, \"exploration_config\": {\"epsilon_timesteps\": 200000, \"final_epsilon\": 0.01}, \"buffer_size\": 10000, \"sample_batch_size\": 4, \"learning_starts\": 10000, \"target_network_update_freq\": 1000, \"gamma\": 0.99, \"prioritized_replay\": true}'"
]
)
py_test(
name = "test_apex_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "APEX",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2, \"timesteps_per_iteration\": 1000, \"num_gpus\": 0, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "4"
]
)
# ES
py_test(
name = "test_es_pendulum_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "ES",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"stepsize\": 0.01, \"episodes_per_batch\": 20, \"train_batch_size\": 100, \"num_workers\": 2}'",
"--ray-num-cpus", "4"
]
)
py_test(
name = "test_es_pong_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pong-v0",
"--run", "ES",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"stepsize\": 0.01, \"episodes_per_batch\": 20, \"train_batch_size\": 100, \"num_workers\": 2}'",
"--ray-num-cpus", "4"
]
)
# IMPALA
py_test(
name = "test_impala_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_cartpole_v0_num_aggregation_workers_2",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"num_aggregation_workers\": 2, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "5",
]
)
py_test(
name = "test_impala_cartpole_v0_lstm",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"model\": {\"use_lstm\": true}}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_buffers_2",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_data_loader_buffers\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_cartpole_v0_buffers_2_lstm",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_data_loader_buffers\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0, \"model\": {\"use_lstm\": true}}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_pong_deterministic_v4_40k_ts_1G_obj_store",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "PongDeterministic-v4",
"--run", "IMPALA",
"--stop", "'{\"timesteps_total\": 40000}'",
"--ray-object-store-memory=1000000000",
"--config", "'{\"num_workers\": 1, \"num_gpus\": 0, \"num_envs_per_worker\": 32, \"sample_batch_size\": 50, \"train_batch_size\": 50, \"learner_queue_size\": 1}'"
]
)
# From test_rollout.sh (deprecated test file).
py_test(
name = "test_impala_rollout",
main = "tests/test_rollout.py",
data = ["train.py", "rollout.py"],
tags = ["quick_train"],
srcs = ["tests/test_rollout.py"]
)
# MARWIL
py_test(
name = "test_marwil_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train", "external_files"],
size = "small",
# Include the json data file.
data = glob(["tests/data/cartpole_small/**"]),
args = [
"--env", "CartPole-v0",
"--run", "MARWIL",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"input\": \"tests/data/cartpole_small\", \"learning_starts\": 0, \"input_evaluation\": [\"wis\", \"is\"], \"shuffle_buffer_size\": 10}'"
]
)
# PG
py_test(
name = "test_pg_tf_frozenlake_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "FrozenLake-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
py_test(
name = "test_pg_torch_frozenlake_v0",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--torch",
"--env", "FrozenLake-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
py_test(
name = "test_pg_tf_cartpole_v0",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
py_test(
name = "test_pg_torch_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--torch",
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500}'"
]
)
py_test(
name = "test_pg_tf_cartpole_v0_lstm",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1, \"model\": {\"use_lstm\": true, \"max_seq_len\": 100}}'"
]
)
py_test(
name = "test_pg_tf_cartpole_v0_multi_envs_per_worker",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1, \"num_envs_per_worker\": 10}'"
]
)
py_test(
name = "test_pg_tf_pong_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pong-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
# PPO/APPO
py_test(
@ -424,378 +824,6 @@ py_test(
]
)
# ES
py_test(
name = "test_es_pendulum_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "ES",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"stepsize\": 0.01, \"episodes_per_batch\": 20, \"train_batch_size\": 100, \"num_workers\": 2}'",
"--ray-num-cpus", "4"
]
)
py_test(
name = "test_es_pong_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pong-v0",
"--run", "ES",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"stepsize\": 0.01, \"episodes_per_batch\": 20, \"train_batch_size\": 100, \"num_workers\": 2}'",
"--ray-num-cpus", "4"
]
)
# DQN/APEX
py_test(
name = "test_dqn_frozenlake_v0",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "FrozenLake-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'"
]
)
py_test(
name = "test_dqn_cartpole_v0_no_dueling",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"lr\": 1e-3, \"schedule_max_timesteps\": 100000, \"exploration_fraction\": 0.1, \"exploration_final_eps\": 0.02, \"dueling\": false, \"hiddens\": [], \"model\": {\"fcnet_hiddens\": [64], \"fcnet_activation\": \"relu\"}}'"
]
)
py_test(
name = "test_dqn_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2}'",
"--ray-num-cpus", "4"
]
)
py_test(
name = "test_dqn_cartpole_v0_with_offline_input_and_softq",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train", "external_files"],
size = "small",
# Include the json data file.
data = glob(["tests/data/cartpole_small/**"]),
args = [
"--env", "CartPole-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"input\": \"tests/data/cartpole_small\", \"learning_starts\": 0, \"input_evaluation\": [\"wis\", \"is\"], \"soft_q\": true}'"
]
)
py_test(
name = "test_dqn_pong_deterministic_v4",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "PongDeterministic-v4",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"lr\": 1e-4, \"schedule_max_timesteps\": 2000000, \"buffer_size\": 10000, \"exploration_fraction\": 0.1, \"exploration_final_eps\": 0.01, \"sample_batch_size\": 4, \"learning_starts\": 10000, \"target_network_update_freq\": 1000, \"gamma\": 0.99, \"prioritized_replay\": true}'"
]
)
py_test(
name = "test_apex_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "APEX",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2, \"timesteps_per_iteration\": 1000, \"num_gpus\": 0, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "4"
]
)
# PG
py_test(
name = "test_pg_tf_frozenlake_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "FrozenLake-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
py_test(
name = "test_pg_torch_frozenlake_v0",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--torch",
"--env", "FrozenLake-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
py_test(
name = "test_pg_tf_cartpole_v0",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
py_test(
name = "test_pg_torch_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--torch",
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500}'"
]
)
py_test(
name = "test_pg_tf_cartpole_v0_lstm",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1, \"model\": {\"use_lstm\": true, \"max_seq_len\": 100}}'"
]
)
py_test(
name = "test_pg_tf_cartpole_v0_multi_envs_per_worker",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1, \"num_envs_per_worker\": 10}'"
]
)
py_test(
name = "test_pg_tf_pong_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pong-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
# DDPG/APEX-DDPG
py_test(
name = "test_ddpg_pendulum_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 1}'"
]
)
py_test(
name = "test_ddpg_mountaincar_continuous_v0_num_workers_0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "MountainCarContinuous-v0",
"--run", "DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 0}'"
]
)
py_test(
name = "test_ddpg_mountaincar_continuous_v0_num_workers_1",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "MountainCarContinuous-v0",
"--run", "DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 1}'"
]
)
py_test(
name = "test_apex_ddpg_pendulum_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "APEX_DDPG",
"--ray-num-cpus", "8",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "4"
]
)
py_test(
name = "test_apex_ddpg_pendulum_v0_complete_episode_batches",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "APEX_DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1, \"batch_mode\": \"complete_episodes\", \"parameter_noise\": false}'",
"--ray-num-cpus", "4",
]
)
# IMPALA
py_test(
name = "test_impala_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_cartpole_v0_num_aggregation_workers_2",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"num_aggregation_workers\": 2, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "5",
]
)
py_test(
name = "test_impala_cartpole_v0_lstm",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"model\": {\"use_lstm\": true}}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_buffers_2",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_data_loader_buffers\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_cartpole_v0_buffers_2_lstm",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_data_loader_buffers\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0, \"model\": {\"use_lstm\": true}}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_pong_deterministic_v4_40k_ts_1G_obj_store",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "PongDeterministic-v4",
"--run", "IMPALA",
"--stop", "'{\"timesteps_total\": 40000}'",
"--ray-object-store-memory=1000000000",
"--config", "'{\"num_workers\": 1, \"num_gpus\": 0, \"num_envs_per_worker\": 32, \"sample_batch_size\": 50, \"train_batch_size\": 50, \"learner_queue_size\": 1}'"
]
)
# From test_rollout.sh (deprecated test file).
py_test(
name = "test_impala_rollout",
main = "tests/test_rollout.py",
data = ["train.py", "rollout.py"],
tags = ["quick_train"],
srcs = ["tests/test_rollout.py"]
)
# MARWIL
py_test(
name = "test_marwil_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train", "external_files"],
size = "small",
# Include the json data file.
data = glob(["tests/data/cartpole_small/**"]),
args = [
"--env", "CartPole-v0",
"--run", "MARWIL",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"input\": \"tests/data/cartpole_small\", \"learning_starts\": 0, \"input_evaluation\": [\"wis\", \"is\"], \"shuffle_buffer_size\": 10}'"
]
)
# --------------------------------------------------------------------
# Models and Distributions
# rllib/models/

View file

@ -1,12 +1,5 @@
from ray.rllib.agents.ddpg.apex import ApexDDPGTrainer
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, DEFAULT_CONFIG
from ray.rllib.agents.ddpg.td3 import TD3Trainer
from ray.rllib.utils import renamed_agent
ApexDDPGAgent = renamed_agent(ApexDDPGTrainer)
DDPGAgent = renamed_agent(DDPGTrainer)
__all__ = [
"DDPGAgent", "ApexDDPGAgent", "DDPGTrainer", "ApexDDPGTrainer",
"TD3Trainer", "DEFAULT_CONFIG"
]
__all__ = ["ApexDDPGTrainer", "DDPGTrainer", "DEFAULT_CONFIG", "TD3Trainer"]

View file

@ -1,17 +1,18 @@
from ray.rllib.agents.dqn.apex import APEX_TRAINER_PROPERTIES
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
DEFAULT_CONFIG as DDPG_CONFIG
from ray.rllib.utils import merge_dicts
APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
APEX_DDPG_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
DDPG_CONFIG, # see also the options in ddpg.py, which are also supported
{
"optimizer": merge_dicts(
DDPG_CONFIG["optimizer"], {
"max_weight_sync_delay": 400,
"num_replay_buffer_shards": 4,
"debug": False
}),
"optimizer": {
"max_weight_sync_delay": 400,
"num_replay_buffer_shards": 4,
"debug": False
},
"exploration_config": {
"type": "PerWorkerOrnsteinUhlenbeckNoise"
},
"n_step": 3,
"num_gpus": 0,
"num_workers": 32,
@ -21,7 +22,6 @@ APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
"sample_batch_size": 50,
"target_network_update_freq": 500000,
"timesteps_per_iteration": 25000,
"per_worker_exploration": True,
"worker_side_prioritization": True,
"min_iter_time_s": 30,
},
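
With the per_worker_exploration flag gone, per-worker noise scaling is now expressed through the default exploration_config above (PerWorkerOrnsteinUhlenbeckNoise). A minimal sketch mirroring the quick-train BUILD target further up; Pendulum-v0 and the tiny worker/CPU counts are illustrative only:

    import ray
    from ray.rllib.agents.ddpg import ApexDDPGTrainer

    ray.init(num_cpus=4)
    trainer = ApexDDPGTrainer(
        env="Pendulum-v0",
        config={
            "num_workers": 2,
            "optimizer": {"num_replay_buffer_shards": 1},
            "learning_starts": 100,
            "min_iter_time_s": 1,
            # No exploration keys needed here: PerWorkerOrnsteinUhlenbeckNoise
            # is already the Ape-X DDPG default after this change.
        })
    print(trainer.train())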

View file

@ -1,7 +1,10 @@
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
from ray.rllib.agents.ddpg.ddpg_policy import DDPGTFPolicy
from ray.rllib.utils.schedules import ConstantSchedule, PiecewiseSchedule
from ray.rllib.utils.deprecation import deprecation_warning, \
DEPRECATED_VALUE
from ray.rllib.utils.exploration.per_worker_ornstein_uhlenbeck_noise import \
PerWorkerOrnsteinUhlenbeckNoise
# yapf: disable
# __sphinx_doc_begin__
@ -55,49 +58,35 @@ DEFAULT_CONFIG = with_common_config({
"n_step": 1,
# === Exploration ===
# Turns on annealing schedule for exploration noise. Exploration is
# annealed from 1.0 to exploration_final_eps over schedule_max_timesteps
# scaled by exploration_fraction. Original DDPG and TD3 papers do not
# anneal noise, so this is False by default.
"exploration_should_anneal": False,
# Max num timesteps for annealing schedules.
"schedule_max_timesteps": 100000,
"exploration_config": {
# DDPG uses OrnsteinUhlenbeck (stateful) noise to be added to NN-output
# actions (after a possible pure random phase of n timesteps).
"type": "OrnsteinUhlenbeckNoise",
# For how many timesteps should we return completely random actions,
# before we start adding (scaled) noise?
"random_timesteps": 1000,
# The OU-base scaling factor to always apply to action-added noise.
"ou_base_scale": 0.1,
# The OU theta param.
"ou_theta": 0.15,
# The OU sigma param.
"ou_sigma": 0.2,
# The initial noise scaling factor.
"initial_scale": 1.0,
# The final noise scaling factor.
"final_scale": 1.0,
# Timesteps over which to anneal scale (from initial to final values).
"scale_timesteps": 10000,
},
# Number of env steps to optimize for before returning
"timesteps_per_iteration": 1000,
# Fraction of entire training period over which the exploration rate is
# annealed
"exploration_fraction": 0.1,
# Final scaling multiplier for action noise (initial is 1.0)
"exploration_final_scale": 0.02,
# valid values: "ou" (time-correlated, like original DDPG paper),
# "gaussian" (IID, like TD3 paper)
"exploration_noise_type": "ou",
# OU-noise scale; this can be used to scale down magnitude of OU noise
# before adding to actions (requires "exploration_noise_type" to be "ou")
"exploration_ou_noise_scale": 0.1,
# theta for OU
"exploration_ou_theta": 0.15,
# sigma for OU
"exploration_ou_sigma": 0.2,
# gaussian stddev of act noise for exploration (requires
# "exploration_noise_type" to be "gaussian")
"exploration_gaussian_sigma": 0.1,
# If True parameter space noise will be used for exploration
# See https://blog.openai.com/better-exploration-with-parameter-noise/
"parameter_noise": False,
# Until this many timesteps have elapsed, the agent's policy will be
# ignored & it will instead take uniform random actions. Can be used in
# conjunction with learning_starts (which controls when the first
# optimization step happens) to decrease dependence of exploration &
# optimization on initial policy parameters. Note that this will be
# disabled when the action noise scale is set to 0 (e.g during evaluation).
"pure_exploration_steps": 1000,
# Extra configuration that disables exploration.
"evaluation_config": {
"exploration_fraction": 0,
"exploration_final_eps": 0,
"explore": False
},
# === Replay buffer ===
# Size of the replay buffer. Note that if async_updates is set, then
# each worker will have a replay buffer of this size.
@ -150,8 +139,6 @@ DEFAULT_CONFIG = with_common_config({
# to increase if your environment is particularly slow to sample, or if
# you're using the Async or Ape-X optimizers.
"num_workers": 0,
# Whether to use a distribution of epsilons across workers for exploration.
"per_worker_exploration": False,
# Whether to compute priorities on workers.
"worker_side_prioritization": False,
# Prevent iterations from going lower than this time span
@ -161,76 +148,47 @@ DEFAULT_CONFIG = with_common_config({
# yapf: enable
def make_exploration_schedule(config, worker_index):
# Modification of DQN's schedule to take into account
# `exploration_ou_noise_scale`
if config["per_worker_exploration"]:
assert config["num_workers"] > 1, "This requires multiple workers"
if worker_index >= 0:
# FIXME: what do magic constants mean? (0.4, 7)
max_index = float(config["num_workers"] - 1)
exponent = 1 + worker_index / max_index * 7
return ConstantSchedule(0.4**exponent)
else:
# local ev should have zero exploration so that eval rollouts
# run properly
return ConstantSchedule(0.0)
elif config["exploration_should_anneal"]:
return PiecewiseSchedule(
endpoints=[(0, 1.0), (int(config["exploration_fraction"] *
config["schedule_max_timesteps"]),
config["exploration_final_scale"])],
outside_value=config["exploration_final_scale"])
else:
# *always* add exploration noise
return ConstantSchedule(1.0)
def setup_ddpg_exploration(trainer):
trainer.exploration0 = make_exploration_schedule(trainer.config, -1)
trainer.explorations = [
make_exploration_schedule(trainer.config, i)
for i in range(trainer.config["num_workers"])
]
def update_worker_explorations(trainer):
global_timestep = trainer.optimizer.num_steps_sampled
exp_vals = [trainer.exploration0.value(global_timestep)]
trainer.workers.local_worker().foreach_trainable_policy(
lambda p, _: p.set_epsilon(exp_vals[0]))
for i, e in enumerate(trainer.workers.remote_workers()):
exp_val = trainer.explorations[i].value(global_timestep)
e.foreach_trainable_policy.remote(lambda p, _: p.set_epsilon(exp_val))
exp_vals.append(exp_val)
trainer.train_start_timestep = global_timestep
trainer.exploration_infos = exp_vals
def add_pure_exploration_phase(trainer):
global_timestep = trainer.optimizer.num_steps_sampled
pure_expl_steps = trainer.config["pure_exploration_steps"]
if pure_expl_steps:
# tell workers whether they should do pure exploration
only_explore = global_timestep < pure_expl_steps
trainer.workers.local_worker().foreach_trainable_policy(
lambda p, _: p.set_pure_exploration_phase(only_explore))
for e in trainer.workers.remote_workers():
e.foreach_trainable_policy.remote(
lambda p, _: p.set_pure_exploration_phase(only_explore))
update_worker_explorations(trainer)
def validate_config(config):
# PyTorch check.
if config["use_pytorch"]:
raise ValueError("DDPG does not support PyTorch yet! Use tf instead.")
# TODO(sven): Remove at some point.
# Backward compatibility of noise-based exploration config.
schedule_max_timesteps = None
if config.get("schedule_max_timesteps", DEPRECATED_VALUE) != \
DEPRECATED_VALUE:
deprecation_warning("schedule_max_timesteps",
"exploration_config.scale_timesteps")
schedule_max_timesteps = config["schedule_max_timesteps"]
if config.get("exploration_final_scale", DEPRECATED_VALUE) != \
DEPRECATED_VALUE:
deprecation_warning("exploration_final_scale",
"exploration_config.final_scale")
if isinstance(config["exploration_config"], dict):
config["exploration_config"]["final_scale"] = \
config.pop("exploration_final_scale")
if config.get("exploration_fraction", DEPRECATED_VALUE) != \
DEPRECATED_VALUE:
assert schedule_max_timesteps is not None
deprecation_warning("exploration_fraction",
"exploration_config.scale_timesteps")
if isinstance(config["exploration_config"], dict):
config["exploration_config"]["scale_timesteps"] = config.pop(
"exploration_fraction") * schedule_max_timesteps
if config.get("per_worker_exploration", DEPRECATED_VALUE) != \
DEPRECATED_VALUE:
deprecation_warning(
"per_worker_exploration",
"exploration_config.type=PerWorkerOrnsteinUhlenbeckNoise")
if isinstance(config["exploration_config"], dict):
config["exploration_config"]["type"] = \
PerWorkerOrnsteinUhlenbeckNoise
DDPGTrainer = GenericOffPolicyTrainer.with_updates(
name="DDPG",
default_config=DEFAULT_CONFIG,
default_policy=DDPGTFPolicy,
validate_config=validate_config,
before_init=setup_ddpg_exploration,
before_train_step=add_pure_exploration_phase)
)
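
The deprecation shims above keep the old flat keys working, but new configs should set the nested exploration_config directly. A minimal sketch for a local Pendulum-v0 run; the values simply mirror the deprecated flags they replace and are not tuned recommendations:

    import ray
    import ray.rllib.agents.ddpg as ddpg

    ray.init()
    config = ddpg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # sample in the trainer process
    config["exploration_config"] = {
        "type": "OrnsteinUhlenbeckNoise",
        "random_timesteps": 1000,  # was: pure_exploration_steps
        "ou_base_scale": 0.1,      # was: exploration_ou_noise_scale
        "ou_theta": 0.15,          # was: exploration_ou_theta
        "ou_sigma": 0.2,           # was: exploration_ou_sigma
        "initial_scale": 1.0,
        "final_scale": 0.02,       # was: exploration_final_scale
        "scale_timesteps": 10000,  # was: exploration_fraction * schedule_max_timesteps
    }
    trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
    print(trainer.train())
    # Inspect the exploration component's current state (e.g. the OU noise scale).
    print(trainer.get_policy().get_exploration_info())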

View file

@ -7,6 +7,7 @@ from ray.rllib.agents.dqn.dqn_policy import postprocess_nstep_and_prio
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_action_dist import Deterministic
from ray.rllib.utils.annotations import override
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.policy.policy import Policy
@ -42,20 +43,24 @@ class DDPGPostprocessing:
list(x) for x in sample_batch.columns(
[SampleBatch.CUR_OBS, SampleBatch.ACTIONS])
]
self.sess.run(self.remove_noise_op)
clean_actions = self.sess.run(
self.output_actions,
self.sess.run(self.remove_parameter_noise_op)
# TODO(sven): This won't work if exploration != Noise, which is
# probably fine as parameter_noise will soon be its own
# Exploration class.
clean_actions, cur_noise_scale = self.sess.run(
[self.output_actions,
self.exploration.get_info()],
feed_dict={
self.cur_observations: states,
self.stochastic: False,
self.noise_scale: .0,
self.pure_exploration_phase: False,
self._is_exploring: False,
})
distance_in_action_space = np.sqrt(
np.mean(np.square(clean_actions - noisy_actions)))
self.pi_distance = distance_in_action_space
if distance_in_action_space < \
self.config["exploration_ou_sigma"] * self.cur_noise_scale:
self.config["exploration_config"].get("ou_sigma", 0.2) * \
cur_noise_scale:
# multiplying the sampled OU noise by noise scale is
# equivalent to multiplying the sigma of OU by noise scale
self.parameter_noise_sigma_val *= 1.01
@ -82,14 +87,11 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
"using a Tuple action space, or the multi-agent API.")
self.config = config
self.cur_noise_scale = 1.0
self.cur_pure_exploration_phase = False
self.dim_actions = action_space.shape[0]
self.low_action = action_space.low
self.high_action = action_space.high
# create global step for counting the number of update operations
# Create global step for counting the number of update operations.
self.global_step = tf.train.get_or_create_global_step()
# Create sampling timestep placeholder.
timestep = tf.placeholder(tf.int32, (), name="timestep")
# use separate optimizers for actor & critic
self._actor_optimizer = tf.train.AdamOptimizer(
@ -97,11 +99,7 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
self._critic_optimizer = tf.train.AdamOptimizer(
learning_rate=self.config["critic_lr"])
# Action inputs
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
self.noise_scale = tf.placeholder(tf.float32, (), name="noise_scale")
self.pure_exploration_phase = tf.placeholder(
tf.bool, (), name="pure_exploration_phase")
# Observation inputs.
self.cur_observations = tf.placeholder(
tf.float32,
shape=(None, ) + observation_space.shape,
@ -118,19 +116,14 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
var for var in self.policy_vars if "LayerNorm" not in var.name
])
# Create exploration component.
self.exploration = self._create_exploration(action_space, config)
explore = tf.placeholder_with_default(True, (), name="is_exploring")
# Action outputs
with tf.variable_scope(ACTION_SCOPE):
self.output_actions = self._add_exploration_noise(
policy_out, self.stochastic, self.noise_scale,
self.pure_exploration_phase, action_space)
if self.config["smooth_target_policy"]:
self.reset_noise_op = tf.no_op()
else:
with tf.variable_scope(ACTION_SCOPE, reuse=True):
exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
self.reset_noise_op = tf.assign(exploration_sample,
self.dim_actions * [.0])
self.output_actions, _ = self.exploration.get_exploration_action(
policy_out, Deterministic, self.policy_model, explore,
timestep)
# Replay inputs
self.obs_t = tf.placeholder(
@ -294,7 +287,9 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
sampled_action=self.output_actions,
loss=self.actor_loss + self.critic_loss,
loss_inputs=self.loss_inputs,
update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops)
update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops,
explore=explore,
timestep=timestep)
self.sess.run(tf.global_variables_initializer())
# Note that this encompasses both the policy and Q-value networks and
@ -364,16 +359,6 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
+ self._critic_grads_and_vars
return grads_and_vars
@override(TFPolicy)
def extra_compute_action_feed_dict(self):
return {
# FIXME: what about turning off exploration? Isn't that a good
# idea?
self.stochastic: True,
self.noise_scale: self.cur_noise_scale,
self.pure_exploration_phase: self.cur_pure_exploration_phase,
}
@override(TFPolicy)
def extra_compute_grad_fetches(self):
return {
@ -389,19 +374,6 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
def set_weights(self, weights):
self.variables.set_weights(weights)
@override(Policy)
def get_state(self):
return [
TFPolicy.get_state(self), self.cur_noise_scale,
self.cur_pure_exploration_phase
]
@override(Policy)
def set_state(self, state):
TFPolicy.set_state(self, state[0])
self.set_epsilon(state[1])
self.set_pure_exploration_phase(state[2])
def _build_q_network(self, obs, obs_space, action_space, actions):
if self.config["use_state_preprocessor"]:
q_model = ModelCatalog.get_model({
@ -444,7 +416,7 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
action_out = tf.layers.dense(
action_out, units=hidden, activation=activation)
action_out = tf.layers.dense(
action_out, units=self.dim_actions, activation=None)
action_out, units=action_space.shape[0], activation=None)
# Use sigmoid to scale to [0,1], but also double magnitude of input to
# emulate behaviour of tanh activation used in DDPG and TD3 papers.
@ -458,81 +430,6 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
return actions, model
def _add_exploration_noise(self, deterministic_actions,
should_be_stochastic, noise_scale,
enable_pure_exploration, action_space):
noise_type = self.config["exploration_noise_type"]
action_low = action_space.low
action_high = action_space.high
action_range = action_space.high - action_low
def compute_stochastic_actions():
def make_noisy_actions():
# shape of deterministic_actions is [None, dim_action]
if noise_type == "gaussian":
# add IID Gaussian noise for exploration, TD3-style
normal_sample = noise_scale * tf.random_normal(
tf.shape(deterministic_actions),
stddev=self.config["exploration_gaussian_sigma"])
stochastic_actions = tf.clip_by_value(
deterministic_actions + normal_sample,
action_low * tf.ones_like(deterministic_actions),
action_high * tf.ones_like(deterministic_actions))
elif noise_type == "ou":
# add OU noise for exploration, DDPG-style
zero_acts = action_low.size * [.0]
exploration_sample = tf.get_variable(
name="ornstein_uhlenbeck",
dtype=tf.float32,
initializer=zero_acts,
trainable=False)
normal_sample = tf.random_normal(
shape=[action_low.size], mean=0.0, stddev=1.0)
ou_new = self.config["exploration_ou_theta"] \
* -exploration_sample \
+ self.config["exploration_ou_sigma"] * normal_sample
exploration_value = tf.assign_add(exploration_sample,
ou_new)
base_scale = self.config["exploration_ou_noise_scale"]
noise = noise_scale * base_scale \
* exploration_value * action_range
stochastic_actions = tf.clip_by_value(
deterministic_actions + noise,
action_low * tf.ones_like(deterministic_actions),
action_high * tf.ones_like(deterministic_actions))
else:
raise ValueError(
"Unknown noise type '%s' (try 'ou' or 'gaussian')" %
noise_type)
return stochastic_actions
def make_uniform_random_actions():
# pure random exploration option
uniform_random_actions = tf.random_uniform(
tf.shape(deterministic_actions))
# rescale uniform random actions according to action range
tf_range = tf.constant(action_range[None], dtype="float32")
tf_low = tf.constant(action_low[None], dtype="float32")
uniform_random_actions = uniform_random_actions * tf_range \
+ tf_low
return uniform_random_actions
stochastic_actions = tf.cond(
# need to condition on noise_scale > 0 because zeroing
# noise_scale is how a worker signals no noise should be used
# (this is ugly and should be fixed by adding an "eval_mode"
# config flag or something)
tf.logical_and(enable_pure_exploration, noise_scale > 0),
true_fn=make_uniform_random_actions,
false_fn=make_noisy_actions)
return stochastic_actions
enable_stochastic = tf.logical_and(should_be_stochastic,
not self.config["parameter_noise"])
actions = tf.cond(enable_stochastic, compute_stochastic_actions,
lambda: deterministic_actions)
return actions
def _build_actor_critic_loss(self,
q_t,
q_tp1,
@ -580,7 +477,8 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
return critic_loss, actor_loss, td_error
def _build_parameter_noise(self, pnet_params):
self.parameter_noise_sigma_val = self.config["exploration_ou_sigma"]
self.parameter_noise_sigma_val = \
self.config["exploration_config"].get("ou_sigma", 0.2)
self.parameter_noise_sigma = tf.get_variable(
initializer=tf.constant_initializer(
self.parameter_noise_sigma_val),
@ -600,7 +498,7 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
remove_noise_ops = list()
for var, var_noise in zip(pnet_params, self.parameter_noise):
remove_noise_ops.append(tf.assign_add(var, -var_noise))
self.remove_noise_op = tf.group(*tuple(remove_noise_ops))
self.remove_parameter_noise_op = tf.group(*tuple(remove_noise_ops))
generate_noise_ops = list()
for var_noise in self.parameter_noise:
generate_noise_ops.append(
@ -630,9 +528,6 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
})
return td_err
def reset_noise(self, sess):
sess.run(self.reset_noise_op)
def add_parameter_noise(self):
if self.config["parameter_noise"]:
self.sess.run(self.add_noise_op)
@ -642,13 +537,3 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
tau = tau or self.tau_value
return self.sess.run(
self.update_target_expr, feed_dict={self.tau: tau})
def set_epsilon(self, epsilon):
# set_epsilon is called by optimizer to anneal exploration as
# necessary, and to turn it off during evaluation. The "epsilon" part
# is a carry-over from DQN, which uses epsilon-greedy exploration
# rather than adding action noise to the output of a policy network.
self.cur_noise_scale = epsilon
def set_pure_exploration_phase(self, pure_exploration_phase):
self.cur_pure_exploration_phase = pure_exploration_phase

View file

@ -3,12 +3,10 @@
By default, this uses a near-identical configuration to that reported in the
TD3 paper.
"""
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
DEFAULT_CONFIG as DDPG_CONFIG
from ray.rllib.utils import merge_dicts
TD3_DEFAULT_CONFIG = merge_dicts(
TD3_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
DDPG_CONFIG,
{
# largest changes: twin Q functions, delayed policy updates, and target
@ -18,15 +16,27 @@ TD3_DEFAULT_CONFIG = merge_dicts(
"smooth_target_policy": True,
"target_noise": 0.2,
"target_noise_clip": 0.5,
"exploration_config": {
# TD3 uses simple Gaussian noise on top of deterministic NN-output
# actions (after a possible pure random phase of n timesteps).
"type": "GaussianNoise",
# For how many timesteps should we return completely random
# actions, before we start adding (scaled) noise?
"random_timesteps": 10000,
# Gaussian stddev of action noise for exploration.
"stddev": 0.1,
# Scaling settings by which the Gaussian noise is scaled before
# being added to the actions. NOTE: The scale timesteps start only
# after(!) any random steps have been finished.
# By default, do not anneal over time (fixed 1.0).
"initial_scale": 1.0,
"final_scale": 1.0,
"scale_timesteps": 1
},
# other changes & things we want to keep fixed: IID Gaussian
# exploration noise, larger actor learning rate, no l2 regularisation,
# no Huber loss, etc.
"exploration_should_anneal": False,
"exploration_noise_type": "gaussian",
"exploration_gaussian_sigma": 0.1,
# other changes & things we want to keep fixed:
# larger actor learning rate, no l2 regularisation, no Huber loss, etc.
"learning_starts": 10000,
"pure_exploration_steps": 10000,
"actor_hiddens": [400, 300],
"critic_hiddens": [400, 300],
"n_step": 1,
@ -40,14 +50,12 @@ TD3_DEFAULT_CONFIG = merge_dicts(
"target_network_update_freq": 0,
"num_workers": 0,
"num_gpus_per_worker": 0,
"per_worker_exploration": False,
"worker_side_prioritization": False,
"buffer_size": 1000000,
"prioritized_replay": False,
"clip_rewards": False,
"use_state_preprocessor": False,
},
)
})
TD3Trainer = DDPGTrainer.with_updates(
name="TD3", default_config=TD3_DEFAULT_CONFIG)

View file

@ -0,0 +1,87 @@
import numpy as np
import unittest
import ray.rllib.agents.ddpg as ddpg
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.test_utils import check
tf = try_import_tf()
class TestDDPG(unittest.TestCase):
def test_ddpg_compilation(self):
"""Test whether a DDPGTrainer can be built with both frameworks."""
config = ddpg.DEFAULT_CONFIG.copy()
config["num_workers"] = 0 # Run locally.
# Test against all frameworks.
for fw in ["tf", "eager", "torch"]:
if fw != "tf":
continue
config["eager"] = True if fw == "eager" else False
config["use_pytorch"] = True if fw == "torch" else False
trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
num_iterations = 2
for i in range(num_iterations):
results = trainer.train()
print(results)
def test_ddpg_exploration_and_with_random_prerun(self):
"""Tests DDPG's Exploration (w/ random actions for n timesteps)."""
config = ddpg.DEFAULT_CONFIG.copy()
config["num_workers"] = 0 # Run locally.
obs = np.array([0.0, 0.1, -0.1])
# Test against all frameworks.
for fw in ["tf", "eager", "torch"]:
if fw != "tf":
continue
config["eager"] = True if fw == "eager" else False
config["use_pytorch"] = True if fw == "torch" else False
# Default OUNoise setup.
trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
# Setting explore=False should always return the same action.
a_ = trainer.compute_action(obs, explore=False)
for _ in range(50):
a = trainer.compute_action(obs, explore=False)
check(a, a_)
# explore=None (default: explore) should return different actions.
actions = []
for _ in range(50):
actions.append(trainer.compute_action(obs))
check(np.std(actions), 0.0, false=True)
# Check randomness at beginning.
config["exploration_config"] = {
# Act randomly at beginning ...
"random_timesteps": 50,
# Then act very close to the deterministic actions thereafter.
"ou_base_scale": 0.001,
"initial_scale": 0.001,
"final_scale": 0.001,
}
trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
# ts=1 (get a deterministic action as per explore=False).
deterministic_action = trainer.compute_action(obs, explore=False)
# ts=2-50 (in random window).
random_a = []
for _ in range(49):
random_a.append(trainer.compute_action(obs, explore=True))
check(random_a[-1], deterministic_action, false=True)
self.assertTrue(np.std(random_a) > 0.5)
# ts > 50 (a=deterministic_action + scale * N[0,1])
for _ in range(50):
a = trainer.compute_action(obs, explore=True)
check(a, deterministic_action, rtol=0.1)
# ts >> 50 (BUT: explore=False -> expect deterministic action).
for _ in range(50):
a = trainer.compute_action(obs, explore=False)
check(a, deterministic_action)
if __name__ == "__main__":
import unittest
unittest.main(verbosity=1)

View file

@ -0,0 +1,87 @@
import numpy as np
import unittest
import ray.rllib.agents.ddpg.td3 as td3
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.test_utils import check
tf = try_import_tf()
class TestTD3(unittest.TestCase):
def test_td3_compilation(self):
"""Test whether a TD3Trainer can be built with both frameworks."""
config = td3.TD3_DEFAULT_CONFIG.copy()
config["num_workers"] = 0 # Run locally.
# Test against all frameworks.
for fw in ["tf", "eager", "torch"]:
if fw != "tf":
continue
config["eager"] = True if fw == "eager" else False
config["use_pytorch"] = True if fw == "torch" else False
trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
num_iterations = 2
for i in range(num_iterations):
results = trainer.train()
print(results)
def test_td3_exploration_and_with_random_prerun(self):
"""Tests TD3's Exploration (w/ random actions for n timesteps)."""
config = td3.TD3_DEFAULT_CONFIG.copy()
config["num_workers"] = 0 # Run locally.
obs = np.array([0.0, 0.1, -0.1])
# Test against all frameworks.
for fw in ["tf", "eager", "torch"]:
if fw != "tf":
continue
config["eager"] = True if fw == "eager" else False
config["use_pytorch"] = True if fw == "torch" else False
# Default GaussianNoise setup.
trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
# Setting explore=False should always return the same action.
a_ = trainer.compute_action(obs, explore=False)
for _ in range(50):
a = trainer.compute_action(obs, explore=False)
check(a, a_)
# explore=None (default: explore) should return different actions.
actions = []
for _ in range(50):
actions.append(trainer.compute_action(obs))
check(np.std(actions), 0.0, false=True)
# Check randomness at beginning.
config["exploration_config"] = {
# Act randomly at beginning ...
"random_timesteps": 30,
# Then act very close to the deterministic actions thereafter.
"stddev": 0.001,
"initial_scale": 0.001,
"final_scale": 0.001,
}
trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
# ts=1 (get a deterministic action as per explore=False).
deterministic_action = trainer.compute_action(obs, explore=False)
# ts=2-30 (in random window).
random_a = []
for _ in range(29):
random_a.append(trainer.compute_action(obs, explore=True))
check(random_a[-1], deterministic_action, false=True)
self.assertTrue(np.std(random_a) > 0.5)
# ts > 30 (a=deterministic_action + scale * N[0,1])
for _ in range(50):
a = trainer.compute_action(obs, explore=True)
check(a, deterministic_action, rtol=0.1)
# ts >> 30 (BUT: explore=False -> expect deterministic action).
for _ in range(50):
a = trainer.compute_action(obs, explore=False)
check(a, deterministic_action)
if __name__ == "__main__":
import unittest
unittest.main(verbosity=1)

View file

@ -211,7 +211,7 @@ def validate_config_and_setup_param_noise(config):
if config.get("soft_q", DEPRECATED_VALUE) != DEPRECATED_VALUE:
deprecation_warning(
"soft_q", "exploration_config={"
"type=StochasticSampling, temperature=[float]"
"type=SoftQ, temperature=[float]"
"}")
config["exploration_config"] = {
"type": "SoftQ",

View file

@ -15,14 +15,14 @@ class TestDQN(unittest.TestCase):
config["num_workers"] = 0 # Run locally.
# tf.
config["eager"] = True
config["eager"] = False
trainer = dqn.DQNTrainer(config=config, env="CartPole-v0")
num_iterations = 2
for i in range(num_iterations):
results = trainer.train()
print(results)
config["eager"] = False
config["eager"] = True
trainer = dqn.DQNTrainer(config=config, env="CartPole-v0")
num_iterations = 2
for i in range(num_iterations):

View file

@ -550,14 +550,11 @@ class Trainer(Trainable):
else:
self.env_creator = lambda env_config: None
# Merge the supplied config with the class default.
merged_config = copy.deepcopy(self._default_config)
merged_config = deep_update(merged_config, config,
self._allow_unknown_configs,
self._allow_unknown_subkeys,
self._override_all_subkeys_if_type_changes)
# Merge the supplied config with the class default, but store the
# user-provided one.
self.raw_user_config = config
self.config = merged_config
self.config = Trainer.merge_trainer_configs(self._default_config,
config)
if self.config["normalize_actions"]:
inner = self.env_creator
@ -767,8 +764,7 @@ class Trainer(Trainable):
preprocessed, update=False)
# Figure out the current (sample) time step and pass it into Policy.
timestep = self.optimizer.num_steps_sampled \
if self._has_policy_optimizer() else None
self.global_vars["timestep"] += 1
result = self.get_policy(policy_id).compute_single_action(
filtered_obs,
@ -778,7 +774,7 @@ class Trainer(Trainable):
info,
clip_actions=self.config["clip_actions"],
explore=explore,
timestep=timestep)
timestep=self.global_vars["timestep"])
if state or full_fetch:
return result
@ -878,6 +874,13 @@ class Trainer(Trainable):
"the DEFAULT_CONFIG defined by each agent for more info.\n\n"
"The config of this agent is: {}".format(config))
@classmethod
def merge_trainer_configs(cls, config1, config2):
config1 = copy.deepcopy(config1)
return deep_update(config1, config2, cls._allow_unknown_configs,
cls._allow_unknown_subkeys,
cls._override_all_subkeys_if_type_changes)
@staticmethod
def _validate_config(config):
if "policy_graphs" in config["multiagent"]:

View file

@ -266,7 +266,8 @@ class SquashedGaussian(TFActionDistribution):
class Deterministic(TFActionDistribution):
"""Action distribution that returns the input values directly.
This is similar to DiagGaussian with standard deviation zero.
This is similar to DiagGaussian with standard deviation zero (thus only
requiring the "mean" values as NN output).
"""
@override(ActionDistribution)

View file

@ -83,8 +83,8 @@ class OffPolicyEstimator:
"Off-policy estimation is not possible unless the inputs "
"include action probabilities (i.e., the policy is stochastic "
"and emits the 'action_prob' key). For DQN this means using "
"`soft_q: True`. You can also set `input_evaluation: []` to "
"disable estimation.")
"`exploration_config: {type: 'SoftQ'}`. You can also set "
"`input_evaluation: []` to disable estimation.")
@DeveloperAPI
def get_metrics(self):

View file

@ -37,7 +37,7 @@ def _convert_to_numpy(x):
if x is None:
return None
try:
return x.numpy()
return tf.nest.map_structure(lambda component: component.numpy(), x)
except AttributeError:
raise TypeError(
("Object of type {} has no method to convert to numpy.").format(
@ -402,6 +402,10 @@ def build_eager_tf_policy(name,
zip([(tf.convert_to_tensor(g) if g is not None else None)
for g in gradients], self.model.trainable_variables()))
@override(Policy)
def get_exploration_info(self):
return _convert_to_numpy(self.exploration.get_info())
@override(Policy)
def get_weights(self):
variables = self.variables()
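
The eager policy's numpy conversion now handles nested structures (e.g. a dict returned by Exploration.get_info()), not only single tensors. A tiny standalone sketch of the idea, assuming TF eager execution; the key names are made up for illustration:

    import tensorflow as tf  # assumes eager mode (TF2, or TF1 with eager enabled)

    # A nested info structure, as an Exploration component might return.
    info = {"cur_scale": tf.constant(0.1), "last_noise": tf.constant([0.0, 0.5])}

    # Old helper: info.numpy() would raise AttributeError on a dict.
    # New helper: convert every leaf tensor of the structure.
    info_np = tf.nest.map_structure(lambda t: t.numpy(), info)
    print(info_np)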

View file

@ -13,7 +13,6 @@ from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.utils.annotations import override, DeveloperAPI
from ray.rllib.utils.debug import summarize
from ray.rllib.utils.exploration.exploration import Exploration
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.schedules import ConstantSchedule, PiecewiseSchedule
from ray.rllib.utils.tf_run_builder import TFRunBuilder
@ -332,8 +331,7 @@ class TFPolicy(Policy):
@override(Policy)
def get_exploration_info(self):
if isinstance(self.exploration, Exploration):
return self._sess.run(self.exploration_info)
return self._sess.run(self.exploration_info)
@override(Policy)
def get_weights(self):

View file

@ -20,16 +20,22 @@ def get_mean_action(alg, obs):
ray.init(num_cpus=10, object_store_memory=1e9)
CONFIGS = {
"SAC": {},
"SAC": {
"explore": False,
},
"ES": {
"explore": False,
"episodes_per_batch": 10,
"train_batch_size": 100,
"num_workers": 2,
"noise_size": 2500000,
"observation_filter": "MeanStdFilter"
},
"DQN": {},
"DQN": {
"explore": False
},
"APEX_DDPG": {
"explore": False,
"observation_filter": "MeanStdFilter",
"num_workers": 2,
"min_iter_time_s": 1,
@ -38,19 +44,21 @@ CONFIGS = {
},
},
"DDPG": {
"pure_exploration_steps": 0,
"exploration_ou_noise_scale": 0.0,
"explore": False,
"timesteps_per_iteration": 100
},
"PPO": {
"explore": False,
"num_sgd_iter": 5,
"train_batch_size": 1000,
"num_workers": 2
},
"A3C": {
"explore": False,
"num_workers": 1
},
"ARS": {
"explore": False,
"num_rollouts": 10,
"num_workers": 2,
"noise_size": 2500000,
@ -70,7 +78,7 @@ def test_ckpt_restore(use_object_store, alg_name, failures):
alg2 = cls(config=CONFIGS[name], env="CartPole-v0")
env = gym.make("CartPole-v0")
for _ in range(3):
for _ in range(2):
res = alg1.train()
print("current status: " + str(res))

View file

@ -4,6 +4,8 @@ import unittest
import ray
import ray.rllib.agents.a3c as a3c
import ray.rllib.agents.ddpg as ddpg
import ray.rllib.agents.ddpg.td3 as td3
import ray.rllib.agents.dqn as dqn
import ray.rllib.agents.impala as impala
import ray.rllib.agents.pg as pg
@ -27,9 +29,12 @@ def test_explorations(run,
# Test all frameworks.
for fw in ["torch", "eager", "tf"]:
if fw == "torch" and \
run in [dqn.DQNTrainer, dqn.SimpleQTrainer,
impala.ImpalaTrainer, sac.SACTrainer]:
run in [ddpg.DDPGTrainer, dqn.DQNTrainer, dqn.SimpleQTrainer,
impala.ImpalaTrainer, sac.SACTrainer, td3.TD3Trainer]:
continue
elif fw == "eager" and run in [ddpg.DDPGTrainer, td3.TD3Trainer]:
continue
print("Testing {} in framework={}".format(run, fw))
config["eager"] = (fw == "eager")
config["use_pytorch"] = (fw == "torch")
@ -38,9 +43,8 @@ def test_explorations(run,
# exploration class.
for exploration in [None, "Random"]:
if exploration == "Random":
# TODO(sven): Random doesn't work for cont. action spaces
# or IMPALA yet.
if env == "Pendulum-v0" or run is impala.ImpalaTrainer:
# TODO(sven): Random doesn't work for IMPALA yet.
if run is impala.ImpalaTrainer:
continue
config["exploration_config"] = {"type": "Random"}
print("exploration={}".format(exploration or "default"))
@ -108,6 +112,14 @@ class TestExplorations(unittest.TestCase):
np.array([0.0, 0.1, 0.0, 0.0]),
prev_a=np.array(1))
def test_ddpg(self):
test_explorations(
ddpg.DDPGTrainer,
"Pendulum-v0",
ddpg.DEFAULT_CONFIG,
np.array([0.0, 0.1, 0.0]),
expected_mean_action=0.0)
def test_simple_dqn(self):
test_explorations(dqn.SimpleQTrainer, "CartPole-v0",
dqn.DEFAULT_CONFIG, np.array([0.0, 0.1, 0.0, 0.0]))
@ -157,6 +169,14 @@ class TestExplorations(unittest.TestCase):
np.array([0.0, 0.1, 0.0]),
expected_mean_action=0.0)
def test_td3(self):
test_explorations(
td3.TD3Trainer,
"Pendulum-v0",
td3.TD3_DEFAULT_CONFIG,
np.array([0.0, 0.1, 0.0]),
expected_mean_action=0.0)
if __name__ == "__main__":
unittest.main(verbosity=2)

View file

@ -176,7 +176,9 @@ class ModelSupportedSpaces(unittest.TestCase):
def test_ddpg(self):
check_support(
"DDPG", {
"exploration_ou_noise_scale": 100.0,
"exploration_config": {
"ou_base_scale": 100.0
},
"timesteps_per_iteration": 1,
"use_state_preprocessor": True,
},

View file

@ -15,14 +15,15 @@ halfcheetah-ddpg:
env_config: {}
# === Exploration ===
exploration_should_anneal: True
schedule_max_timesteps: 100000
exploration_config:
initial_scale: 1.0
final_scale: 0.02
scale_timesteps: 10000
ou_base_scale: 0.1
ou_theta: 0.15
ou_sigma: 0.2
timesteps_per_iteration: 1000
exploration_fraction: 0.1
exploration_final_scale: 0.02
exploration_ou_noise_scale: 0.1
exploration_ou_theta: 0.15
exploration_ou_sigma: 0.2
target_network_update_freq: 0
tau: 0.001
@ -47,7 +48,6 @@ halfcheetah-ddpg:
# === Parallelism ===
num_workers: 0
num_gpus_per_worker: 0
per_worker_exploration: False
worker_side_prioritization: False
# === Evaluation ===

View file

@ -15,7 +15,8 @@ invertedpendulum-td3:
# === Exploration ===
learning_starts: 1000
pure_exploration_steps: 1000
exploration_config:
random_timesteps: 1000
# === Evaluation ===
evaluation_interval: 1

View file

@ -7,7 +7,8 @@ mountaincarcontinuous-apex-ddpg:
config:
clip_rewards: False
num_workers: 16
exploration_ou_noise_scale: 1.0
exploration_config:
ou_base_scale: 1.0
n_step: 3
target_network_update_freq: 50000
tau: 1.0

View file

@ -15,14 +15,16 @@ mountaincarcontinuous-ddpg:
env_config: {}
# === Exploration ===
exploration_should_anneal: True
schedule_max_timesteps: 100000
exploration_config:
initial_scale: 1.0
final_scale: 0.02
scale_timesteps: 40000
ou_base_scale: 0.75
ou_theta: 0.15
ou_sigma: 0.2
timesteps_per_iteration: 1000
exploration_fraction: 0.4
exploration_final_scale: 0.02
exploration_ou_noise_scale: 0.75
exploration_ou_theta: 0.15
exploration_ou_sigma: 0.2
target_network_update_freq: 0
tau: 0.01

View file

@ -17,7 +17,8 @@ mujoco-td3:
config:
# === Exploration ===
learning_starts: 10000
pure_exploration_steps: 10000
exploration_config:
random_timesteps: 10000
# === Evaluation ===
evaluation_interval: 5

View file

@ -15,14 +15,16 @@ pendulum-ddpg:
env_config: {}
# === Exploration ===
exploration_should_anneal: True
schedule_max_timesteps: 100000
exploration_config:
type: "OrnsteinUhlenbeckNoise"
scale_timesteps: 10000
initial_scale: 1.0
final_scale: 0.02
ou_base_scale: 0.1
ou_theta: 0.15
ou_sigma: 0.2
timesteps_per_iteration: 600
exploration_fraction: 0.1
exploration_final_scale: 0.02
exploration_ou_noise_scale: 0.1
exploration_ou_theta: 0.15
exploration_ou_sigma: 0.2
target_network_update_freq: 0
tau: 0.001
@ -47,7 +49,7 @@ pendulum-ddpg:
# === Parallelism ===
num_workers: 0
num_gpus_per_worker: 0
per_worker_exploration: False
#per_worker_exploration: False
worker_side_prioritization: False
# === Evaluation ===
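As a reference for the old-to-new key mapping in the hunks above, a minimal Python sketch of the equivalent trainer config dict; the per-key mapping comments are inferred from the removed YAML keys and are illustrative only:

import ray.rllib.agents.ddpg as ddpg

config = ddpg.DEFAULT_CONFIG.copy()
# The old flat keys (exploration_ou_noise_scale, exploration_ou_theta, ...)
# are now grouped under a single `exploration_config` dict.
config["exploration_config"] = {
    "type": "OrnsteinUhlenbeckNoise",
    "scale_timesteps": 10000,  # roughly exploration_fraction * schedule_max_timesteps
    "initial_scale": 1.0,
    "final_scale": 0.02,       # was: exploration_final_scale
    "ou_base_scale": 0.1,      # was: exploration_ou_noise_scale
    "ou_theta": 0.15,          # was: exploration_ou_theta
    "ou_sigma": 0.2,           # was: exploration_ou_sigma
}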

View file

@ -12,7 +12,8 @@ pendulum-ddpg:
# === Exploration ===
learning_starts: 5000
pure_exploration_steps: 5000
exploration_config:
random_timesteps: 5000
# === Evaluation ===
evaluation_interval: 1

View file

@ -7,4 +7,3 @@ pendulum-ddpg:
config:
use_huber: True
clip_rewards: False
exploration_fraction: 0.1

View file

@ -0,0 +1,6 @@
pendulum-td3:
env: Pendulum-v0
run: TD3
stop:
episode_reward_mean: -900
timesteps_total: 100000

View file

@ -1,7 +1,14 @@
from ray.rllib.utils.exploration.exploration import Exploration
from ray.rllib.utils.exploration.epsilon_greedy import EpsilonGreedy
from ray.rllib.utils.exploration.gaussian_noise import GaussianNoise
from ray.rllib.utils.exploration.ornstein_uhlenbeck_noise import \
OrnsteinUhlenbeckNoise
from ray.rllib.utils.exploration.per_worker_epsilon_greedy import \
PerWorkerEpsilonGreedy
from ray.rllib.utils.exploration.per_worker_gaussian_noise import \
PerWorkerGaussianNoise
from ray.rllib.utils.exploration.per_worker_ornstein_uhlenbeck_noise import \
PerWorkerOrnsteinUhlenbeckNoise
from ray.rllib.utils.exploration.random import Random
from ray.rllib.utils.exploration.soft_q import SoftQ
from ray.rllib.utils.exploration.stochastic_sampling import \
@ -10,7 +17,11 @@ from ray.rllib.utils.exploration.stochastic_sampling import \
__all__ = [
"Exploration",
"EpsilonGreedy",
"GaussianNoise",
"OrnsteinUhlenbeckNoise",
"PerWorkerEpsilonGreedy",
"PerWorkerGaussianNoise",
"PerWorkerOrnsteinUhlenbeckNoise",
"Random",
"SoftQ",
"StochasticSampling",

View file

@ -23,10 +23,9 @@ class EpsilonGreedy(Exploration):
initial_epsilon=1.0,
final_epsilon=0.05,
epsilon_timesteps=int(1e5),
num_workers=None,
worker_index=None,
epsilon_schedule=None,
framework="tf"):
framework="tf",
**kwargs):
"""
Args:
@ -35,21 +34,13 @@ class EpsilonGreedy(Exploration):
final_epsilon (float): The final epsilon value to use.
epsilon_timesteps (int): The time step after which epsilon should
always be `final_epsilon`.
num_workers (Optional[int]): The overall number of workers used.
worker_index (Optional[int]): The index of the Worker using this
Exploration.
epsilon_schedule (Optional[Schedule]): An optional Schedule object
to use (instead of constructing one from the given parameters).
framework (Optional[str]): One of None, "tf", "torch".
"""
# For now, require Discrete action space (may loosen this restriction
# in the future).
assert framework is not None
super().__init__(
action_space=action_space,
num_workers=num_workers,
worker_index=worker_index,
framework=framework)
action_space=action_space, framework=framework, **kwargs)
self.epsilon_schedule = epsilon_schedule or PiecewiseSchedule(
endpoints=[(0, initial_epsilon),
@ -85,16 +76,15 @@ class EpsilonGreedy(Exploration):
Returns:
tf.Tensor: The tf exploration-action op.
"""
epsilon = tf.convert_to_tensor(
self.epsilon_schedule(timestep if timestep is not None else
self.last_timestep))
epsilon = self.epsilon_schedule(timestep if timestep is not None else
self.last_timestep)
# Get the exploit action as the one with the highest logit value.
exploit_action = tf.argmax(q_values, axis=1)
batch_size = tf.shape(q_values)[0]
# Mask out actions with q-value=-inf so that we don't
# even consider them for exploration.
# Mask out actions with q-value=-inf so that we don't even consider
# them for exploration.
random_valid_action_logits = tf.where(
tf.equal(q_values, tf.float32.min),
tf.ones_like(q_values) * tf.float32.min, tf.ones_like(q_values))
@ -130,7 +120,7 @@ class EpsilonGreedy(Exploration):
Returns:
torch.Tensor: The exploration-action.
"""
# Set last time step or (if not given) increase by one.
# Set last timestep or (if not given) increase by one.
self.last_timestep = timestep if timestep is not None else \
self.last_timestep + 1
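For reference, the logic shared by the two code paths above (annealed epsilon, masking of -inf q-values, argmax exploit) can be written framework-free; this NumPy sketch is illustrative and mirrors, not replaces, the implementation above:

import numpy as np

def annealed_epsilon(t, initial=1.0, final=0.05, epsilon_timesteps=int(1e5)):
    # Linear anneal from `initial` to `final`, then stay at `final`.
    frac = min(t / float(epsilon_timesteps), 1.0)
    return initial + frac * (final - initial)

def epsilon_greedy(q_values, epsilon, rng=np.random):
    # q_values: [batch, num_actions]; returns one action index per row.
    exploit_actions = q_values.argmax(axis=1)
    # Actions whose q-value equals float32-min are masked out and never
    # picked, not even randomly.
    valid = q_values > np.finfo(np.float32).min
    random_actions = np.array(
        [rng.choice(np.flatnonzero(row)) for row in valid])
    use_random = rng.random(q_values.shape[0]) < epsilon
    return np.where(use_random, random_actions, exploit_actions)

q = np.array([[1.0, 2.0, np.finfo(np.float32).min]], dtype=np.float32)
print(epsilon_greedy(q, epsilon=annealed_epsilon(50000)))  # -> [1] or [0], never [2]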

View file

@ -13,6 +13,7 @@ class Exploration:
def __init__(self,
action_space=None,
*,
num_workers=None,
worker_index=None,
framework="tf"):

View file

@ -0,0 +1,165 @@
from ray.rllib.utils.annotations import override
from ray.rllib.utils.exploration.exploration import Exploration
from ray.rllib.utils.exploration.random import Random
from ray.rllib.utils.framework import try_import_tf, try_import_torch, \
get_variable
from ray.rllib.utils.schedules.piecewise_schedule import PiecewiseSchedule
tf = try_import_tf()
torch, _ = try_import_torch()
class GaussianNoise(Exploration):
"""An exploration that adds white noise to continuous actions.
If explore=True, returns the deterministic action plus `scale` (annealed
over time) times Gaussian noise. Optionally, a completely random phase can
precede this for the first `random_timesteps` timesteps.
If explore=False, returns the deterministic action.
"""
def __init__(self,
action_space,
*,
random_timesteps=1000,
stddev=0.1,
initial_scale=1.0,
final_scale=0.02,
scale_timesteps=10000,
scale_schedule=None,
framework="tf",
**kwargs):
"""Initializes a GaussianNoise Exploration object.
Args:
action_space (Space): The gym action space used by the environment.
random_timesteps (int): The number of timesteps for which to act
completely randomly. Only after this number of timesteps, the
`self.scale` annealing process will start (see below).
stddev (float): The stddev (sigma) to use for the
Gaussian noise to be added to the actions.
initial_scale (float): The initial scaling weight to multiply
the noise with.
final_scale (float): The final scaling weight to multiply
the noise with.
scale_timesteps (int): The timesteps over which to linearly anneal
the scaling factor (after(!) having used random actions for
`random_timesteps` steps).
scale_schedule (Optional[Schedule]): An optional Schedule object
to use (instead of constructing one from the given parameters).
framework (Optional[str]): One of None, "tf", "torch".
"""
assert framework is not None
super().__init__(action_space, framework=framework, **kwargs)
self.random_timesteps = random_timesteps
self.random_exploration = Random(
action_space, framework=self.framework)
self.stddev = stddev
# The `scale` annealing schedule.
self.scale_schedule = scale_schedule or PiecewiseSchedule(
endpoints=[(random_timesteps, initial_scale),
(random_timesteps + scale_timesteps, final_scale)],
outside_value=final_scale,
framework=self.framework)
# The current timestep value (tf-var or python int).
self.last_timestep = get_variable(
0, framework=self.framework, tf_name="timestep")
@override(Exploration)
def get_exploration_action(self,
distribution_inputs,
action_dist_class,
model=None,
explore=True,
timestep=None):
# Adds IID Gaussian noise for exploration, TD3-style.
action_dist = action_dist_class(distribution_inputs, model)
if self.framework == "torch":
return self._get_torch_exploration_action(action_dist, explore,
timestep)
else:
return self._get_tf_exploration_action_op(action_dist, explore,
timestep)
def _get_tf_exploration_action_op(self, action_dist, explore, timestep):
ts = timestep if timestep is not None else self.last_timestep
# The deterministic actions (if explore=False).
deterministic_actions = action_dist.deterministic_sample()
# Take a Gaussian sample with our stddev (mean=0.0) and scale it.
gaussian_sample = self.scale_schedule(ts) * tf.random_normal(
tf.shape(deterministic_actions), stddev=self.stddev)
# Stochastic actions could either be: random OR action + noise.
random_actions, _ = \
self.random_exploration.get_tf_exploration_action_op(
action_dist, explore)
stochastic_actions = tf.cond(
pred=ts <= self.random_timesteps,
true_fn=lambda: random_actions,
false_fn=lambda: tf.clip_by_value(
deterministic_actions + gaussian_sample,
self.action_space.low * tf.ones_like(deterministic_actions),
self.action_space.high * tf.ones_like(deterministic_actions))
)
# Choose by `explore` (the main exploration switch).
batch_size = tf.shape(deterministic_actions)[0]
action = tf.cond(
pred=tf.constant(explore, dtype=tf.bool)
if isinstance(explore, bool) else explore,
true_fn=lambda: stochastic_actions,
false_fn=lambda: deterministic_actions)
# Logp=always zero.
logp = tf.zeros(shape=(batch_size, ), dtype=tf.float32)
# Increment `last_timestep` by 1 (or set to `timestep`).
assign_op = \
tf.assign_add(self.last_timestep, 1) if timestep is None else \
tf.assign(self.last_timestep, timestep)
with tf.control_dependencies([assign_op]):
return action, logp
def _get_torch_exploration_action(self, action_dist, explore, timestep):
# Set last timestep or (if not given) increase by one.
self.last_timestep = timestep if timestep is not None else \
self.last_timestep + 1
# Apply exploration.
if explore:
# Random exploration phase.
if self.last_timestep <= self.random_timesteps:
action, _ = \
self.random_exploration.get_torch_exploration_action(
action_dist, True)
# Take a Gaussian sample with our stddev (mean=0.0) and scale it.
else:
det_actions = action_dist.deterministic_sample()
scale = self.scale_schedule(self.last_timestep)
gaussian_sample = scale * torch.normal(
mean=0.0, std=self.stddev, size=det_actions.size())
action = torch.clamp(
det_actions + gaussian_sample,
self.action_space.low * torch.ones_like(det_actions),
self.action_space.high * torch.ones_like(det_actions))
# No exploration -> Return deterministic actions.
else:
action = action_dist.deterministic_sample()
# Logp=always zero.
logp = torch.zeros((action.size()[0], ), dtype=torch.float32)
return action, logp
@override(Exploration)
def get_info(self):
"""Returns the current scale value.
Returns:
Union[float,tf.Tensor[float]]: The current scale value.
"""
return self.scale_schedule(self.last_timestep)
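As a framework-free reference for the behavior implemented above (pure random phase, then annealed Gaussian noise around the deterministic action, clipped to the action bounds), a short NumPy sketch; parameter names follow the constructor above and the values are illustrative:

import numpy as np

def gaussian_noise_action(det_action, t, low, high,
                          random_timesteps=1000, stddev=0.1,
                          initial_scale=1.0, final_scale=0.02,
                          scale_timesteps=10000, rng=np.random):
    # Mirror of the explore=True branch above, in plain NumPy.
    det_action = np.asarray(det_action, dtype=np.float32)
    if t <= random_timesteps:
        # Purely random phase at the beginning.
        return rng.uniform(low, high, size=det_action.shape)
    # Linearly anneal `scale` from initial_scale to final_scale.
    frac = min((t - random_timesteps) / float(scale_timesteps), 1.0)
    scale = initial_scale + frac * (final_scale - initial_scale)
    noise = scale * rng.normal(0.0, stddev, size=det_action.shape)
    return np.clip(det_action + noise, low, high)

# Example with a Pendulum-like 1-D action space [-2, 2].
print(gaussian_noise_action([0.5], t=5000, low=-2.0, high=2.0))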

View file

@ -0,0 +1,165 @@
from ray.rllib.utils.annotations import override
from ray.rllib.utils.exploration.gaussian_noise import GaussianNoise
from ray.rllib.utils.framework import try_import_tf, try_import_torch, \
get_variable
tf = try_import_tf()
torch, _ = try_import_torch()
class OrnsteinUhlenbeckNoise(GaussianNoise):
"""An exploration that adds Ornstein-Uhlenbeck noise to continuous actions.
If explore=True, returns the deterministic action plus a noise term derived
from an OU state X, which evolves according to:
X_{t+1} = (1 - theta) * X_t + sigma * N(0, stddev), where theta, sigma, and
stddev are constants. Optionally, a completely random phase can precede this
for the first `random_timesteps` timesteps.
If explore=False, returns the deterministic action.
"""
def __init__(self,
action_space,
*,
ou_theta=0.15,
ou_sigma=0.2,
ou_base_scale=0.1,
random_timesteps=1000,
initial_scale=1.0,
final_scale=0.02,
scale_timesteps=10000,
scale_schedule=None,
framework="tf",
**kwargs):
"""Initializes an Ornstein-Uhlenbeck Exploration object.
Args:
action_space (Space): The gym action space used by the environment.
ou_theta (float): The theta parameter of the Ornstein-Uhlenbeck
process.
ou_sigma (float): The sigma parameter of the Ornstein-Uhlenbeck
process.
ou_base_scale (float): A fixed scaling factor, by which all OU-
noise is multiplied. NOTE: This is on top of the parent
GaussianNoise's scaling.
random_timesteps (int): The number of timesteps for which to act
completely randomly. Only after this number of timesteps, the
`self.scale` annealing process will start (see below).
initial_scale (float): The initial scaling weight to multiply
the noise with.
final_scale (float): The final scaling weight to multiply
the noise with.
scale_timesteps (int): The timesteps over which to linearly anneal
the scaling factor (after(!) having used random actions for
`random_timesteps` steps).
scale_schedule (Optional[Schedule]): An optional Schedule object
to use (instead of constructing one from the given parameters).
framework (Optional[str]): One of None, "tf", "torch".
"""
super().__init__(
action_space,
random_timesteps=random_timesteps,
initial_scale=initial_scale,
final_scale=final_scale,
scale_timesteps=scale_timesteps,
scale_schedule=scale_schedule,
stddev=1.0, # Force `self.stddev` to 1.0.
framework=framework,
**kwargs)
self.ou_theta = ou_theta
self.ou_sigma = ou_sigma
self.ou_base_scale = ou_base_scale
# The current OU-state value (gets updated each time an exploration
# action is computed).
self.ou_state = get_variable(
self.action_space.low.size * [.0],
framework=self.framework,
tf_name="ou_state")
@override(GaussianNoise)
def _get_tf_exploration_action_op(self, action_dist, explore, timestep):
ts = timestep if timestep is not None else self.last_timestep
scale = self.scale_schedule(ts)
# The deterministic actions (if explore=False).
deterministic_actions = action_dist.deterministic_sample()
# Apply base-scaled and time-annealed scaled OU-noise to
# deterministic actions.
gaussian_sample = tf.random_normal(
shape=[self.action_space.low.size], stddev=self.stddev)
ou_new = self.ou_theta * -self.ou_state + \
self.ou_sigma * gaussian_sample
ou_state_new = tf.assign_add(self.ou_state, ou_new)
noise = scale * self.ou_base_scale * ou_state_new * \
(self.action_space.high - self.action_space.low)
stochastic_actions = tf.clip_by_value(
deterministic_actions + noise,
self.action_space.low * tf.ones_like(deterministic_actions),
self.action_space.high * tf.ones_like(deterministic_actions))
# Stochastic actions could either be: random OR action + noise.
random_actions, _ = \
self.random_exploration.get_tf_exploration_action_op(
action_dist, explore)
exploration_actions = tf.cond(
pred=ts <= self.random_timesteps,
true_fn=lambda: random_actions,
false_fn=lambda: stochastic_actions)
# Choose by `explore` (the main exploration switch).
action = tf.cond(
pred=tf.constant(explore, dtype=tf.bool)
if isinstance(explore, bool) else explore,
true_fn=lambda: exploration_actions,
false_fn=lambda: deterministic_actions)
# Logp=always zero.
batch_size = tf.shape(deterministic_actions)[0]
logp = tf.zeros(shape=(batch_size, ), dtype=tf.float32)
# Increment `last_timestep` by 1 (or set to `timestep`).
assign_op = \
tf.assign_add(self.last_timestep, 1) if timestep is None else \
tf.assign(self.last_timestep, timestep)
with tf.control_dependencies([assign_op, ou_state_new]):
return action, logp
@override(GaussianNoise)
def _get_torch_exploration_action(self, action_dist, explore, timestep):
# Set last timestep or (if not given) increase by one.
self.last_timestep = timestep if timestep is not None else \
self.last_timestep + 1
# Apply exploration.
if explore:
# Random exploration phase.
if self.last_timestep <= self.random_timesteps:
action, _ = self.random_exploration.get_torch_exploration_action(
action_dist, True)
# Apply base-scaled and time-annealed scaled OU-noise to
# deterministic actions.
else:
det_actions = action_dist.deterministic_sample()
scale = self.scale_schedule(self.last_timestep)
gaussian_sample = scale * torch.normal(
mean=0.0, std=1.0, size=det_actions.size())
ou_new = self.ou_theta * -self.ou_state + \
self.ou_sigma * gaussian_sample
self.ou_state += ou_new
noise = scale * self.ou_base_scale * self.ou_state * \
(self.action_space.high - self.action_space.low)
action = torch.clamp(
det_actions + noise,
self.action_space.low * torch.ones_like(det_actions),
self.action_space.high * torch.ones_like(det_actions))
# No exploration -> Return deterministic actions.
else:
action = action_dist.deterministic_sample()
# Logp=always zero.
logp = torch.zeros((action.size()[0], ), dtype=torch.float32)
return action, logp
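To make the recursion above concrete, a plain-Python sketch of the OU state update and of how the state is turned into action noise (scaled by the annealed `scale`, `ou_base_scale`, and the action range); the values are illustrative:

import numpy as np

def ou_step(ou_state, theta=0.15, sigma=0.2, rng=np.random):
    # X_{t+1} = (1 - theta) * X_t + sigma * N(0, 1)
    return ou_state + theta * -ou_state + sigma * rng.normal(size=ou_state.shape)

def ou_action_noise(ou_state, scale, ou_base_scale, low, high):
    # The noise actually added to the deterministic action before clipping.
    return scale * ou_base_scale * ou_state * (high - low)

state = np.zeros(1)  # 1-D action space, e.g. Pendulum-v0's [-2, 2]
low, high = np.array([-2.0]), np.array([2.0])
for _ in range(3):
    state = ou_step(state)
    print(ou_action_noise(state, scale=1.0, ou_base_scale=0.1, low=low, high=high))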

View file

@ -12,19 +12,14 @@ class PerWorkerEpsilonGreedy(EpsilonGreedy):
def __init__(self,
action_space,
initial_epsilon=1.0,
final_epsilon=0.1,
epsilon_timesteps=int(1e5),
*,
num_workers=0,
worker_index=0,
framework="tf"):
framework="tf",
**kwargs):
"""
Args:
action_space (Space): The gym action space used by the environment.
initial_epsilon (float): The initial epsilon value to use.
final_epsilon (float): The final epsilon value to use.
epsilon_timesteps (int): The time step after which epsilon should
always be `final_epsilon`.
num_workers (Optional[int]): The overall number of workers used.
worker_index (Optional[int]): The index of the Worker using this
Exploration.
@ -42,11 +37,7 @@ class PerWorkerEpsilonGreedy(EpsilonGreedy):
epsilon_schedule = ConstantSchedule(0.0)
super().__init__(
action_space=action_space,
initial_epsilon=initial_epsilon,
final_epsilon=final_epsilon,
epsilon_timesteps=epsilon_timesteps,
num_workers=num_workers,
worker_index=worker_index,
action_space,
epsilon_schedule=epsilon_schedule,
framework=framework,
epsilon_schedule=epsilon_schedule)
**kwargs)

View file

@ -0,0 +1,43 @@
from ray.rllib.utils.exploration.gaussian_noise import GaussianNoise
from ray.rllib.utils.schedules import ConstantSchedule
class PerWorkerGaussianNoise(GaussianNoise):
"""A per-worker Gaussian noise class for distributed algorithms.
Sets the `scale` schedules of individual workers to a constant:
0.4 ^ (1 + [worker-index] / float([num-workers] - 1) * 7)
See Ape-X paper.
"""
def __init__(self,
action_space,
*,
num_workers=0,
worker_index=0,
framework="tf",
**kwargs):
"""
Args:
action_space (Space): The gym action space used by the environment.
num_workers (Optional[int]): The overall number of workers used.
worker_index (Optional[int]): The index of the Worker using this
Exploration.
framework (Optional[str]): One of None, "tf", "torch".
"""
scale_schedule = None
# Use a fixed, different scale per worker. See: Ape-X paper.
if num_workers > 0:
if worker_index >= 0:
exponent = (1 + worker_index / float(num_workers - 1) * 7)
scale_schedule = ConstantSchedule(0.4**exponent)
# Local worker should have zero exploration so that eval
# rollouts run properly.
else:
scale_schedule = ConstantSchedule(0.0)
super().__init__(
action_space,
scale_schedule=scale_schedule,
framework=framework,
**kwargs)
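The per-worker constant derived above spreads exploration geometrically across workers (Ape-X style); a small sketch of the resulting scales, mirroring the branching above, for an assumed 8-worker setup:

def per_worker_scale(worker_index, num_workers):
    # Mirrors the scale_schedule selection in the constructor above.
    if num_workers <= 0:
        return None  # keep the default (annealed) scale schedule
    if worker_index >= 0:
        exponent = 1 + worker_index / float(num_workers - 1) * 7
        return 0.4 ** exponent
    return 0.0  # zero exploration (evaluation rollouts)

print([round(per_worker_scale(i, 8), 4) for i in range(8)])
# -> [0.4, 0.16, 0.064, 0.0256, 0.0102, 0.0041, 0.0016, 0.0007]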

View file

@ -0,0 +1,44 @@
from ray.rllib.utils.exploration.ornstein_uhlenbeck_noise import \
OrnsteinUhlenbeckNoise
from ray.rllib.utils.schedules import ConstantSchedule
class PerWorkerOrnsteinUhlenbeckNoise(OrnsteinUhlenbeckNoise):
"""A per-worker Ornstein Uhlenbeck noise class for distributed algorithms.
Sets the Gaussian `scale` schedules of individual workers to a constant:
0.4 ^ (1 + [worker-index] / float([num-workers] - 1) * 7)
See Ape-X paper.
"""
def __init__(self,
action_space,
*,
num_workers=0,
worker_index=0,
framework="tf",
**kwargs):
"""
Args:
action_space (Space): The gym action space used by the environment.
num_workers (Optional[int]): The overall number of workers used.
worker_index (Optional[int]): The index of the Worker using this
Exploration.
framework (Optional[str]): One of None, "tf", "torch".
"""
scale_schedule = None
# Use a fixed, different scale per worker. See: Ape-X paper.
if num_workers > 0:
if worker_index >= 0:
exponent = (1 + worker_index / float(num_workers - 1) * 7)
scale_schedule = ConstantSchedule(0.4**exponent)
# Local worker should have zero exploration so that eval
# rollouts run properly.
else:
scale_schedule = ConstantSchedule(0.0)
super().__init__(
action_space,
scale_schedule=scale_schedule,
framework=framework,
**kwargs)

View file

@ -1,4 +1,4 @@
from gym.spaces import Discrete
from gym.spaces import Discrete, MultiDiscrete, Tuple
from ray.rllib.utils.annotations import override
from ray.rllib.utils.exploration.exploration import Exploration
@ -18,17 +18,24 @@ class Random(Exploration):
If explore=False, returns the greedy/max-likelihood action.
"""
def __init__(self, action_space, framework="tf", **kwargs):
def __init__(self, action_space, *, framework="tf", **kwargs):
"""Initialize a Random Exploration object.
Args:
action_space (Space): The gym action space used by the environment.
framework (Optional[str]): One of None, "tf", "torch".
"""
assert isinstance(action_space, Discrete)
super().__init__(
action_space=action_space, framework=framework, **kwargs)
# Determine py_func types, depending on our action-space.
if isinstance(self.action_space, (Discrete, MultiDiscrete)) or \
(isinstance(self.action_space, Tuple) and
isinstance(self.action_space[0], (Discrete, MultiDiscrete))):
self.dtype_sample, self.dtype = (tf.int64, tf.int32)
else:
self.dtype_sample, self.dtype = (tf.float64, tf.float32)
@override(Exploration)
def get_exploration_action(self,
distribution_inputs,
@ -38,23 +45,22 @@ class Random(Exploration):
timestep=None):
# Instantiate the distribution object.
action_dist = action_dist_class(distribution_inputs, model)
if self.framework == "tf":
return self._get_tf_exploration_action_op(action_dist, explore,
timestep)
return self.get_tf_exploration_action_op(action_dist, explore)
else:
return self._get_torch_exploration_action(action_dist, explore,
timestep)
return self.get_torch_exploration_action(action_dist, explore)
@tf_function(tf)
def _get_tf_exploration_action_op(self, action_dist, explore, timestep):
def get_tf_exploration_action_op(self, action_dist, explore):
if explore:
action = tf.py_function(self.action_space.sample, [], tf.int64)
action = tf.py_function(self.action_space.sample, [],
self.dtype_sample)
# Will be unnecessary, once we support batch/time-aware Spaces.
action = tf.expand_dims(tf.cast(action, dtype=tf.int32), 0)
action = tf.expand_dims(tf.cast(action, dtype=self.dtype), 0)
else:
action = tf.cast(
action_dist.deterministic_sample(), dtype=tf.int32)
action_dist.deterministic_sample(), dtype=self.dtype)
# TODO(sven): Move into (deterministic_)sample(logp=True|False)
if isinstance(action, TupleActions):
batch_size = tf.shape(action[0][0])[0]
@ -63,12 +69,15 @@ class Random(Exploration):
logp = tf.zeros(shape=(batch_size, ), dtype=tf.float32)
return action, logp
def _get_torch_exploration_action(self, action_dist, explore, timestep):
def get_torch_exploration_action(self, action_dist, explore):
tensor_fn = torch.LongTensor if \
type(self.action_space) in [Discrete, MultiDiscrete] else \
torch.FloatTensor
if explore:
# Unsqueeze will be unnecessary, once we support batch/time-aware
# Spaces.
action = torch.LongTensor(self.action_space.sample()).unsqueeze(0)
action = tensor_fn(self.action_space.sample()).unsqueeze(0)
else:
action = torch.LongTensor(action_dist.deterministic_sample())
action = tensor_fn(action_dist.deterministic_sample())
logp = torch.zeros((action.size()[0], ), dtype=torch.float32)
return action, logp
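The dtype branching above only distinguishes integer-valued spaces (Discrete/MultiDiscrete, or Tuples of them) from float-valued ones; a tiny gym sketch of the two sample types (assuming gym is installed):

from gym.spaces import Box, Discrete

print(type(Discrete(4).sample()))                  # integer sample -> int64/int32 path
print(Box(-1.0, 1.0, shape=(3,)).sample().dtype)   # float32 array -> float64/float32 path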

View file

@ -10,7 +10,11 @@ class SoftQ(StochasticSampling):
output divided by the temperature. Returns the argmax iff explore=False.
"""
def __init__(self, action_space, temperature=1.0, framework="tf",
def __init__(self,
action_space,
*,
temperature=1.0,
framework="tf",
**kwargs):
"""Initializes a SoftQ Exploration object.
@ -19,11 +23,10 @@ class SoftQ(StochasticSampling):
temperature (Schedule): The temperature to divide model outputs by
before creating the Categorical distribution to sample from.
framework (Optional[str]): One of None, "tf", "torch".
kwargs (dict): Passed on to super constructor.
"""
assert isinstance(action_space, Discrete)
super().__init__(
action_space=action_space,
action_space,
static_params=dict(temperature=temperature),
framework=framework,
**kwargs)
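A compact NumPy illustration of what temperature-scaled soft-Q sampling does to the action distribution; the logits and temperature values here are made up:

import numpy as np

def soft_q_probs(q_values, temperature=1.0):
    # Softmax over q_values / temperature (the explore=True path).
    z = np.asarray(q_values, dtype=np.float64) / temperature
    z -= z.max()  # numerical stability
    e = np.exp(z)
    return e / e.sum()

q = [1.0, 2.0, 3.0]
print(soft_q_probs(q, temperature=1.0))   # clearly favors the best action
print(soft_q_probs(q, temperature=10.0))  # nearly uniform
print(int(np.argmax(q)))                  # explore=False: plain argmax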

View file

@ -18,24 +18,24 @@ class StochasticSampling(Exploration):
def __init__(self,
action_space,
framework="tf",
*,
static_params=None,
time_dependent_params=None,
framework="tf",
**kwargs):
"""Initializes a StochasticSampling Exploration object.
Args:
action_space (Space): The gym action space used by the environment.
framework (Optional[str]): One of None, "tf", "torch".
static_params (Optional[dict]): Parameters to be passed as-is into
the action distribution class' constructor.
time_dependent_params (dict): Parameters to be evaluated based on
`timestep` and then passed into the action distribution
class' constructor.
framework (Optional[str]): One of None, "tf", "torch".
"""
assert framework is not None
super().__init__(
action_space=action_space, framework=framework, **kwargs)
super().__init__(action_space, framework=framework, **kwargs)
self.static_params = static_params or {}

View file

@ -47,6 +47,6 @@ class PiecewiseSchedule(Schedule):
alpha = float(t - l_t) / (r_t - l_t)
return self.interpolation(l, r, alpha)
# t does not belong to any of the pieces, so doom.
# t does not belong to any of the pieces, return `self.outside_value`.
assert self.outside_value is not None
return self.outside_value
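For reference, a self-contained sketch of the lookup above under the usual linear-interpolation assumption; the endpoints and queried timesteps are illustrative:

def piecewise_value(t, endpoints, outside_value=None):
    # Linear interpolation between (t, value) endpoints, as sketched above.
    for (l_t, l), (r_t, r) in zip(endpoints[:-1], endpoints[1:]):
        if l_t <= t < r_t:
            alpha = float(t - l_t) / (r_t - l_t)
            return l + alpha * (r - l)
    # t does not belong to any of the pieces: fall back to `outside_value`.
    assert outside_value is not None
    return outside_value

# E.g. an epsilon schedule annealing from 1.0 to 0.05 over 100k timesteps.
endpoints = [(0, 1.0), (100000, 0.05)]
print(piecewise_value(50000, endpoints, outside_value=0.05))   # -> 0.525
print(piecewise_value(250000, endpoints, outside_value=0.05))  # -> 0.05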

View file

@ -39,11 +39,11 @@ class Schedule(metaclass=ABCMeta):
raise NotImplementedError
def value(self, t):
if self.framework == "tf" and tf.executing_eagerly() is False:
if self.framework == "tf":
return tf.cast(
tf.py_func(self._value, [t], tf.float64),
tf.py_function(self._value, [t], tf.float64),
tf.float32,
name="schedule-value")
name="schedule_value")
return self._value(t)
def __call__(self, t):