Mirror of https://github.com/vale981/ray, synced 2025-03-06 02:21:39 -05:00
[RLlib] DDPG refactor and Exploration API action noise classes. (#7314)
* WIP. * WIP. * WIP. * WIP. * WIP. * Fix * WIP. * Add TD3 quick Pendulum regression. * Cleanup. * Fix. * LINT. * Fix. * Sort quick_learning test cases, add TD3. * Sort quick_learning test cases, add TD3. * Revert test_checkpoint_restore.py (debugging) changes. * Fix old soft_q settings in documentation and test configs. * More doc fixes. * Fix test case. * Fix test case. * Lower test load. * WIP.
This commit is contained in: parent 3c6b94f3f5, commit 83e06cd30a
41 changed files with 1294 additions and 777 deletions
@@ -45,8 +45,7 @@ Then, we can tell DQN to train using these previously generated experiences with
    --config='{
        "input": "/tmp/cartpole-out",
        "input_evaluation": [],
        "exploration_final_eps": 0,
        "exploration_fraction": 0}'
        "explore": false}'

**Off-policy estimation:** Since the input experiences are not from running simulations, RLlib cannot report the true policy performance during training. However, you can use ``tensorboard --logdir=~/ray_results`` to monitor training progress via other metrics such as estimated Q-value. Alternatively, `off-policy estimation <https://arxiv.org/pdf/1511.03722.pdf>`__ can be used, which requires both the source and target action probabilities to be available (i.e., the ``action_prob`` batch key). For DQN, this means enabling soft Q learning so that actions are sampled from a probability distribution:
@@ -58,8 +57,10 @@ Then, we can tell DQN to train using these previously generated experiences with
    --config='{
        "input": "/tmp/cartpole-out",
        "input_evaluation": ["is", "wis"],
        "soft_q": true,
        "softmax_temp": 1.0}'
        "exploration_config": {
            "type": "SoftQ",
            "temperature": 1.0,
        }'

This example plot shows the Q-value metric in addition to importance sampling (IS) and weighted importance sampling (WIS) gain estimates (>1.0 means there is an estimated improvement over the original policy):
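The paragraph above only states that IS and WIS need the behavior policy's ``action_prob``. As a rough illustration of the underlying idea (a generic per-step importance-sampling return estimate, not RLlib's estimator code), assuming a hypothetical ``new_action_prob(obs, action)`` helper that returns the trained policy's action probability:

    def is_return_estimate(episode, new_action_prob, gamma=0.99):
        # `episode` is a list of logged steps, each carrying "obs", "action",
        # "reward" and the behavior policy's "action_prob".
        weight, total = 1.0, 0.0
        for t, step in enumerate(episode):
            # Cumulative likelihood ratio between target and behavior policy.
            weight *= new_action_prob(step["obs"], step["action"]) / step["action_prob"]
            total += weight * (gamma ** t) * step["reward"]
        return total

WIS additionally normalizes these cumulative weights across episodes, trading a small bias for much lower variance.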
@@ -121,8 +122,7 @@ RLlib supports multiplexing inputs from multiple input sources, including simula
        "hdfs:/archive/cartpole": 0.3,
        "sampler": 0.3,
    },
    "exploration_final_eps": 0,
    "exploration_fraction": 0}'
    "explore": false}'

Scaling I/O throughput
~~~~~~~~~~~~~~~~~~~~~~
786  rllib/BUILD
@@ -58,6 +58,22 @@ py_test(
# Tag: agents_dir
# --------------------------------------------------------------------

# A2CTrainer
py_test(
    name = "test_a2c",
    tags = ["agents_dir"],
    size = "small",
    srcs = ["agents/a3c/tests/test_a2c.py"]
)

# DDPGTrainer
py_test(
    name = "test_ddpg",
    tags = ["agents_dir"],
    size = "medium",
    srcs = ["agents/ddpg/tests/test_ddpg.py"]
)

# DQNTrainer
py_test(
    name = "test_dqn",
@@ -66,12 +82,12 @@ py_test(
    srcs = ["agents/dqn/tests/test_dqn.py"]
)

# A2CTrainer
# IMPALA
py_test(
    name = "test_a2c",
    name = "test_vtrace",
    tags = ["agents_dir"],
    size = "small",
    srcs = ["agents/a3c/tests/test_a2c.py"]
    srcs = ["agents/impala/tests/test_vtrace.py"]
)

# PGTrainer
@@ -91,12 +107,12 @@ py_test(
        "agents/ppo/tests/test.py"]  # TODO(sven): Move down once PR 6889 merged
)

# IMPALA
# TD3Trainer
py_test(
    name = "test_vtrace",
    name = "test_td3",
    tags = ["agents_dir"],
    size = "small",
    srcs = ["agents/impala/tests/test_vtrace.py"]
    size = "medium",
    srcs = ["agents/ddpg/tests/test_td3.py"]
)

# --------------------------------------------------------------------
@ -255,6 +271,390 @@ py_test(
|
|||
]
|
||||
)
|
||||
|
||||
|
||||
# DDPG/APEX-DDPG/TD3
|
||||
|
||||
py_test(
|
||||
name = "test_ddpg_pendulum_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "Pendulum-v0",
|
||||
"--run", "DDPG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_workers\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_ddpg_mountaincar_continuous_v0_num_workers_0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "MountainCarContinuous-v0",
|
||||
"--run", "DDPG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_workers\": 0}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_ddpg_mountaincar_continuous_v0_num_workers_1",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "MountainCarContinuous-v0",
|
||||
"--run", "DDPG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_workers\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_apex_ddpg_pendulum_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "Pendulum-v0",
|
||||
"--run", "APEX_DDPG",
|
||||
"--ray-num-cpus", "8",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1}'",
|
||||
"--ray-num-cpus", "4"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_apex_ddpg_pendulum_v0_complete_episode_batches",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "Pendulum-v0",
|
||||
"--run", "APEX_DDPG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1, \"batch_mode\": \"complete_episodes\", \"parameter_noise\": false}'",
|
||||
"--ray-num-cpus", "4",
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_td3_pendulum_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "Pendulum-v0",
|
||||
"--run", "TD3",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_workers\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
# DQN/APEX
|
||||
|
||||
py_test(
|
||||
name = "test_dqn_frozenlake_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
size = "small",
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "FrozenLake-v0",
|
||||
"--run", "DQN",
|
||||
"--stop", "'{\"training_iteration\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_dqn_cartpole_v0_no_dueling",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
size = "small",
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "DQN",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"lr\": 1e-3, \"exploration_config\": {\"epsilon_timesteps\": 10000, \"final_epsilon\": 0.02}, \"dueling\": false, \"hiddens\": [], \"model\": {\"fcnet_hiddens\": [64], \"fcnet_activation\": \"relu\"}}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_dqn_cartpole_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "DQN",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_workers\": 2}'",
|
||||
"--ray-num-cpus", "4"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_dqn_cartpole_v0_with_offline_input_and_softq",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train", "external_files"],
|
||||
size = "small",
|
||||
# Include the json data file.
|
||||
data = glob(["tests/data/cartpole_small/**"]),
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "DQN",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"input\": \"tests/data/cartpole_small\", \"learning_starts\": 0, \"input_evaluation\": [\"wis\", \"is\"], \"exploration_config\": {\"type\": \"SoftQ\"}}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_dqn_pong_deterministic_v4",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "PongDeterministic-v4",
|
||||
"--run", "DQN",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"lr\": 1e-4, \"exploration_config\": {\"epsilon_timesteps\": 200000, \"final_epsilon\": 0.01}, \"buffer_size\": 10000, \"sample_batch_size\": 4, \"learning_starts\": 10000, \"target_network_update_freq\": 1000, \"gamma\": 0.99, \"prioritized_replay\": true}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_apex_cartpole_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "APEX",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_workers\": 2, \"timesteps_per_iteration\": 1000, \"num_gpus\": 0, \"min_iter_time_s\": 1}'",
|
||||
"--ray-num-cpus", "4"
|
||||
]
|
||||
)
|
||||
|
||||
# ES
|
||||
|
||||
py_test(
|
||||
name = "test_es_pendulum_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "Pendulum-v0",
|
||||
"--run", "ES",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"stepsize\": 0.01, \"episodes_per_batch\": 20, \"train_batch_size\": 100, \"num_workers\": 2}'",
|
||||
"--ray-num-cpus", "4"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_es_pong_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "Pong-v0",
|
||||
"--run", "ES",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"stepsize\": 0.01, \"episodes_per_batch\": 20, \"train_batch_size\": 100, \"num_workers\": 2}'",
|
||||
"--ray-num-cpus", "4"
|
||||
]
|
||||
)
|
||||
|
||||
# IMPALA
|
||||
|
||||
py_test(
|
||||
name = "test_impala_cartpole_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "IMPALA",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1}'",
|
||||
"--ray-num-cpus", "4",
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_impala_cartpole_v0_num_aggregation_workers_2",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "IMPALA",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"num_aggregation_workers\": 2, \"min_iter_time_s\": 1}'",
|
||||
"--ray-num-cpus", "5",
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_impala_cartpole_v0_lstm",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "IMPALA",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"model\": {\"use_lstm\": true}}'",
|
||||
"--ray-num-cpus", "4",
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_impala_buffers_2",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "IMPALA",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_data_loader_buffers\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0}'",
|
||||
"--ray-num-cpus", "4",
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_impala_cartpole_v0_buffers_2_lstm",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "IMPALA",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_data_loader_buffers\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0, \"model\": {\"use_lstm\": true}}'",
|
||||
"--ray-num-cpus", "4",
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_impala_pong_deterministic_v4_40k_ts_1G_obj_store",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "PongDeterministic-v4",
|
||||
"--run", "IMPALA",
|
||||
"--stop", "'{\"timesteps_total\": 40000}'",
|
||||
"--ray-object-store-memory=1000000000",
|
||||
"--config", "'{\"num_workers\": 1, \"num_gpus\": 0, \"num_envs_per_worker\": 32, \"sample_batch_size\": 50, \"train_batch_size\": 50, \"learner_queue_size\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
# From test_rollout.sh (deprecated test file).
|
||||
py_test(
|
||||
name = "test_impala_rollout",
|
||||
main = "tests/test_rollout.py",
|
||||
data = ["train.py", "rollout.py"],
|
||||
tags = ["quick_train"],
|
||||
srcs = ["tests/test_rollout.py"]
|
||||
)
|
||||
|
||||
# MARWIL
|
||||
|
||||
py_test(
|
||||
name = "test_marwil_cartpole_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train", "external_files"],
|
||||
size = "small",
|
||||
# Include the json data file.
|
||||
data = glob(["tests/data/cartpole_small/**"]),
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "MARWIL",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"input\": \"tests/data/cartpole_small\", \"learning_starts\": 0, \"input_evaluation\": [\"wis\", \"is\"], \"shuffle_buffer_size\": 10}'"
|
||||
]
|
||||
)
|
||||
|
||||
# PG
|
||||
|
||||
py_test(
|
||||
name = "test_pg_tf_frozenlake_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "FrozenLake-v0",
|
||||
"--run", "PG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_pg_torch_frozenlake_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
size = "small",
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--torch",
|
||||
"--env", "FrozenLake-v0",
|
||||
"--run", "PG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_pg_tf_cartpole_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
size = "small",
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "PG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_pg_torch_cartpole_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--torch",
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "PG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"sample_batch_size\": 500}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_pg_tf_cartpole_v0_lstm",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "PG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1, \"model\": {\"use_lstm\": true, \"max_seq_len\": 100}}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_pg_tf_cartpole_v0_multi_envs_per_worker",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
size = "small",
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "PG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1, \"num_envs_per_worker\": 10}'"
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
py_test(
|
||||
name = "test_pg_tf_pong_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "Pong-v0",
|
||||
"--run", "PG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
# PPO/APPO
|
||||
|
||||
py_test(
|
||||
|
@ -424,378 +824,6 @@ py_test(
|
|||
]
|
||||
)
|
||||
|
||||
# ES
|
||||
|
||||
py_test(
|
||||
name = "test_es_pendulum_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "Pendulum-v0",
|
||||
"--run", "ES",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"stepsize\": 0.01, \"episodes_per_batch\": 20, \"train_batch_size\": 100, \"num_workers\": 2}'",
|
||||
"--ray-num-cpus", "4"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_es_pong_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "Pong-v0",
|
||||
"--run", "ES",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"stepsize\": 0.01, \"episodes_per_batch\": 20, \"train_batch_size\": 100, \"num_workers\": 2}'",
|
||||
"--ray-num-cpus", "4"
|
||||
]
|
||||
)
|
||||
|
||||
# DQN/APEX
|
||||
|
||||
py_test(
|
||||
name = "test_dqn_frozenlake_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
size = "small",
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "FrozenLake-v0",
|
||||
"--run", "DQN",
|
||||
"--stop", "'{\"training_iteration\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_dqn_cartpole_v0_no_dueling",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
size = "small",
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "DQN",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"lr\": 1e-3, \"schedule_max_timesteps\": 100000, \"exploration_fraction\": 0.1, \"exploration_final_eps\": 0.02, \"dueling\": false, \"hiddens\": [], \"model\": {\"fcnet_hiddens\": [64], \"fcnet_activation\": \"relu\"}}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_dqn_cartpole_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "DQN",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_workers\": 2}'",
|
||||
"--ray-num-cpus", "4"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_dqn_cartpole_v0_with_offline_input_and_softq",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train", "external_files"],
|
||||
size = "small",
|
||||
# Include the json data file.
|
||||
data = glob(["tests/data/cartpole_small/**"]),
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "DQN",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"input\": \"tests/data/cartpole_small\", \"learning_starts\": 0, \"input_evaluation\": [\"wis\", \"is\"], \"soft_q\": true}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_dqn_pong_deterministic_v4",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "PongDeterministic-v4",
|
||||
"--run", "DQN",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"lr\": 1e-4, \"schedule_max_timesteps\": 2000000, \"buffer_size\": 10000, \"exploration_fraction\": 0.1, \"exploration_final_eps\": 0.01, \"sample_batch_size\": 4, \"learning_starts\": 10000, \"target_network_update_freq\": 1000, \"gamma\": 0.99, \"prioritized_replay\": true}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_apex_cartpole_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "APEX",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_workers\": 2, \"timesteps_per_iteration\": 1000, \"num_gpus\": 0, \"min_iter_time_s\": 1}'",
|
||||
"--ray-num-cpus", "4"
|
||||
]
|
||||
)
|
||||
|
||||
# PG
|
||||
|
||||
py_test(
|
||||
name = "test_pg_tf_frozenlake_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "FrozenLake-v0",
|
||||
"--run", "PG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_pg_torch_frozenlake_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
size = "small",
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--torch",
|
||||
"--env", "FrozenLake-v0",
|
||||
"--run", "PG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_pg_tf_cartpole_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
size = "small",
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "PG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_pg_torch_cartpole_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--torch",
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "PG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"sample_batch_size\": 500}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_pg_tf_cartpole_v0_lstm",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "PG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1, \"model\": {\"use_lstm\": true, \"max_seq_len\": 100}}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_pg_tf_cartpole_v0_multi_envs_per_worker",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
size = "small",
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "PG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1, \"num_envs_per_worker\": 10}'"
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
py_test(
|
||||
name = "test_pg_tf_pong_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "Pong-v0",
|
||||
"--run", "PG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
# DDPG/APEX-DDPG
|
||||
|
||||
py_test(
|
||||
name = "test_ddpg_pendulum_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "Pendulum-v0",
|
||||
"--run", "DDPG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_workers\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_ddpg_mountaincar_continuous_v0_num_workers_0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "MountainCarContinuous-v0",
|
||||
"--run", "DDPG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_workers\": 0}'"
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
py_test(
|
||||
name = "test_ddpg_mountaincar_continuous_v0_num_workers_1",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "MountainCarContinuous-v0",
|
||||
"--run", "DDPG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_workers\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_apex_ddpg_pendulum_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "Pendulum-v0",
|
||||
"--run", "APEX_DDPG",
|
||||
"--ray-num-cpus", "8",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1}'",
|
||||
"--ray-num-cpus", "4"
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_apex_ddpg_pendulum_v0_complete_episode_batches",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "Pendulum-v0",
|
||||
"--run", "APEX_DDPG",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1, \"batch_mode\": \"complete_episodes\", \"parameter_noise\": false}'",
|
||||
"--ray-num-cpus", "4",
|
||||
]
|
||||
)
|
||||
|
||||
# IMPALA
|
||||
|
||||
py_test(
|
||||
name = "test_impala_cartpole_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "IMPALA",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1}'",
|
||||
"--ray-num-cpus", "4",
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_impala_cartpole_v0_num_aggregation_workers_2",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "IMPALA",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"num_aggregation_workers\": 2, \"min_iter_time_s\": 1}'",
|
||||
"--ray-num-cpus", "5",
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_impala_cartpole_v0_lstm",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "IMPALA",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"model\": {\"use_lstm\": true}}'",
|
||||
"--ray-num-cpus", "4",
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_impala_buffers_2",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "IMPALA",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_data_loader_buffers\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0}'",
|
||||
"--ray-num-cpus", "4",
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_impala_cartpole_v0_buffers_2_lstm",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "IMPALA",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_data_loader_buffers\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0, \"model\": {\"use_lstm\": true}}'",
|
||||
"--ray-num-cpus", "4",
|
||||
]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_impala_pong_deterministic_v4_40k_ts_1G_obj_store",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train"],
|
||||
args = [
|
||||
"--env", "PongDeterministic-v4",
|
||||
"--run", "IMPALA",
|
||||
"--stop", "'{\"timesteps_total\": 40000}'",
|
||||
"--ray-object-store-memory=1000000000",
|
||||
"--config", "'{\"num_workers\": 1, \"num_gpus\": 0, \"num_envs_per_worker\": 32, \"sample_batch_size\": 50, \"train_batch_size\": 50, \"learner_queue_size\": 1}'"
|
||||
]
|
||||
)
|
||||
|
||||
# From test_rollout.sh (deprecated test file).
|
||||
py_test(
|
||||
name = "test_impala_rollout",
|
||||
main = "tests/test_rollout.py",
|
||||
data = ["train.py", "rollout.py"],
|
||||
tags = ["quick_train"],
|
||||
srcs = ["tests/test_rollout.py"]
|
||||
)
|
||||
|
||||
# MARWIL
|
||||
|
||||
py_test(
|
||||
name = "test_marwil_cartpole_v0",
|
||||
main = "train.py", srcs = ["train.py"],
|
||||
tags = ["quick_train", "external_files"],
|
||||
size = "small",
|
||||
# Include the json data file.
|
||||
data = glob(["tests/data/cartpole_small/**"]),
|
||||
args = [
|
||||
"--env", "CartPole-v0",
|
||||
"--run", "MARWIL",
|
||||
"--stop", "'{\"training_iteration\": 1}'",
|
||||
"--config", "'{\"input\": \"tests/data/cartpole_small\", \"learning_starts\": 0, \"input_evaluation\": [\"wis\", \"is\"], \"shuffle_buffer_size\": 10}'"
|
||||
]
|
||||
)
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Models and Distributions
|
||||
# rllib/models/
@@ -1,12 +1,5 @@
from ray.rllib.agents.ddpg.apex import ApexDDPGTrainer
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, DEFAULT_CONFIG
from ray.rllib.agents.ddpg.td3 import TD3Trainer
from ray.rllib.utils import renamed_agent

ApexDDPGAgent = renamed_agent(ApexDDPGTrainer)
DDPGAgent = renamed_agent(DDPGTrainer)

__all__ = [
    "DDPGAgent", "ApexDDPGAgent", "DDPGTrainer", "ApexDDPGTrainer",
    "TD3Trainer", "DEFAULT_CONFIG"
]
__all__ = ["ApexDDPGTrainer", "DDPGTrainer", "DEFAULT_CONFIG", "TD3Trainer"]
@@ -1,17 +1,18 @@
from ray.rllib.agents.dqn.apex import APEX_TRAINER_PROPERTIES
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
    DEFAULT_CONFIG as DDPG_CONFIG
from ray.rllib.utils import merge_dicts

APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
APEX_DDPG_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
    DDPG_CONFIG,  # see also the options in ddpg.py, which are also supported
    {
        "optimizer": merge_dicts(
            DDPG_CONFIG["optimizer"], {
                "max_weight_sync_delay": 400,
                "num_replay_buffer_shards": 4,
                "debug": False
            }),
        "optimizer": {
            "max_weight_sync_delay": 400,
            "num_replay_buffer_shards": 4,
            "debug": False
        },
        "exploration_config": {
            "type": "PerWorkerOrnsteinUhlenbeckNoise"
        },
        "n_step": 3,
        "num_gpus": 0,
        "num_workers": 32,

@@ -21,7 +22,6 @@ APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
        "sample_batch_size": 50,
        "target_network_update_freq": 500000,
        "timesteps_per_iteration": 25000,
        "per_worker_exploration": True,
        "worker_side_prioritization": True,
        "min_iter_time_s": 30,
    },
@@ -1,7 +1,10 @@
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
from ray.rllib.agents.ddpg.ddpg_policy import DDPGTFPolicy
from ray.rllib.utils.schedules import ConstantSchedule, PiecewiseSchedule
from ray.rllib.utils.deprecation import deprecation_warning, \
    DEPRECATED_VALUE
from ray.rllib.utils.exploration.per_worker_ornstein_uhlenbeck_noise import \
    PerWorkerOrnsteinUhlenbeckNoise

# yapf: disable
# __sphinx_doc_begin__
@@ -55,49 +58,35 @@ DEFAULT_CONFIG = with_common_config({
    "n_step": 1,

    # === Exploration ===
    # Turns on annealing schedule for exploration noise. Exploration is
    # annealed from 1.0 to exploration_final_eps over schedule_max_timesteps
    # scaled by exploration_fraction. Original DDPG and TD3 papers do not
    # anneal noise, so this is False by default.
    "exploration_should_anneal": False,
    # Max num timesteps for annealing schedules.
    "schedule_max_timesteps": 100000,
    "exploration_config": {
        # DDPG uses OrnsteinUhlenbeck (stateful) noise to be added to NN-output
        # actions (after a possible pure random phase of n timesteps).
        "type": "OrnsteinUhlenbeckNoise",
        # For how many timesteps should we return completely random actions,
        # before we start adding (scaled) noise?
        "random_timesteps": 1000,
        # The OU-base scaling factor to always apply to action-added noise.
        "ou_base_scale": 0.1,
        # The OU theta param.
        "ou_theta": 0.15,
        # The OU sigma param.
        "ou_sigma": 0.2,
        # The initial noise scaling factor.
        "initial_scale": 1.0,
        # The final noise scaling factor.
        "final_scale": 1.0,
        # Timesteps over which to anneal scale (from initial to final values).
        "scale_timesteps": 10000,
    },
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
    "exploration_fraction": 0.1,
    # Final scaling multiplier for action noise (initial is 1.0)
    "exploration_final_scale": 0.02,
    # valid values: "ou" (time-correlated, like original DDPG paper),
    # "gaussian" (IID, like TD3 paper)
    "exploration_noise_type": "ou",
    # OU-noise scale; this can be used to scale down magnitude of OU noise
    # before adding to actions (requires "exploration_noise_type" to be "ou")
    "exploration_ou_noise_scale": 0.1,
    # theta for OU
    "exploration_ou_theta": 0.15,
    # sigma for OU
    "exploration_ou_sigma": 0.2,
    # gaussian stddev of act noise for exploration (requires
    # "exploration_noise_type" to be "gaussian")
    "exploration_gaussian_sigma": 0.1,
    # If True parameter space noise will be used for exploration
    # See https://blog.openai.com/better-exploration-with-parameter-noise/
    "parameter_noise": False,
    # Until this many timesteps have elapsed, the agent's policy will be
    # ignored & it will instead take uniform random actions. Can be used in
    # conjunction with learning_starts (which controls when the first
    # optimization step happens) to decrease dependence of exploration &
    # optimization on initial policy parameters. Note that this will be
    # disabled when the action noise scale is set to 0 (e.g during evaluation).
    "pure_exploration_steps": 1000,
    # Extra configuration that disables exploration.
    "evaluation_config": {
        "exploration_fraction": 0,
        "exploration_final_eps": 0,
        "explore": False
    },

    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
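The new `exploration_config` above only documents the Ornstein-Uhlenbeck (OU) parameters in comments. As a rough, framework-free sketch of the general OU recurrence those parameters feed into (an illustration only, not the code of RLlib's `OrnsteinUhlenbeckNoise` class), the stateful noise evolves and is scaled onto the deterministic action roughly like this:

    import numpy as np

    def ou_step(state, theta=0.15, sigma=0.2, rng=np.random):
        # One OU step: the state decays toward 0 at rate `theta` and is
        # perturbed by Gaussian noise with stddev `sigma`.
        return state + theta * (0.0 - state) + sigma * rng.normal(size=state.shape)

    # Example: noise added to a deterministic action, scaled by "ou_base_scale"
    # and by the (possibly annealed) scale between "initial_scale"/"final_scale".
    ou_state = np.zeros(1)
    deterministic_action = np.array([0.3])
    ou_base_scale, anneal_scale = 0.1, 1.0
    ou_state = ou_step(ou_state)
    noisy_action = deterministic_action + ou_base_scale * anneal_scale * ou_state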
@@ -150,8 +139,6 @@ DEFAULT_CONFIG = with_common_config({
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span
@@ -161,76 +148,47 @@ DEFAULT_CONFIG = with_common_config({
# yapf: enable


def make_exploration_schedule(config, worker_index):
    # Modification of DQN's schedule to take into account
    # `exploration_ou_noise_scale`
    if config["per_worker_exploration"]:
        assert config["num_workers"] > 1, "This requires multiple workers"
        if worker_index >= 0:
            # FIXME: what do magic constants mean? (0.4, 7)
            max_index = float(config["num_workers"] - 1)
            exponent = 1 + worker_index / max_index * 7
            return ConstantSchedule(0.4**exponent)
        else:
            # local ev should have zero exploration so that eval rollouts
            # run properly
            return ConstantSchedule(0.0)
    elif config["exploration_should_anneal"]:
        return PiecewiseSchedule(
            endpoints=[(0, 1.0), (int(config["exploration_fraction"] *
                                      config["schedule_max_timesteps"]),
                                  config["exploration_final_scale"])],
            outside_value=config["exploration_final_scale"])
    else:
        # *always* add exploration noise
        return ConstantSchedule(1.0)


def setup_ddpg_exploration(trainer):
    trainer.exploration0 = make_exploration_schedule(trainer.config, -1)
    trainer.explorations = [
        make_exploration_schedule(trainer.config, i)
        for i in range(trainer.config["num_workers"])
    ]


def update_worker_explorations(trainer):
    global_timestep = trainer.optimizer.num_steps_sampled
    exp_vals = [trainer.exploration0.value(global_timestep)]
    trainer.workers.local_worker().foreach_trainable_policy(
        lambda p, _: p.set_epsilon(exp_vals[0]))
    for i, e in enumerate(trainer.workers.remote_workers()):
        exp_val = trainer.explorations[i].value(global_timestep)
        e.foreach_trainable_policy.remote(lambda p, _: p.set_epsilon(exp_val))
        exp_vals.append(exp_val)
    trainer.train_start_timestep = global_timestep
    trainer.exploration_infos = exp_vals


def add_pure_exploration_phase(trainer):
    global_timestep = trainer.optimizer.num_steps_sampled
    pure_expl_steps = trainer.config["pure_exploration_steps"]
    if pure_expl_steps:
        # tell workers whether they should do pure exploration
        only_explore = global_timestep < pure_expl_steps
        trainer.workers.local_worker().foreach_trainable_policy(
            lambda p, _: p.set_pure_exploration_phase(only_explore))
        for e in trainer.workers.remote_workers():
            e.foreach_trainable_policy.remote(
                lambda p, _: p.set_pure_exploration_phase(only_explore))
    update_worker_explorations(trainer)


def validate_config(config):
    # PyTorch check.
    if config["use_pytorch"]:
        raise ValueError("DDPG does not support PyTorch yet! Use tf instead.")

    # TODO(sven): Remove at some point.
    # Backward compatibility of noise-based exploration config.
    schedule_max_timesteps = None
    if config.get("schedule_max_timesteps", DEPRECATED_VALUE) != \
            DEPRECATED_VALUE:
        deprecation_warning("schedule_max_timesteps",
                            "exploration_config.scale_timesteps")
        schedule_max_timesteps = config["schedule_max_timesteps"]
    if config.get("exploration_final_scale", DEPRECATED_VALUE) != \
            DEPRECATED_VALUE:
        deprecation_warning("exploration_final_scale",
                            "exploration_config.final_scale")
        if isinstance(config["exploration_config"], dict):
            config["exploration_config"]["final_scale"] = \
                config.pop("exploration_final_scale")
    if config.get("exploration_fraction", DEPRECATED_VALUE) != \
            DEPRECATED_VALUE:
        assert schedule_max_timesteps is not None
        deprecation_warning("exploration_fraction",
                            "exploration_config.scale_timesteps")
        if isinstance(config["exploration_config"], dict):
            config["exploration_config"]["scale_timesteps"] = config.pop(
                "exploration_fraction") * schedule_max_timesteps
    if config.get("per_worker_exploration", DEPRECATED_VALUE) != \
            DEPRECATED_VALUE:
        deprecation_warning(
            "per_worker_exploration",
            "exploration_config.type=PerWorkerOrnsteinUhlenbeckNoise")
        if isinstance(config["exploration_config"], dict):
            config["exploration_config"]["type"] = \
                PerWorkerOrnsteinUhlenbeckNoise


DDPGTrainer = GenericOffPolicyTrainer.with_updates(
    name="DDPG",
    default_config=DEFAULT_CONFIG,
    default_policy=DDPGTFPolicy,
    validate_config=validate_config,
    before_init=setup_ddpg_exploration,
    before_train_step=add_pure_exploration_phase)
)
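To make the backward-compatibility shims in `validate_config` above concrete, here is a hedged illustration (the values are example numbers chosen for the illustration, not defaults taken from this patch) of how an old-style flat exploration config maps onto the new nested `exploration_config`:

    # Old-style keys, now deprecated by this patch ...
    old_style = {
        "schedule_max_timesteps": 100000,
        "exploration_fraction": 0.1,
        "exploration_final_scale": 0.02,
        "per_worker_exploration": False,
    }

    # ... are folded into the nested exploration_config roughly like this:
    new_style = {
        "exploration_config": {
            # per_worker_exploration=True would instead select the
            # PerWorkerOrnsteinUhlenbeckNoise class as the type.
            "type": "OrnsteinUhlenbeckNoise",
            # exploration_fraction * schedule_max_timesteps
            "scale_timesteps": int(0.1 * 100000),
            # exploration_final_scale
            "final_scale": 0.02,
        },
    }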
@ -7,6 +7,7 @@ from ray.rllib.agents.dqn.dqn_policy import postprocess_nstep_and_prio
|
|||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.tf.tf_action_dist import Deterministic
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.policy.policy import Policy
|
||||
|
@ -42,20 +43,24 @@ class DDPGPostprocessing:
|
|||
list(x) for x in sample_batch.columns(
|
||||
[SampleBatch.CUR_OBS, SampleBatch.ACTIONS])
|
||||
]
|
||||
self.sess.run(self.remove_noise_op)
|
||||
clean_actions = self.sess.run(
|
||||
self.output_actions,
|
||||
self.sess.run(self.remove_parameter_noise_op)
|
||||
|
||||
# TODO(sven): This won't work if exploration != Noise, which is
|
||||
# probably fine as parameter_noise will soon be its own
|
||||
# Exploration class.
|
||||
clean_actions, cur_noise_scale = self.sess.run(
|
||||
[self.output_actions,
|
||||
self.exploration.get_info()],
|
||||
feed_dict={
|
||||
self.cur_observations: states,
|
||||
self.stochastic: False,
|
||||
self.noise_scale: .0,
|
||||
self.pure_exploration_phase: False,
|
||||
self._is_exploring: False,
|
||||
})
|
||||
distance_in_action_space = np.sqrt(
|
||||
np.mean(np.square(clean_actions - noisy_actions)))
|
||||
self.pi_distance = distance_in_action_space
|
||||
if distance_in_action_space < \
|
||||
self.config["exploration_ou_sigma"] * self.cur_noise_scale:
|
||||
self.config["exploration_config"].get("ou_sigma", 0.2) * \
|
||||
cur_noise_scale:
|
||||
# multiplying the sampled OU noise by noise scale is
|
||||
# equivalent to multiplying the sigma of OU by noise scale
|
||||
self.parameter_noise_sigma_val *= 1.01
|
||||
|
@ -82,14 +87,11 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
|
|||
"using a Tuple action space, or the multi-agent API.")
|
||||
|
||||
self.config = config
|
||||
self.cur_noise_scale = 1.0
|
||||
self.cur_pure_exploration_phase = False
|
||||
self.dim_actions = action_space.shape[0]
|
||||
self.low_action = action_space.low
|
||||
self.high_action = action_space.high
|
||||
|
||||
# create global step for counting the number of update operations
|
||||
# Create global step for counting the number of update operations.
|
||||
self.global_step = tf.train.get_or_create_global_step()
|
||||
# Create sampling timestep placeholder.
|
||||
timestep = tf.placeholder(tf.int32, (), name="timestep")
|
||||
|
||||
# use separate optimizers for actor & critic
|
||||
self._actor_optimizer = tf.train.AdamOptimizer(
|
||||
|
@ -97,11 +99,7 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
|
|||
self._critic_optimizer = tf.train.AdamOptimizer(
|
||||
learning_rate=self.config["critic_lr"])
|
||||
|
||||
# Action inputs
|
||||
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
|
||||
self.noise_scale = tf.placeholder(tf.float32, (), name="noise_scale")
|
||||
self.pure_exploration_phase = tf.placeholder(
|
||||
tf.bool, (), name="pure_exploration_phase")
|
||||
# Observation inputs.
|
||||
self.cur_observations = tf.placeholder(
|
||||
tf.float32,
|
||||
shape=(None, ) + observation_space.shape,
|
||||
|
@ -118,19 +116,14 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
|
|||
var for var in self.policy_vars if "LayerNorm" not in var.name
|
||||
])
|
||||
|
||||
# Create exploration component.
|
||||
self.exploration = self._create_exploration(action_space, config)
|
||||
explore = tf.placeholder_with_default(True, (), name="is_exploring")
|
||||
# Action outputs
|
||||
with tf.variable_scope(ACTION_SCOPE):
|
||||
self.output_actions = self._add_exploration_noise(
|
||||
policy_out, self.stochastic, self.noise_scale,
|
||||
self.pure_exploration_phase, action_space)
|
||||
|
||||
if self.config["smooth_target_policy"]:
|
||||
self.reset_noise_op = tf.no_op()
|
||||
else:
|
||||
with tf.variable_scope(ACTION_SCOPE, reuse=True):
|
||||
exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
|
||||
self.reset_noise_op = tf.assign(exploration_sample,
|
||||
self.dim_actions * [.0])
|
||||
self.output_actions, _ = self.exploration.get_exploration_action(
|
||||
policy_out, Deterministic, self.policy_model, explore,
|
||||
timestep)
|
||||
|
||||
# Replay inputs
|
||||
self.obs_t = tf.placeholder(
|
||||
|
@ -294,7 +287,9 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
|
|||
sampled_action=self.output_actions,
|
||||
loss=self.actor_loss + self.critic_loss,
|
||||
loss_inputs=self.loss_inputs,
|
||||
update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops)
|
||||
update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops,
|
||||
explore=explore,
|
||||
timestep=timestep)
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
|
||||
# Note that this encompasses both the policy and Q-value networks and
|
||||
|
@ -364,16 +359,6 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
|
|||
+ self._critic_grads_and_vars
|
||||
return grads_and_vars
|
||||
|
||||
@override(TFPolicy)
|
||||
def extra_compute_action_feed_dict(self):
|
||||
return {
|
||||
# FIXME: what about turning off exploration? Isn't that a good
|
||||
# idea?
|
||||
self.stochastic: True,
|
||||
self.noise_scale: self.cur_noise_scale,
|
||||
self.pure_exploration_phase: self.cur_pure_exploration_phase,
|
||||
}
|
||||
|
||||
@override(TFPolicy)
|
||||
def extra_compute_grad_fetches(self):
|
||||
return {
|
||||
|
@ -389,19 +374,6 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
|
|||
def set_weights(self, weights):
|
||||
self.variables.set_weights(weights)
|
||||
|
||||
@override(Policy)
|
||||
def get_state(self):
|
||||
return [
|
||||
TFPolicy.get_state(self), self.cur_noise_scale,
|
||||
self.cur_pure_exploration_phase
|
||||
]
|
||||
|
||||
@override(Policy)
|
||||
def set_state(self, state):
|
||||
TFPolicy.set_state(self, state[0])
|
||||
self.set_epsilon(state[1])
|
||||
self.set_pure_exploration_phase(state[2])
|
||||
|
||||
def _build_q_network(self, obs, obs_space, action_space, actions):
|
||||
if self.config["use_state_preprocessor"]:
|
||||
q_model = ModelCatalog.get_model({
|
||||
|
@ -444,7 +416,7 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
|
|||
action_out = tf.layers.dense(
|
||||
action_out, units=hidden, activation=activation)
|
||||
action_out = tf.layers.dense(
|
||||
action_out, units=self.dim_actions, activation=None)
|
||||
action_out, units=action_space.shape[0], activation=None)
|
||||
|
||||
# Use sigmoid to scale to [0,1], but also double magnitude of input to
|
||||
# emulate behaviour of tanh activation used in DDPG and TD3 papers.
|
||||
|
@ -458,81 +430,6 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
|
|||
|
||||
return actions, model
|
||||
|
||||
def _add_exploration_noise(self, deterministic_actions,
|
||||
should_be_stochastic, noise_scale,
|
||||
enable_pure_exploration, action_space):
|
||||
noise_type = self.config["exploration_noise_type"]
|
||||
action_low = action_space.low
|
||||
action_high = action_space.high
|
||||
action_range = action_space.high - action_low
|
||||
|
||||
def compute_stochastic_actions():
|
||||
def make_noisy_actions():
|
||||
# shape of deterministic_actions is [None, dim_action]
|
||||
if noise_type == "gaussian":
|
||||
# add IID Gaussian noise for exploration, TD3-style
|
||||
normal_sample = noise_scale * tf.random_normal(
|
||||
tf.shape(deterministic_actions),
|
||||
stddev=self.config["exploration_gaussian_sigma"])
|
||||
stochastic_actions = tf.clip_by_value(
|
||||
deterministic_actions + normal_sample,
|
||||
action_low * tf.ones_like(deterministic_actions),
|
||||
action_high * tf.ones_like(deterministic_actions))
|
||||
elif noise_type == "ou":
|
||||
# add OU noise for exploration, DDPG-style
|
||||
zero_acts = action_low.size * [.0]
|
||||
exploration_sample = tf.get_variable(
|
||||
name="ornstein_uhlenbeck",
|
||||
dtype=tf.float32,
|
||||
initializer=zero_acts,
|
||||
trainable=False)
|
||||
normal_sample = tf.random_normal(
|
||||
shape=[action_low.size], mean=0.0, stddev=1.0)
|
||||
ou_new = self.config["exploration_ou_theta"] \
|
||||
* -exploration_sample \
|
||||
+ self.config["exploration_ou_sigma"] * normal_sample
|
||||
exploration_value = tf.assign_add(exploration_sample,
|
||||
ou_new)
|
||||
base_scale = self.config["exploration_ou_noise_scale"]
|
||||
noise = noise_scale * base_scale \
|
||||
* exploration_value * action_range
|
||||
stochastic_actions = tf.clip_by_value(
|
||||
deterministic_actions + noise,
|
||||
action_low * tf.ones_like(deterministic_actions),
|
||||
action_high * tf.ones_like(deterministic_actions))
|
||||
else:
|
||||
raise ValueError(
|
||||
"Unknown noise type '%s' (try 'ou' or 'gaussian')" %
|
||||
noise_type)
|
||||
return stochastic_actions
|
||||
|
||||
def make_uniform_random_actions():
|
||||
# pure random exploration option
|
||||
uniform_random_actions = tf.random_uniform(
|
||||
tf.shape(deterministic_actions))
|
||||
# rescale uniform random actions according to action range
|
||||
tf_range = tf.constant(action_range[None], dtype="float32")
|
||||
tf_low = tf.constant(action_low[None], dtype="float32")
|
||||
uniform_random_actions = uniform_random_actions * tf_range \
|
||||
+ tf_low
|
||||
return uniform_random_actions
|
||||
|
||||
stochastic_actions = tf.cond(
|
||||
# need to condition on noise_scale > 0 because zeroing
|
||||
# noise_scale is how a worker signals no noise should be used
|
||||
# (this is ugly and should be fixed by adding an "eval_mode"
|
||||
# config flag or something)
|
||||
tf.logical_and(enable_pure_exploration, noise_scale > 0),
|
||||
true_fn=make_uniform_random_actions,
|
||||
false_fn=make_noisy_actions)
|
||||
return stochastic_actions
|
||||
|
||||
enable_stochastic = tf.logical_and(should_be_stochastic,
|
||||
not self.config["parameter_noise"])
|
||||
actions = tf.cond(enable_stochastic, compute_stochastic_actions,
|
||||
lambda: deterministic_actions)
|
||||
return actions
|
||||
|
||||
def _build_actor_critic_loss(self,
|
||||
q_t,
|
||||
q_tp1,
|
||||
|
@ -580,7 +477,8 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
|
|||
return critic_loss, actor_loss, td_error
|
||||
|
||||
def _build_parameter_noise(self, pnet_params):
|
||||
self.parameter_noise_sigma_val = self.config["exploration_ou_sigma"]
|
||||
self.parameter_noise_sigma_val = \
|
||||
self.config["exploration_config"].get("ou_sigma", 0.2)
|
||||
self.parameter_noise_sigma = tf.get_variable(
|
||||
initializer=tf.constant_initializer(
|
||||
self.parameter_noise_sigma_val),
|
||||
|
@ -600,7 +498,7 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
|
|||
remove_noise_ops = list()
|
||||
for var, var_noise in zip(pnet_params, self.parameter_noise):
|
||||
remove_noise_ops.append(tf.assign_add(var, -var_noise))
|
||||
self.remove_noise_op = tf.group(*tuple(remove_noise_ops))
|
||||
self.remove_parameter_noise_op = tf.group(*tuple(remove_noise_ops))
|
||||
generate_noise_ops = list()
|
||||
for var_noise in self.parameter_noise:
|
||||
generate_noise_ops.append(
|
||||
|
@ -630,9 +528,6 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
|
|||
})
|
||||
return td_err
|
||||
|
||||
def reset_noise(self, sess):
|
||||
sess.run(self.reset_noise_op)
|
||||
|
||||
def add_parameter_noise(self):
|
||||
if self.config["parameter_noise"]:
|
||||
self.sess.run(self.add_noise_op)
|
||||
|
@ -642,13 +537,3 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
|
|||
tau = tau or self.tau_value
|
||||
return self.sess.run(
|
||||
self.update_target_expr, feed_dict={self.tau: tau})
|
||||
|
||||
def set_epsilon(self, epsilon):
|
||||
# set_epsilon is called by optimizer to anneal exploration as
|
||||
# necessary, and to turn it off during evaluation. The "epsilon" part
|
||||
# is a carry-over from DQN, which uses epsilon-greedy exploration
|
||||
# rather than adding action noise to the output of a policy network.
|
||||
self.cur_noise_scale = epsilon
|
||||
|
||||
def set_pure_exploration_phase(self, pure_exploration_phase):
|
||||
self.cur_pure_exploration_phase = pure_exploration_phase
|
||||
|
|
|
@@ -3,12 +3,10 @@
By default, this uses a near-identical configuration to that reported in the
TD3 paper.
"""

from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
    DEFAULT_CONFIG as DDPG_CONFIG
from ray.rllib.utils import merge_dicts

TD3_DEFAULT_CONFIG = merge_dicts(
TD3_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
    DDPG_CONFIG,
    {
        # largest changes: twin Q functions, delayed policy updates, and target
@@ -18,15 +16,27 @@ TD3_DEFAULT_CONFIG = merge_dicts(
        "smooth_target_policy": True,
        "target_noise": 0.2,
        "target_noise_clip": 0.5,
        "exploration_config": {
            # TD3 uses simple Gaussian noise on top of deterministic NN-output
            # actions (after a possible pure random phase of n timesteps).
            "type": "GaussianNoise",
            # For how many timesteps should we return completely random
            # actions, before we start adding (scaled) noise?
            "random_timesteps": 10000,
            # Gaussian stddev of action noise for exploration.
            "stddev": 0.1,
            # Scaling settings by which the Gaussian noise is scaled before
            # being added to the actions. NOTE: The scale timesteps start only
            # after(!) any random steps have been finished.
            # By default, do not anneal over time (fixed 1.0).
            "initial_scale": 1.0,
            "final_scale": 1.0,
            "scale_timesteps": 1
        },

        # other changes & things we want to keep fixed: IID Gaussian
        # exploration noise, larger actor learning rate, no l2 regularisation,
        # no Huber loss, etc.
        "exploration_should_anneal": False,
        "exploration_noise_type": "gaussian",
        "exploration_gaussian_sigma": 0.1,
        # other changes & things we want to keep fixed:
        # larger actor learning rate, no l2 regularisation, no Huber loss, etc.
        "learning_starts": 10000,
        "pure_exploration_steps": 10000,
        "actor_hiddens": [400, 300],
        "critic_hiddens": [400, 300],
        "n_step": 1,
@@ -40,14 +50,12 @@ TD3_DEFAULT_CONFIG = merge_dicts(
        "target_network_update_freq": 0,
        "num_workers": 0,
        "num_gpus_per_worker": 0,
        "per_worker_exploration": False,
        "worker_side_prioritization": False,
        "buffer_size": 1000000,
        "prioritized_replay": False,
        "clip_rewards": False,
        "use_state_preprocessor": False,
    },
)
})

TD3Trainer = DDPGTrainer.with_updates(
    name="TD3", default_config=TD3_DEFAULT_CONFIG)
87  rllib/agents/ddpg/tests/test_ddpg.py (new file)
@ -0,0 +1,87 @@
|
|||
import numpy as np
|
||||
import unittest
|
||||
|
||||
import ray.rllib.agents.ddpg as ddpg
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.test_utils import check
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
class TestDDPG(unittest.TestCase):
|
||||
def test_ddpg_compilation(self):
|
||||
"""Test whether a DDPGTrainer can be built with both frameworks."""
|
||||
config = ddpg.DEFAULT_CONFIG.copy()
|
||||
config["num_workers"] = 0 # Run locally.
|
||||
|
||||
# Test against all frameworks.
|
||||
for fw in ["tf", "eager", "torch"]:
|
||||
if fw != "tf":
|
||||
continue
|
||||
config["eager"] = True if fw == "eager" else False
|
||||
config["use_pytorch"] = True if fw == "torch" else False
|
||||
trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
|
||||
num_iterations = 2
|
||||
for i in range(num_iterations):
|
||||
results = trainer.train()
|
||||
print(results)
|
||||
|
||||
def test_ddpg_exploration_and_with_random_prerun(self):
|
||||
"""Tests DDPG's Exploration (w/ random actions for n timesteps)."""
|
||||
config = ddpg.DEFAULT_CONFIG.copy()
|
||||
config["num_workers"] = 0 # Run locally.
|
||||
obs = np.array([0.0, 0.1, -0.1])
|
||||
|
||||
# Test against all frameworks.
|
||||
for fw in ["tf", "eager", "torch"]:
|
||||
if fw != "tf":
|
||||
continue
|
||||
config["eager"] = True if fw == "eager" else False
|
||||
config["use_pytorch"] = True if fw == "torch" else False
|
||||
|
||||
# Default OUNoise setup.
|
||||
trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
|
||||
# Setting explore=False should always return the same action.
|
||||
a_ = trainer.compute_action(obs, explore=False)
|
||||
for _ in range(50):
|
||||
a = trainer.compute_action(obs, explore=False)
|
||||
check(a, a_)
|
||||
# explore=None (default: explore) should return different actions.
|
||||
actions = []
|
||||
for _ in range(50):
|
||||
actions.append(trainer.compute_action(obs))
|
||||
check(np.std(actions), 0.0, false=True)
|
||||
|
||||
# Check randomness at beginning.
|
||||
config["exploration_config"] = {
|
||||
# Act randomly at beginning ...
|
||||
"random_timesteps": 50,
|
||||
# Then act very closely to deterministic actions thereafter.
|
||||
"ou_base_scale": 0.001,
|
||||
"initial_scale": 0.001,
|
||||
"final_scale": 0.001,
|
||||
}
|
||||
trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
|
||||
# ts=1 (get a deterministic action as per explore=False).
|
||||
deterministic_action = trainer.compute_action(obs, explore=False)
|
||||
# ts=2-50 (in random window).
|
||||
random_a = []
|
||||
for _ in range(49):
|
||||
random_a.append(trainer.compute_action(obs, explore=True))
|
||||
check(random_a[-1], deterministic_action, false=True)
|
||||
self.assertTrue(np.std(random_a) > 0.5)
|
||||
|
||||
# ts > 50 (a=deterministic_action + scale * N[0,1])
|
||||
for _ in range(50):
|
||||
a = trainer.compute_action(obs, explore=True)
|
||||
check(a, deterministic_action, rtol=0.1)
|
||||
|
||||
# ts >> 50 (BUT: explore=False -> expect deterministic action).
|
||||
for _ in range(50):
|
||||
a = trainer.compute_action(obs, explore=False)
|
||||
check(a, deterministic_action)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import unittest
|
||||
unittest.main(verbosity=1)
87  rllib/agents/ddpg/tests/test_td3.py  Normal file
@ -0,0 +1,87 @@
import numpy as np
|
||||
import unittest
|
||||
|
||||
import ray.rllib.agents.ddpg.td3 as td3
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.test_utils import check
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
class TestTD3(unittest.TestCase):
|
||||
def test_td3_compilation(self):
|
||||
"""Test whether a TD3Trainer can be built with both frameworks."""
|
||||
config = td3.TD3_DEFAULT_CONFIG.copy()
|
||||
config["num_workers"] = 0 # Run locally.
|
||||
|
||||
# Test against all frameworks.
|
||||
for fw in ["tf", "eager", "torch"]:
|
||||
if fw != "tf":
|
||||
continue
|
||||
config["eager"] = True if fw == "eager" else False
|
||||
config["use_pytorch"] = True if fw == "torch" else False
|
||||
trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
|
||||
num_iterations = 2
|
||||
for i in range(num_iterations):
|
||||
results = trainer.train()
|
||||
print(results)
|
||||
|
||||
def test_td3_exploration_and_with_random_prerun(self):
|
||||
"""Tests TD3's Exploration (w/ random actions for n timesteps)."""
|
||||
config = td3.TD3_DEFAULT_CONFIG.copy()
|
||||
config["num_workers"] = 0 # Run locally.
|
||||
obs = np.array([0.0, 0.1, -0.1])
|
||||
|
||||
# Test against all frameworks.
|
||||
for fw in ["tf", "eager", "torch"]:
|
||||
if fw != "tf":
|
||||
continue
|
||||
config["eager"] = True if fw == "eager" else False
|
||||
config["use_pytorch"] = True if fw == "torch" else False
|
||||
|
||||
# Default GaussianNoise setup.
|
||||
trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
|
||||
# Setting explore=False should always return the same action.
|
||||
a_ = trainer.compute_action(obs, explore=False)
|
||||
for _ in range(50):
|
||||
a = trainer.compute_action(obs, explore=False)
|
||||
check(a, a_)
|
||||
# explore=None (default: explore) should return different actions.
|
||||
actions = []
|
||||
for _ in range(50):
|
||||
actions.append(trainer.compute_action(obs))
|
||||
check(np.std(actions), 0.0, false=True)
|
||||
|
||||
# Check randomness at beginning.
|
||||
config["exploration_config"] = {
|
||||
# Act randomly at beginning ...
|
||||
"random_timesteps": 30,
|
||||
# Then act very closely to deterministic actions thereafter.
|
||||
"stddev": 0.001,
|
||||
"initial_scale": 0.001,
|
||||
"final_scale": 0.001,
|
||||
}
|
||||
trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
|
||||
# ts=1 (get a deterministic action as per explore=False).
|
||||
deterministic_action = trainer.compute_action(obs, explore=False)
|
||||
# ts=2-30 (in random window).
|
||||
random_a = []
|
||||
for _ in range(29):
|
||||
random_a.append(trainer.compute_action(obs, explore=True))
|
||||
check(random_a[-1], deterministic_action, false=True)
|
||||
self.assertTrue(np.std(random_a) > 0.5)
|
||||
|
||||
# ts > 30 (a=deterministic_action + scale * N[0,1])
|
||||
for _ in range(50):
|
||||
a = trainer.compute_action(obs, explore=True)
|
||||
check(a, deterministic_action, rtol=0.1)
|
||||
|
||||
# ts >> 30 (BUT: explore=False -> expect deterministic action).
|
||||
for _ in range(50):
|
||||
a = trainer.compute_action(obs, explore=False)
|
||||
check(a, deterministic_action)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import unittest
|
||||
unittest.main(verbosity=1)
|
|
@ -211,7 +211,7 @@ def validate_config_and_setup_param_noise(config):
|
|||
if config.get("soft_q", DEPRECATED_VALUE) != DEPRECATED_VALUE:
|
||||
deprecation_warning(
|
||||
"soft_q", "exploration_config={"
|
||||
"type=StochasticSampling, temperature=[float]"
|
||||
"type=SoftQ, temperature=[float]"
|
||||
"}")
|
||||
config["exploration_config"] = {
|
||||
"type": "SoftQ",
|
||||
|
|
|
@ -15,14 +15,14 @@ class TestDQN(unittest.TestCase):
|
|||
config["num_workers"] = 0 # Run locally.
|
||||
|
||||
# tf.
|
||||
config["eager"] = True
|
||||
config["eager"] = False
|
||||
trainer = dqn.DQNTrainer(config=config, env="CartPole-v0")
|
||||
num_iterations = 2
|
||||
for i in range(num_iterations):
|
||||
results = trainer.train()
|
||||
print(results)
|
||||
|
||||
config["eager"] = False
|
||||
config["eager"] = True
|
||||
trainer = dqn.DQNTrainer(config=config, env="CartPole-v0")
|
||||
num_iterations = 2
|
||||
for i in range(num_iterations):
|
||||
|
|
|
@ -550,14 +550,11 @@ class Trainer(Trainable):
|
|||
else:
|
||||
self.env_creator = lambda env_config: None
|
||||
|
||||
# Merge the supplied config with the class default.
|
||||
merged_config = copy.deepcopy(self._default_config)
|
||||
merged_config = deep_update(merged_config, config,
|
||||
self._allow_unknown_configs,
|
||||
self._allow_unknown_subkeys,
|
||||
self._override_all_subkeys_if_type_changes)
|
||||
# Merge the supplied config with the class default, but store the
|
||||
# user-provided one.
|
||||
self.raw_user_config = config
|
||||
self.config = merged_config
|
||||
self.config = Trainer.merge_trainer_configs(self._default_config,
|
||||
config)
|
||||
|
||||
if self.config["normalize_actions"]:
|
||||
inner = self.env_creator
|
||||
|
@ -767,8 +764,7 @@ class Trainer(Trainable):
|
|||
preprocessed, update=False)
|
||||
|
||||
# Figure out the current (sample) time step and pass it into Policy.
|
||||
timestep = self.optimizer.num_steps_sampled \
|
||||
if self._has_policy_optimizer() else None
|
||||
self.global_vars["timestep"] += 1
|
||||
|
||||
result = self.get_policy(policy_id).compute_single_action(
|
||||
filtered_obs,
|
||||
|
@ -778,7 +774,7 @@ class Trainer(Trainable):
|
|||
info,
|
||||
clip_actions=self.config["clip_actions"],
|
||||
explore=explore,
|
||||
timestep=timestep)
|
||||
timestep=self.global_vars["timestep"])
|
||||
|
||||
if state or full_fetch:
|
||||
return result
|
||||
|
@ -878,6 +874,13 @@ class Trainer(Trainable):
                "the DEFAULT_CONFIG defined by each agent for more info.\n\n"
                "The config of this agent is: {}".format(config))

    @classmethod
    def merge_trainer_configs(cls, config1, config2):
        config1 = copy.deepcopy(config1)
        return deep_update(config1, config2, cls._allow_unknown_configs,
                           cls._allow_unknown_subkeys,
                           cls._override_all_subkeys_if_type_changes)

    @staticmethod
    def _validate_config(config):
        if "policy_graphs" in config["multiagent"]:
|
||||
|
|
|
@ -266,7 +266,8 @@ class SquashedGaussian(TFActionDistribution):
|
|||
class Deterministic(TFActionDistribution):
|
||||
"""Action distribution that returns the input values directly.
|
||||
|
||||
This is similar to DiagGaussian with standard deviation zero.
|
||||
This is similar to DiagGaussian with standard deviation zero (thus only
|
||||
requiring the "mean" values as NN output).
|
||||
"""
|
||||
|
||||
@override(ActionDistribution)
|
||||
|
|
|
@ -83,8 +83,8 @@ class OffPolicyEstimator:
            "Off-policy estimation is not possible unless the inputs "
            "include action probabilities (i.e., the policy is stochastic "
            "and emits the 'action_prob' key). For DQN this means using "
            "`soft_q: True`. You can also set `input_evaluation: []` to "
            "disable estimation.")
            "`exploration_config: {type: 'SoftQ'}`. You can also set "
            "`input_evaluation: []` to disable estimation.")

    @DeveloperAPI
    def get_metrics(self):
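A minimal sketch (an assumption, not shown in the patch) of the configuration this updated error message asks for; `temperature` here is just the `SoftQ` default documented further below:

    import ray.rllib.agents.dqn as dqn

    config = dqn.DEFAULT_CONFIG.copy()
    # Sample actions from the softmax over Q-values so that the
    # `action_prob` key needed for off-policy estimation gets emitted.
    config["exploration_config"] = {"type": "SoftQ", "temperature": 1.0}
    trainer = dqn.DQNTrainer(config=config, env="CartPole-v0")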
|
||||
|
|
|
@ -37,7 +37,7 @@ def _convert_to_numpy(x):
|
|||
if x is None:
|
||||
return None
|
||||
try:
|
||||
return x.numpy()
|
||||
return tf.nest.map_structure(lambda component: component.numpy(), x)
|
||||
except AttributeError:
|
||||
raise TypeError(
|
||||
("Object of type {} has no method to convert to numpy.").format(
|
||||
|
@ -402,6 +402,10 @@ def build_eager_tf_policy(name,
|
|||
zip([(tf.convert_to_tensor(g) if g is not None else None)
|
||||
for g in gradients], self.model.trainable_variables()))
|
||||
|
||||
@override(Policy)
|
||||
def get_exploration_info(self):
|
||||
return _convert_to_numpy(self.exploration.get_info())
|
||||
|
||||
@override(Policy)
|
||||
def get_weights(self):
|
||||
variables = self.variables()
|
||||
|
|
|
@ -13,7 +13,6 @@ from ray.rllib.policy.sample_batch import SampleBatch
|
|||
from ray.rllib.models.modelv2 import ModelV2
|
||||
from ray.rllib.utils.annotations import override, DeveloperAPI
|
||||
from ray.rllib.utils.debug import summarize
|
||||
from ray.rllib.utils.exploration.exploration import Exploration
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.schedules import ConstantSchedule, PiecewiseSchedule
|
||||
from ray.rllib.utils.tf_run_builder import TFRunBuilder
|
||||
|
@ -332,8 +331,7 @@ class TFPolicy(Policy):
|
|||
|
||||
@override(Policy)
|
||||
def get_exploration_info(self):
|
||||
if isinstance(self.exploration, Exploration):
|
||||
return self._sess.run(self.exploration_info)
|
||||
return self._sess.run(self.exploration_info)
|
||||
|
||||
@override(Policy)
|
||||
def get_weights(self):
|
||||
|
|
|
@ -20,16 +20,22 @@ def get_mean_action(alg, obs):
|
|||
ray.init(num_cpus=10, object_store_memory=1e9)
|
||||
|
||||
CONFIGS = {
|
||||
"SAC": {},
|
||||
"SAC": {
|
||||
"explore": False,
|
||||
},
|
||||
"ES": {
|
||||
"explore": False,
|
||||
"episodes_per_batch": 10,
|
||||
"train_batch_size": 100,
|
||||
"num_workers": 2,
|
||||
"noise_size": 2500000,
|
||||
"observation_filter": "MeanStdFilter"
|
||||
},
|
||||
"DQN": {},
|
||||
"DQN": {
|
||||
"explore": False
|
||||
},
|
||||
"APEX_DDPG": {
|
||||
"explore": False,
|
||||
"observation_filter": "MeanStdFilter",
|
||||
"num_workers": 2,
|
||||
"min_iter_time_s": 1,
|
||||
|
@ -38,19 +44,21 @@ CONFIGS = {
|
|||
},
|
||||
},
|
||||
"DDPG": {
|
||||
"pure_exploration_steps": 0,
|
||||
"exploration_ou_noise_scale": 0.0,
|
||||
"explore": False,
|
||||
"timesteps_per_iteration": 100
|
||||
},
|
||||
"PPO": {
|
||||
"explore": False,
|
||||
"num_sgd_iter": 5,
|
||||
"train_batch_size": 1000,
|
||||
"num_workers": 2
|
||||
},
|
||||
"A3C": {
|
||||
"explore": False,
|
||||
"num_workers": 1
|
||||
},
|
||||
"ARS": {
|
||||
"explore": False,
|
||||
"num_rollouts": 10,
|
||||
"num_workers": 2,
|
||||
"noise_size": 2500000,
|
||||
|
@ -70,7 +78,7 @@ def test_ckpt_restore(use_object_store, alg_name, failures):
|
|||
alg2 = cls(config=CONFIGS[name], env="CartPole-v0")
|
||||
env = gym.make("CartPole-v0")
|
||||
|
||||
for _ in range(3):
|
||||
for _ in range(2):
|
||||
res = alg1.train()
|
||||
print("current status: " + str(res))
|
||||
|
||||
|
|
|
@ -4,6 +4,8 @@ import unittest
|
|||
|
||||
import ray
|
||||
import ray.rllib.agents.a3c as a3c
|
||||
import ray.rllib.agents.ddpg as ddpg
|
||||
import ray.rllib.agents.ddpg.td3 as td3
|
||||
import ray.rllib.agents.dqn as dqn
|
||||
import ray.rllib.agents.impala as impala
|
||||
import ray.rllib.agents.pg as pg
|
||||
|
@ -27,9 +29,12 @@ def test_explorations(run,
|
|||
# Test all frameworks.
|
||||
for fw in ["torch", "eager", "tf"]:
|
||||
if fw == "torch" and \
|
||||
run in [dqn.DQNTrainer, dqn.SimpleQTrainer,
|
||||
impala.ImpalaTrainer, sac.SACTrainer]:
|
||||
run in [ddpg.DDPGTrainer, dqn.DQNTrainer, dqn.SimpleQTrainer,
|
||||
impala.ImpalaTrainer, sac.SACTrainer, td3.TD3Trainer]:
|
||||
continue
|
||||
elif fw == "eager" and run in [ddpg.DDPGTrainer, td3.TD3Trainer]:
|
||||
continue
|
||||
|
||||
print("Testing {} in framework={}".format(run, fw))
|
||||
config["eager"] = (fw == "eager")
|
||||
config["use_pytorch"] = (fw == "torch")
|
||||
|
@ -38,9 +43,8 @@ def test_explorations(run,
|
|||
# exploration class.
|
||||
for exploration in [None, "Random"]:
|
||||
if exploration == "Random":
|
||||
# TODO(sven): Random doesn't work for cont. action spaces
|
||||
# or IMPALA yet.
|
||||
if env == "Pendulum-v0" or run is impala.ImpalaTrainer:
|
||||
# TODO(sven): Random doesn't work for IMPALA yet.
|
||||
if run is impala.ImpalaTrainer:
|
||||
continue
|
||||
config["exploration_config"] = {"type": "Random"}
|
||||
print("exploration={}".format(exploration or "default"))
|
||||
|
@ -108,6 +112,14 @@ class TestExplorations(unittest.TestCase):
|
|||
np.array([0.0, 0.1, 0.0, 0.0]),
|
||||
prev_a=np.array(1))
|
||||
|
||||
def test_ddpg(self):
|
||||
test_explorations(
|
||||
ddpg.DDPGTrainer,
|
||||
"Pendulum-v0",
|
||||
ddpg.DEFAULT_CONFIG,
|
||||
np.array([0.0, 0.1, 0.0]),
|
||||
expected_mean_action=0.0)
|
||||
|
||||
def test_simple_dqn(self):
|
||||
test_explorations(dqn.SimpleQTrainer, "CartPole-v0",
|
||||
dqn.DEFAULT_CONFIG, np.array([0.0, 0.1, 0.0, 0.0]))
|
||||
|
@ -157,6 +169,14 @@ class TestExplorations(unittest.TestCase):
|
|||
np.array([0.0, 0.1, 0.0]),
|
||||
expected_mean_action=0.0)
|
||||
|
||||
def test_td3(self):
|
||||
test_explorations(
|
||||
td3.TD3Trainer,
|
||||
"Pendulum-v0",
|
||||
td3.TD3_DEFAULT_CONFIG,
|
||||
np.array([0.0, 0.1, 0.0]),
|
||||
expected_mean_action=0.0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
|
|
|
@ -176,7 +176,9 @@ class ModelSupportedSpaces(unittest.TestCase):
|
|||
def test_ddpg(self):
|
||||
check_support(
|
||||
"DDPG", {
|
||||
"exploration_ou_noise_scale": 100.0,
|
||||
"exploration_config": {
|
||||
"ou_base_scale": 100.0
|
||||
},
|
||||
"timesteps_per_iteration": 1,
|
||||
"use_state_preprocessor": True,
|
||||
},
|
||||
|
|
|
@ -15,14 +15,15 @@ halfcheetah-ddpg:
    env_config: {}

    # === Exploration ===
    exploration_should_anneal: True
    schedule_max_timesteps: 100000
    exploration_config:
      initial_scale: 1.0
      final_scale: 0.02
      scale_timesteps: 10000
      ou_base_scale: 0.1
      ou_theta: 0.15
      ou_sigma: 0.2

    timesteps_per_iteration: 1000
    exploration_fraction: 0.1
    exploration_final_scale: 0.02
    exploration_ou_noise_scale: 0.1
    exploration_ou_theta: 0.15
    exploration_ou_sigma: 0.2
    target_network_update_freq: 0
    tau: 0.001
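For comparison (a sketch, not part of the tuned example), the same exploration block expressed as a Python config dict. Per the "Default OUNoise setup" comment in the new DDPG test, DDPG already defaults to Ornstein-Uhlenbeck noise, so no "type" key is needed:

    ddpg_config = {
        "exploration_config": {
            "initial_scale": 1.0,
            "final_scale": 0.02,
            "scale_timesteps": 10000,
            "ou_base_scale": 0.1,
            "ou_theta": 0.15,
            "ou_sigma": 0.2,
        },
    }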
|
||||
|
||||
|
@ -47,7 +48,6 @@ halfcheetah-ddpg:
|
|||
# === Parallelism ===
|
||||
num_workers: 0
|
||||
num_gpus_per_worker: 0
|
||||
per_worker_exploration: False
|
||||
worker_side_prioritization: False
|
||||
|
||||
# === Evaluation ===
|
||||
|
|
|
@ -15,7 +15,8 @@ invertedpendulum-td3:
|
|||
|
||||
# === Exploration ===
|
||||
learning_starts: 1000
|
||||
pure_exploration_steps: 1000
|
||||
exploration_config:
|
||||
random_timesteps: 1000
|
||||
|
||||
# === Evaluation ===
|
||||
evaluation_interval: 1
|
||||
|
|
|
@ -7,7 +7,8 @@ mountaincarcontinuous-apex-ddpg:
|
|||
config:
|
||||
clip_rewards: False
|
||||
num_workers: 16
|
||||
exploration_ou_noise_scale: 1.0
|
||||
exploration_config:
|
||||
ou_base_scale: 1.0
|
||||
n_step: 3
|
||||
target_network_update_freq: 50000
|
||||
tau: 1.0
|
||||
|
|
|
@ -15,14 +15,16 @@ mountaincarcontinuous-ddpg:
|
|||
env_config: {}
|
||||
|
||||
# === Exploration ===
|
||||
exploration_should_anneal: True
|
||||
schedule_max_timesteps: 100000
|
||||
exploration_config:
|
||||
initial_scale: 1.0
|
||||
final_scale: 0.02
|
||||
scale_timesteps: 40000
|
||||
ou_base_scale: 0.75
|
||||
ou_theta: 0.15
|
||||
ou_sigma: 0.2
|
||||
|
||||
timesteps_per_iteration: 1000
|
||||
exploration_fraction: 0.4
|
||||
exploration_final_scale: 0.02
|
||||
exploration_ou_noise_scale: 0.75
|
||||
exploration_ou_theta: 0.15
|
||||
exploration_ou_sigma: 0.2
|
||||
|
||||
target_network_update_freq: 0
|
||||
tau: 0.01
|
||||
|
||||
|
|
|
@ -17,7 +17,8 @@ mujoco-td3:
|
|||
config:
|
||||
# === Exploration ===
|
||||
learning_starts: 10000
|
||||
pure_exploration_steps: 10000
|
||||
exploration_config:
|
||||
random_timesteps: 10000
|
||||
|
||||
# === Evaluation ===
|
||||
evaluation_interval: 5
|
||||
|
|
|
@ -15,14 +15,16 @@ pendulum-ddpg:
    env_config: {}

    # === Exploration ===
    exploration_should_anneal: True
    schedule_max_timesteps: 100000
    exploration_config:
      type: "OrnsteinUhlenbeckNoise"
      scale_timesteps: 10000
      initial_scale: 1.0
      final_scale: 0.02
      ou_base_scale: 0.1
      ou_theta: 0.15
      ou_sigma: 0.2

    timesteps_per_iteration: 600
    exploration_fraction: 0.1
    exploration_final_scale: 0.02
    exploration_ou_noise_scale: 0.1
    exploration_ou_theta: 0.15
    exploration_ou_sigma: 0.2
    target_network_update_freq: 0
    tau: 0.001
|
||||
|
||||
|
@ -47,7 +49,7 @@ pendulum-ddpg:
|
|||
# === Parallelism ===
|
||||
num_workers: 0
|
||||
num_gpus_per_worker: 0
|
||||
per_worker_exploration: False
|
||||
#per_worker_exploration: False
|
||||
worker_side_prioritization: False
|
||||
|
||||
# === Evaluation ===
|
||||
|
|
|
@ -12,7 +12,8 @@ pendulum-ddpg:
|
|||
|
||||
# === Exploration ===
|
||||
learning_starts: 5000
|
||||
pure_exploration_steps: 5000
|
||||
exploration_config:
|
||||
random_timesteps: 5000
|
||||
|
||||
# === Evaluation ===
|
||||
evaluation_interval: 1
|
||||
|
|
|
@ -7,4 +7,3 @@ pendulum-ddpg:
|
|||
config:
|
||||
use_huber: True
|
||||
clip_rewards: False
|
||||
exploration_fraction: 0.1
|
||||
|
|
6  rllib/tuned_examples/regression_tests/pendulum-td3.yaml  Normal file
@ -0,0 +1,6 @@
pendulum-td3:
    env: Pendulum-v0
    run: TD3
    stop:
        episode_reward_mean: -900
        timesteps_total: 100000
@ -1,7 +1,14 @@
|
|||
from ray.rllib.utils.exploration.exploration import Exploration
|
||||
from ray.rllib.utils.exploration.epsilon_greedy import EpsilonGreedy
|
||||
from ray.rllib.utils.exploration.gaussian_noise import GaussianNoise
|
||||
from ray.rllib.utils.exploration.ornstein_uhlenbeck_noise import \
|
||||
OrnsteinUhlenbeckNoise
|
||||
from ray.rllib.utils.exploration.per_worker_epsilon_greedy import \
|
||||
PerWorkerEpsilonGreedy
|
||||
from ray.rllib.utils.exploration.per_worker_gaussian_noise import \
|
||||
PerWorkerGaussianNoise
|
||||
from ray.rllib.utils.exploration.per_worker_ornstein_uhlenbeck_noise import \
|
||||
PerWorkerOrnsteinUhlenbeckNoise
|
||||
from ray.rllib.utils.exploration.random import Random
|
||||
from ray.rllib.utils.exploration.soft_q import SoftQ
|
||||
from ray.rllib.utils.exploration.stochastic_sampling import \
|
||||
|
@ -10,7 +17,11 @@ from ray.rllib.utils.exploration.stochastic_sampling import \
|
|||
__all__ = [
|
||||
"Exploration",
|
||||
"EpsilonGreedy",
|
||||
"GaussianNoise",
|
||||
"OrnsteinUhlenbeckNoise",
|
||||
"PerWorkerEpsilonGreedy",
|
||||
"PerWorkerGaussianNoise",
|
||||
"PerWorkerOrnsteinUhlenbeckNoise",
|
||||
"Random",
|
||||
"SoftQ",
|
||||
"StochasticSampling",
|
||||
|
|
|
@ -23,10 +23,9 @@ class EpsilonGreedy(Exploration):
|
|||
initial_epsilon=1.0,
|
||||
final_epsilon=0.05,
|
||||
epsilon_timesteps=int(1e5),
|
||||
num_workers=None,
|
||||
worker_index=None,
|
||||
epsilon_schedule=None,
|
||||
framework="tf"):
|
||||
framework="tf",
|
||||
**kwargs):
|
||||
"""
|
||||
|
||||
Args:
|
||||
|
@ -35,21 +34,13 @@ class EpsilonGreedy(Exploration):
|
|||
final_epsilon (float): The final epsilon value to use.
|
||||
epsilon_timesteps (int): The time step after which epsilon should
|
||||
always be `final_epsilon`.
|
||||
num_workers (Optional[int]): The overall number of workers used.
|
||||
worker_index (Optional[int]): The index of the Worker using this
|
||||
Exploration.
|
||||
epsilon_schedule (Optional[Schedule]): An optional Schedule object
|
||||
to use (instead of constructing one from the given parameters).
|
||||
framework (Optional[str]): One of None, "tf", "torch".
|
||||
"""
|
||||
# For now, require Discrete action space (may loosen this restriction
|
||||
# in the future).
|
||||
assert framework is not None
|
||||
super().__init__(
|
||||
action_space=action_space,
|
||||
num_workers=num_workers,
|
||||
worker_index=worker_index,
|
||||
framework=framework)
|
||||
action_space=action_space, framework=framework, **kwargs)
|
||||
|
||||
self.epsilon_schedule = epsilon_schedule or PiecewiseSchedule(
|
||||
endpoints=[(0, initial_epsilon),
|
||||
|
@ -85,16 +76,15 @@ class EpsilonGreedy(Exploration):
|
|||
Returns:
|
||||
tf.Tensor: The tf exploration-action op.
|
||||
"""
|
||||
epsilon = tf.convert_to_tensor(
|
||||
self.epsilon_schedule(timestep if timestep is not None else
|
||||
self.last_timestep))
|
||||
epsilon = self.epsilon_schedule(timestep if timestep is not None else
|
||||
self.last_timestep)
|
||||
|
||||
# Get the exploit action as the one with the highest logit value.
|
||||
exploit_action = tf.argmax(q_values, axis=1)
|
||||
|
||||
batch_size = tf.shape(q_values)[0]
|
||||
# Mask out actions with q-value=-inf so that we don't
|
||||
# even consider them for exploration.
|
||||
# Mask out actions with q-value=-inf so that we don't even consider
|
||||
# them for exploration.
|
||||
random_valid_action_logits = tf.where(
|
||||
tf.equal(q_values, tf.float32.min),
|
||||
tf.ones_like(q_values) * tf.float32.min, tf.ones_like(q_values))
|
||||
|
@ -130,7 +120,7 @@ class EpsilonGreedy(Exploration):
|
|||
Returns:
|
||||
torch.Tensor: The exploration-action.
|
||||
"""
|
||||
# Set last time step or (if not given) increase by one.
|
||||
# Set last timestep or (if not given) increase by one.
|
||||
self.last_timestep = timestep if timestep is not None else \
|
||||
self.last_timestep + 1
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@ class Exploration:
|
|||
|
||||
def __init__(self,
|
||||
action_space=None,
|
||||
*,
|
||||
num_workers=None,
|
||||
worker_index=None,
|
||||
framework="tf"):
|
||||
|
|
165  rllib/utils/exploration/gaussian_noise.py  Normal file
@ -0,0 +1,165 @@
|
|||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.exploration.exploration import Exploration
|
||||
from ray.rllib.utils.exploration.random import Random
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch, \
|
||||
get_variable
|
||||
from ray.rllib.utils.schedules.piecewise_schedule import PiecewiseSchedule
|
||||
|
||||
tf = try_import_tf()
|
||||
torch, _ = try_import_torch()
|
||||
|
||||
|
||||
class GaussianNoise(Exploration):
|
||||
"""An exploration that adds white noise to continuous actions.
|
||||
|
||||
If explore=True, returns actions plus scale (<-annealed over time) x
|
||||
Gaussian noise. Also, some completely random period is possible at the
|
||||
beginning.
|
||||
If explore=False, returns the deterministic action.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
action_space,
|
||||
*,
|
||||
random_timesteps=1000,
|
||||
stddev=0.1,
|
||||
initial_scale=1.0,
|
||||
final_scale=0.02,
|
||||
scale_timesteps=10000,
|
||||
scale_schedule=None,
|
||||
framework="tf",
|
||||
**kwargs):
|
||||
"""Initializes a GaussianNoise Exploration object.
|
||||
|
||||
Args:
|
||||
action_space (Space): The gym action space used by the environment.
|
||||
random_timesteps (int): The number of timesteps for which to act
|
||||
completely randomly. Only after this number of timesteps, the
|
||||
`self.scale` annealing process will start (see below).
|
||||
stddev (float): The stddev (sigma) to use for the
|
||||
Gaussian noise to be added to the actions.
|
||||
initial_scale (float): The initial scaling weight to multiply
|
||||
the noise with.
|
||||
final_scale (float): The final scaling weight to multiply
|
||||
the noise with.
|
||||
scale_timesteps (int): The timesteps over which to linearly anneal
|
||||
the scaling factor (after(!) having used random actions for
|
||||
`random_timesteps` steps).
|
||||
scale_schedule (Optional[Schedule]): An optional Schedule object
|
||||
to use (instead of constructing one from the given parameters).
|
||||
framework (Optional[str]): One of None, "tf", "torch".
|
||||
"""
|
||||
assert framework is not None
|
||||
super().__init__(action_space, framework=framework, **kwargs)
|
||||
|
||||
self.random_timesteps = random_timesteps
|
||||
self.random_exploration = Random(
|
||||
action_space, framework=self.framework)
|
||||
self.stddev = stddev
|
||||
# The `scale` annealing schedule.
|
||||
self.scale_schedule = scale_schedule or PiecewiseSchedule(
|
||||
endpoints=[(random_timesteps, initial_scale),
|
||||
(random_timesteps + scale_timesteps, final_scale)],
|
||||
outside_value=final_scale,
|
||||
framework=self.framework)
|
||||
|
||||
# The current timestep value (tf-var or python int).
|
||||
self.last_timestep = get_variable(
|
||||
0, framework=self.framework, tf_name="timestep")
|
||||
|
||||
@override(Exploration)
|
||||
def get_exploration_action(self,
|
||||
distribution_inputs,
|
||||
action_dist_class,
|
||||
model=None,
|
||||
explore=True,
|
||||
timestep=None):
|
||||
# Adds IID Gaussian noise for exploration, TD3-style.
|
||||
action_dist = action_dist_class(distribution_inputs, model)
|
||||
|
||||
if self.framework == "torch":
|
||||
return self._get_torch_exploration_action(action_dist, explore,
|
||||
timestep)
|
||||
else:
|
||||
return self._get_tf_exploration_action_op(action_dist, explore,
|
||||
timestep)
|
||||
|
||||
def _get_tf_exploration_action_op(self, action_dist, explore, timestep):
|
||||
ts = timestep if timestep is not None else self.last_timestep
|
||||
|
||||
# The deterministic actions (if explore=False).
|
||||
deterministic_actions = action_dist.deterministic_sample()
|
||||
|
||||
# Take a Gaussian sample with our stddev (mean=0.0) and scale it.
|
||||
gaussian_sample = self.scale_schedule(ts) * tf.random_normal(
|
||||
tf.shape(deterministic_actions), stddev=self.stddev)
|
||||
|
||||
# Stochastic actions could either be: random OR action + noise.
|
||||
random_actions, _ = \
|
||||
self.random_exploration.get_tf_exploration_action_op(
|
||||
action_dist, explore)
|
||||
stochastic_actions = tf.cond(
|
||||
pred=ts <= self.random_timesteps,
|
||||
true_fn=lambda: random_actions,
|
||||
false_fn=lambda: tf.clip_by_value(
|
||||
deterministic_actions + gaussian_sample,
|
||||
self.action_space.low * tf.ones_like(deterministic_actions),
|
||||
self.action_space.high * tf.ones_like(deterministic_actions))
|
||||
)
|
||||
|
||||
# Choose by `explore` (main exploration switch).
|
||||
batch_size = tf.shape(deterministic_actions)[0]
|
||||
action = tf.cond(
|
||||
pred=tf.constant(explore, dtype=tf.bool)
|
||||
if isinstance(explore, bool) else explore,
|
||||
true_fn=lambda: stochastic_actions,
|
||||
false_fn=lambda: deterministic_actions)
|
||||
# Logp=always zero.
|
||||
logp = tf.zeros(shape=(batch_size, ), dtype=tf.float32)
|
||||
|
||||
# Increment `last_timestep` by 1 (or set to `timestep`).
|
||||
assign_op = \
|
||||
tf.assign_add(self.last_timestep, 1) if timestep is None else \
|
||||
tf.assign(self.last_timestep, timestep)
|
||||
with tf.control_dependencies([assign_op]):
|
||||
return action, logp
|
||||
|
||||
def _get_torch_exploration_action(self, action_dist, explore, timestep):
|
||||
# Set last timestep or (if not given) increase by one.
|
||||
self.last_timestep = timestep if timestep is not None else \
|
||||
self.last_timestep + 1
|
||||
|
||||
# Apply exploration.
|
||||
if explore:
|
||||
# Random exploration phase.
|
||||
if self.last_timestep <= self.random_timesteps:
|
||||
action, _ = \
|
||||
self.random_exploration.get_torch_exploration_action(
|
||||
action_dist, True)
|
||||
# Take a Gaussian sample with our stddev (mean=0.0) and scale it.
|
||||
else:
|
||||
det_actions = action_dist.deterministic_sample()
|
||||
scale = self.scale_schedule(self.last_timestep)
|
||||
gaussian_sample = scale * torch.normal(
    mean=0.0, std=self.stddev, size=det_actions.size())
|
||||
action = torch.clamp(
|
||||
det_actions + gaussian_sample,
|
||||
self.action_space.low * torch.ones_like(det_actions),
|
||||
self.action_space.high * torch.ones_like(det_actions))
|
||||
# No exploration -> Return deterministic actions.
|
||||
else:
|
||||
action = action_dist.deterministic_sample()
|
||||
|
||||
# Logp=always zero.
|
||||
logp = torch.zeros((action.size()[0], ), dtype=torch.float32)
|
||||
|
||||
return action, logp
|
||||
|
||||
@override(Exploration)
|
||||
def get_info(self):
|
||||
"""Returns the current scale value.
|
||||
|
||||
Returns:
|
||||
Union[float,tf.Tensor[float]]: The current scale value.
|
||||
"""
|
||||
return self.scale_schedule(self.last_timestep)
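As a framework-agnostic reference (a NumPy sketch under the default parameters above, not part of this file), the per-step behavior implemented by this class is: pure random actions for `random_timesteps` steps, then the deterministic action plus clipped Gaussian noise whose scale anneals linearly from `initial_scale` to `final_scale`:

    import numpy as np

    def gaussian_noise_action(det_action, t, low, high,
                              random_timesteps=1000, stddev=0.1,
                              initial_scale=1.0, final_scale=0.02,
                              scale_timesteps=10000, rng=np.random):
        if t <= random_timesteps:
            # Pure random phase.
            return rng.uniform(low, high, size=det_action.shape)
        # Linear annealing of the scale, starting after the random phase.
        frac = min(1.0, (t - random_timesteps) / float(scale_timesteps))
        scale = initial_scale + frac * (final_scale - initial_scale)
        noise = scale * rng.normal(0.0, stddev, size=det_action.shape)
        return np.clip(det_action + noise, low, high)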
|
165  rllib/utils/exploration/ornstein_uhlenbeck_noise.py  Normal file
@ -0,0 +1,165 @@
|
|||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.exploration.gaussian_noise import GaussianNoise
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch, \
|
||||
get_variable
|
||||
|
||||
tf = try_import_tf()
|
||||
torch, _ = try_import_torch()
|
||||
|
||||
|
||||
class OrnsteinUhlenbeckNoise(GaussianNoise):
|
||||
"""An exploration that adds Ornstein-Uhlenbeck noise to continuous actions.
|
||||
|
||||
    If explore=True, returns sampled actions plus a noise term X,
    which is updated as X_t+1 = X_t + (-theta*X_t + sigma*N[0,stddev]),
    where theta, sigma and stddev are constants. Also, some completely
    random period is possible at the beginning.
|
||||
If explore=False, returns the deterministic action.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
action_space,
|
||||
*,
|
||||
ou_theta=0.15,
|
||||
ou_sigma=0.2,
|
||||
ou_base_scale=0.1,
|
||||
random_timesteps=1000,
|
||||
initial_scale=1.0,
|
||||
final_scale=0.02,
|
||||
scale_timesteps=10000,
|
||||
scale_schedule=None,
|
||||
framework="tf",
|
||||
**kwargs):
|
||||
"""Initializes an Ornstein-Uhlenbeck Exploration object.
|
||||
|
||||
Args:
|
||||
action_space (Space): The gym action space used by the environment.
|
||||
ou_theta (float): The theta parameter of the Ornstein-Uhlenbeck
|
||||
process.
|
||||
ou_sigma (float): The sigma parameter of the Ornstein-Uhlenbeck
|
||||
process.
|
||||
ou_base_scale (float): A fixed scaling factor, by which all OU-
|
||||
noise is multiplied. NOTE: This is on top of the parent
|
||||
GaussianNoise's scaling.
|
||||
random_timesteps (int): The number of timesteps for which to act
|
||||
completely randomly. Only after this number of timesteps, the
|
||||
`self.scale` annealing process will start (see below).
|
||||
initial_scale (float): The initial scaling weight to multiply
|
||||
the noise with.
|
||||
final_scale (float): The final scaling weight to multiply
|
||||
the noise with.
|
||||
scale_timesteps (int): The timesteps over which to linearly anneal
|
||||
the scaling factor (after(!) having used random actions for
|
||||
`random_timesteps` steps).
|
||||
scale_schedule (Optional[Schedule]): An optional Schedule object
|
||||
to use (instead of constructing one from the given parameters).
|
||||
framework (Optional[str]): One of None, "tf", "torch".
|
||||
"""
|
||||
super().__init__(
|
||||
action_space,
|
||||
random_timesteps=random_timesteps,
|
||||
initial_scale=initial_scale,
|
||||
final_scale=final_scale,
|
||||
scale_timesteps=scale_timesteps,
|
||||
scale_schedule=scale_schedule,
|
||||
stddev=1.0, # Force `self.stddev` to 1.0.
|
||||
framework=framework,
|
||||
**kwargs)
|
||||
self.ou_theta = ou_theta
|
||||
self.ou_sigma = ou_sigma
|
||||
self.ou_base_scale = ou_base_scale
|
||||
|
||||
        # The current OU-state value (gets updated each time an exploration
        # action is computed).
|
||||
self.ou_state = get_variable(
|
||||
self.action_space.low.size * [.0],
|
||||
framework=self.framework,
|
||||
tf_name="ou_state")
|
||||
|
||||
@override(GaussianNoise)
|
||||
def _get_tf_exploration_action_op(self, action_dist, explore, timestep):
|
||||
ts = timestep if timestep is not None else self.last_timestep
|
||||
scale = self.scale_schedule(ts)
|
||||
|
||||
# The deterministic actions (if explore=False).
|
||||
deterministic_actions = action_dist.deterministic_sample()
|
||||
|
||||
# Apply base-scaled and time-annealed scaled OU-noise to
|
||||
# deterministic actions.
|
||||
gaussian_sample = tf.random_normal(
|
||||
shape=[self.action_space.low.size], stddev=self.stddev)
|
||||
ou_new = self.ou_theta * -self.ou_state + \
|
||||
self.ou_sigma * gaussian_sample
|
||||
ou_state_new = tf.assign_add(self.ou_state, ou_new)
|
||||
noise = scale * self.ou_base_scale * ou_state_new * \
|
||||
(self.action_space.high - self.action_space.low)
|
||||
stochastic_actions = tf.clip_by_value(
|
||||
deterministic_actions + noise,
|
||||
self.action_space.low * tf.ones_like(deterministic_actions),
|
||||
self.action_space.high * tf.ones_like(deterministic_actions))
|
||||
|
||||
# Stochastic actions could either be: random OR action + noise.
|
||||
random_actions, _ = \
|
||||
self.random_exploration.get_tf_exploration_action_op(
|
||||
action_dist, explore)
|
||||
exploration_actions = tf.cond(
|
||||
pred=ts <= self.random_timesteps,
|
||||
true_fn=lambda: random_actions,
|
||||
false_fn=lambda: stochastic_actions)
|
||||
|
||||
# Choose by `explore` (main exploration switch).
|
||||
action = tf.cond(
|
||||
pred=tf.constant(explore, dtype=tf.bool)
|
||||
if isinstance(explore, bool) else explore,
|
||||
true_fn=lambda: exploration_actions,
|
||||
false_fn=lambda: deterministic_actions)
|
||||
# Logp=always zero.
|
||||
batch_size = tf.shape(deterministic_actions)[0]
|
||||
logp = tf.zeros(shape=(batch_size, ), dtype=tf.float32)
|
||||
|
||||
# Increment `last_timestep` by 1 (or set to `timestep`).
|
||||
assign_op = \
|
||||
tf.assign_add(self.last_timestep, 1) if timestep is None else \
|
||||
tf.assign(self.last_timestep, timestep)
|
||||
with tf.control_dependencies([assign_op, ou_state_new]):
|
||||
return action, logp
|
||||
|
||||
@override(GaussianNoise)
|
||||
def _get_torch_exploration_action(self, action_dist, explore, timestep):
|
||||
# Set last timestep or (if not given) increase by one.
|
||||
self.last_timestep = timestep if timestep is not None else \
|
||||
self.last_timestep + 1
|
||||
|
||||
# Apply exploration.
|
||||
if explore:
|
||||
# Random exploration phase.
|
||||
if self.last_timestep <= self.random_timesteps:
|
||||
action = self.random_exploration.get_torch_exploration_action(
|
||||
action_dist, True)
|
||||
# Apply base-scaled and time-annealed scaled OU-noise to
|
||||
# deterministic actions.
|
||||
else:
|
||||
det_actions = action_dist.deterministic_sample()
|
||||
scale = self.scale_schedule(self.last_timestep)
|
||||
gaussian_sample = scale * torch.normal(
    mean=0.0, std=1.0, size=det_actions.size())
|
||||
|
||||
ou_new = self.ou_theta * -self.ou_state + \
|
||||
self.ou_sigma * gaussian_sample
|
||||
self.ou_state += ou_new
|
||||
noise = scale * self.ou_base_scale * self.ou_state * \
|
||||
(self.action_space.high - self.action_space.low)
|
||||
action = torch.clamp(
|
||||
det_actions + noise,
|
||||
self.action_space.low * torch.ones_like(det_actions),
|
||||
self.action_space.high * torch.ones_like(det_actions))
|
||||
|
||||
# No exploration -> Return deterministic actions.
|
||||
else:
|
||||
action = action_dist.deterministic_sample()
|
||||
|
||||
# Logp=always zero.
|
||||
logp = torch.zeros((action.size()[0], ), dtype=torch.float32)
|
||||
|
||||
return action, logp
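Restated numerically (a small NumPy sketch mirroring the tf path above, not part of this file): the OU state follows X_new = X + (theta * -X + sigma * N(0, 1)), and the noise actually added to the action is scale * ou_base_scale * X_new * (high - low):

    import numpy as np

    def ou_step(ou_state, theta=0.15, sigma=0.2, rng=np.random):
        # Mirrors: ou_new = theta * -ou_state + sigma * gaussian_sample
        return ou_state + (theta * -ou_state +
                           sigma * rng.normal(size=ou_state.shape))

    state = np.zeros(1)
    for _ in range(5):
        state = ou_step(state)
        print(state)  # Mean-reverting random walk around 0.0.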
|
|
@ -12,19 +12,14 @@ class PerWorkerEpsilonGreedy(EpsilonGreedy):
|
|||
|
||||
def __init__(self,
|
||||
action_space,
|
||||
initial_epsilon=1.0,
|
||||
final_epsilon=0.1,
|
||||
epsilon_timesteps=int(1e5),
|
||||
*,
|
||||
num_workers=0,
|
||||
worker_index=0,
|
||||
framework="tf"):
|
||||
framework="tf",
|
||||
**kwargs):
|
||||
"""
|
||||
Args:
|
||||
action_space (Space): The gym action space used by the environment.
|
||||
initial_epsilon (float): The initial epsilon value to use.
|
||||
final_epsilon (float): The final epsilon value to use.
|
||||
epsilon_timesteps (int): The time step after which epsilon should
|
||||
always be `final_epsilon`.
|
||||
num_workers (Optional[int]): The overall number of workers used.
|
||||
worker_index (Optional[int]): The index of the Worker using this
|
||||
Exploration.
|
||||
|
@ -42,11 +37,7 @@ class PerWorkerEpsilonGreedy(EpsilonGreedy):
|
|||
epsilon_schedule = ConstantSchedule(0.0)
|
||||
|
||||
super().__init__(
|
||||
action_space=action_space,
|
||||
initial_epsilon=initial_epsilon,
|
||||
final_epsilon=final_epsilon,
|
||||
epsilon_timesteps=epsilon_timesteps,
|
||||
num_workers=num_workers,
|
||||
worker_index=worker_index,
|
||||
action_space,
|
||||
epsilon_schedule=epsilon_schedule,
|
||||
framework=framework,
|
||||
epsilon_schedule=epsilon_schedule)
|
||||
**kwargs)
|
||||
|
|
43  rllib/utils/exploration/per_worker_gaussian_noise.py  Normal file
@ -0,0 +1,43 @@
|
|||
from ray.rllib.utils.exploration.gaussian_noise import GaussianNoise
|
||||
from ray.rllib.utils.schedules import ConstantSchedule
|
||||
|
||||
|
||||
class PerWorkerGaussianNoise(GaussianNoise):
|
||||
"""A per-worker Gaussian noise class for distributed algorithms.
|
||||
|
||||
Sets the `scale` schedules of individual workers to a constant:
|
||||
0.4 ^ (1 + [worker-index] / float([num-workers] - 1) * 7)
|
||||
See Ape-X paper.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
action_space,
|
||||
*,
|
||||
num_workers=0,
|
||||
worker_index=0,
|
||||
framework="tf",
|
||||
**kwargs):
|
||||
"""
|
||||
Args:
|
||||
action_space (Space): The gym action space used by the environment.
|
||||
num_workers (Optional[int]): The overall number of workers used.
|
||||
worker_index (Optional[int]): The index of the Worker using this
|
||||
Exploration.
|
||||
framework (Optional[str]): One of None, "tf", "torch".
|
||||
"""
|
||||
scale_schedule = None
|
||||
# Use a fixed, different epsilon per worker. See: Ape-X paper.
|
||||
if num_workers > 0:
|
||||
if worker_index >= 0:
|
||||
exponent = (1 + worker_index / float(num_workers - 1) * 7)
|
||||
scale_schedule = ConstantSchedule(0.4**exponent)
|
||||
# Local worker should have zero exploration so that eval
|
||||
# rollouts run properly.
|
||||
else:
|
||||
scale_schedule = ConstantSchedule(0.0)
|
||||
|
||||
super().__init__(
|
||||
action_space,
|
||||
scale_schedule=scale_schedule,
|
||||
framework=framework,
|
||||
**kwargs)
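To make that schedule concrete, a worked example (an illustration, not part of this file) of the constant per-worker scales produced by the exponent formula above, here with num_workers=8:

    num_workers = 8
    for worker_index in (1, 2, 4, 8):
        exponent = 1 + worker_index / float(num_workers - 1) * 7
        print(worker_index, round(0.4 ** exponent, 5))
    # -> 1: ~0.16, 2: ~0.064, 4: ~0.01024, 8: ~0.00026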
|
|
@ -0,0 +1,44 @@
|
|||
from ray.rllib.utils.exploration.ornstein_uhlenbeck_noise import \
|
||||
OrnsteinUhlenbeckNoise
|
||||
from ray.rllib.utils.schedules import ConstantSchedule
|
||||
|
||||
|
||||
class PerWorkerOrnsteinUhlenbeckNoise(OrnsteinUhlenbeckNoise):
|
||||
"""A per-worker Ornstein Uhlenbeck noise class for distributed algorithms.
|
||||
|
||||
Sets the Gaussian `scale` schedules of individual workers to a constant:
|
||||
0.4 ^ (1 + [worker-index] / float([num-workers] - 1) * 7)
|
||||
See Ape-X paper.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
action_space,
|
||||
*,
|
||||
num_workers=0,
|
||||
worker_index=0,
|
||||
framework="tf",
|
||||
**kwargs):
|
||||
"""
|
||||
Args:
|
||||
action_space (Space): The gym action space used by the environment.
|
||||
num_workers (Optional[int]): The overall number of workers used.
|
||||
worker_index (Optional[int]): The index of the Worker using this
|
||||
Exploration.
|
||||
framework (Optional[str]): One of None, "tf", "torch".
|
||||
"""
|
||||
scale_schedule = None
|
||||
# Use a fixed, different epsilon per worker. See: Ape-X paper.
|
||||
if num_workers > 0:
|
||||
if worker_index >= 0:
|
||||
exponent = (1 + worker_index / float(num_workers - 1) * 7)
|
||||
scale_schedule = ConstantSchedule(0.4**exponent)
|
||||
# Local worker should have zero exploration so that eval
|
||||
# rollouts run properly.
|
||||
else:
|
||||
scale_schedule = ConstantSchedule(0.0)
|
||||
|
||||
super().__init__(
|
||||
action_space,
|
||||
scale_schedule=scale_schedule,
|
||||
framework=framework,
|
||||
**kwargs)
|
|
@ -1,4 +1,4 @@
|
|||
from gym.spaces import Discrete
|
||||
from gym.spaces import Discrete, MultiDiscrete, Tuple
|
||||
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.exploration.exploration import Exploration
|
||||
|
@ -18,17 +18,24 @@ class Random(Exploration):
|
|||
If explore=False, returns the greedy/max-likelihood action.
|
||||
"""
|
||||
|
||||
def __init__(self, action_space, framework="tf", **kwargs):
|
||||
def __init__(self, action_space, *, framework="tf", **kwargs):
|
||||
"""Initialize a Random Exploration object.
|
||||
|
||||
Args:
|
||||
action_space (Space): The gym action space used by the environment.
|
||||
framework (Optional[str]): One of None, "tf", "torch".
|
||||
"""
|
||||
assert isinstance(action_space, Discrete)
|
||||
super().__init__(
|
||||
action_space=action_space, framework=framework, **kwargs)
|
||||
|
||||
# Determine py_func types, depending on our action-space.
|
||||
if isinstance(self.action_space, (Discrete, MultiDiscrete)) or \
|
||||
(isinstance(self.action_space, Tuple) and
|
||||
isinstance(self.action_space[0], (Discrete, MultiDiscrete))):
|
||||
self.dtype_sample, self.dtype = (tf.int64, tf.int32)
|
||||
else:
|
||||
self.dtype_sample, self.dtype = (tf.float64, tf.float32)
|
||||
|
||||
@override(Exploration)
|
||||
def get_exploration_action(self,
|
||||
distribution_inputs,
|
||||
|
@ -38,23 +45,22 @@ class Random(Exploration):
|
|||
timestep=None):
|
||||
# Instantiate the distribution object.
|
||||
action_dist = action_dist_class(distribution_inputs, model)
|
||||
|
||||
if self.framework == "tf":
|
||||
return self._get_tf_exploration_action_op(action_dist, explore,
|
||||
timestep)
|
||||
return self.get_tf_exploration_action_op(action_dist, explore)
|
||||
else:
|
||||
return self._get_torch_exploration_action(action_dist, explore,
|
||||
timestep)
|
||||
return self.get_torch_exploration_action(action_dist, explore)
|
||||
|
||||
@tf_function(tf)
|
||||
def _get_tf_exploration_action_op(self, action_dist, explore, timestep):
|
||||
def get_tf_exploration_action_op(self, action_dist, explore):
|
||||
if explore:
|
||||
action = tf.py_function(self.action_space.sample, [], tf.int64)
|
||||
action = tf.py_function(self.action_space.sample, [],
|
||||
self.dtype_sample)
|
||||
# Will be unnecessary, once we support batch/time-aware Spaces.
|
||||
action = tf.expand_dims(tf.cast(action, dtype=tf.int32), 0)
|
||||
action = tf.expand_dims(tf.cast(action, dtype=self.dtype), 0)
|
||||
else:
|
||||
action = tf.cast(
|
||||
action_dist.deterministic_sample(), dtype=tf.int32)
|
||||
action_dist.deterministic_sample(), dtype=self.dtype)
|
||||
|
||||
# TODO(sven): Move into (deterministic_)sample(logp=True|False)
|
||||
if isinstance(action, TupleActions):
|
||||
batch_size = tf.shape(action[0][0])[0]
|
||||
|
@ -63,12 +69,15 @@ class Random(Exploration):
|
|||
logp = tf.zeros(shape=(batch_size, ), dtype=tf.float32)
|
||||
return action, logp
|
||||
|
||||
def _get_torch_exploration_action(self, action_dist, explore, timestep):
|
||||
def get_torch_exploration_action(self, action_dist, explore):
|
||||
tensor_fn = torch.LongTensor if \
|
||||
type(self.action_space) in [Discrete, MultiDiscrete] else \
|
||||
torch.FloatTensor
|
||||
if explore:
|
||||
# Unsqueeze will be unnecessary, once we support batch/time-aware
|
||||
# Spaces.
|
||||
action = torch.LongTensor(self.action_space.sample()).unsqueeze(0)
|
||||
action = tensor_fn(self.action_space.sample()).unsqueeze(0)
|
||||
else:
|
||||
action = torch.LongTensor(action_dist.deterministic_sample())
|
||||
action = tensor_fn(action_dist.deterministic_sample())
|
||||
logp = torch.zeros((action.size()[0], ), dtype=torch.float32)
|
||||
return action, logp
|
||||
|
|
|
@ -10,7 +10,11 @@ class SoftQ(StochasticSampling):
|
|||
output divided by the temperature. Returns the argmax iff explore=False.
|
||||
"""
|
||||
|
||||
def __init__(self, action_space, temperature=1.0, framework="tf",
|
||||
def __init__(self,
|
||||
action_space,
|
||||
*,
|
||||
temperature=1.0,
|
||||
framework="tf",
|
||||
**kwargs):
|
||||
"""Initializes a SoftQ Exploration object.
|
||||
|
||||
|
@ -19,11 +23,10 @@ class SoftQ(StochasticSampling):
|
|||
temperature (Schedule): The temperature to divide model outputs by
|
||||
before creating the Categorical distribution to sample from.
|
||||
framework (Optional[str]): One of None, "tf", "torch".
|
||||
kwargs (dict): Passed on to super constructor.
|
||||
"""
|
||||
assert isinstance(action_space, Discrete)
|
||||
super().__init__(
|
||||
action_space=action_space,
|
||||
action_space,
|
||||
static_params=dict(temperature=temperature),
|
||||
framework=framework,
|
||||
**kwargs)
|
||||
|
|
|
@ -18,24 +18,24 @@ class StochasticSampling(Exploration):
|
|||
|
||||
def __init__(self,
|
||||
action_space,
|
||||
framework="tf",
|
||||
*,
|
||||
static_params=None,
|
||||
time_dependent_params=None,
|
||||
framework="tf",
|
||||
**kwargs):
|
||||
"""Initializes a StochasticSampling Exploration object.
|
||||
|
||||
Args:
|
||||
action_space (Space): The gym action space used by the environment.
|
||||
framework (Optional[str]): One of None, "tf", "torch".
|
||||
static_params (Optional[dict]): Parameters to be passed as-is into
|
||||
the action distribution class' constructor.
|
||||
time_dependent_params (dict): Parameters to be evaluated based on
|
||||
`timestep` and then passed into the action distribution
|
||||
class' constructor.
|
||||
framework (Optional[str]): One of None, "tf", "torch".
|
||||
"""
|
||||
assert framework is not None
|
||||
super().__init__(
|
||||
action_space=action_space, framework=framework, **kwargs)
|
||||
super().__init__(action_space, framework=framework, **kwargs)
|
||||
|
||||
self.static_params = static_params or {}
|
||||
|
||||
|
|
|
@ -47,6 +47,6 @@ class PiecewiseSchedule(Schedule):
|
|||
alpha = float(t - l_t) / (r_t - l_t)
|
||||
return self.interpolation(l, r, alpha)
|
||||
|
||||
# t does not belong to any of the pieces, so doom.
|
||||
# t does not belong to any of the pieces, return `self.outside_value`.
|
||||
assert self.outside_value is not None
|
||||
return self.outside_value
|
||||
|
|
|
@ -39,11 +39,11 @@ class Schedule(metaclass=ABCMeta):
|
|||
raise NotImplementedError
|
||||
|
||||
def value(self, t):
|
||||
if self.framework == "tf" and tf.executing_eagerly() is False:
|
||||
if self.framework == "tf":
|
||||
return tf.cast(
|
||||
tf.py_func(self._value, [t], tf.float64),
|
||||
tf.py_function(self._value, [t], tf.float64),
|
||||
tf.float32,
|
||||
name="schedule-value")
|
||||
name="schedule_value")
|
||||
return self._value(t)
|
||||
|
||||
def __call__(self, t):
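For context, a small usage sketch (not part of the change) of the schedule classes touched here; the endpoints mirror the annealing schedule that GaussianNoise builds above:

    from ray.rllib.utils.schedules import PiecewiseSchedule

    sched = PiecewiseSchedule(
        endpoints=[(1000, 1.0), (11000, 0.02)],
        outside_value=0.02,
        framework=None)
    print(sched(1000))   # 1.0 (first endpoint)
    print(sched(6000))   # 0.51 (linear interpolation)
    print(sched(20000))  # 0.02 (outside_value)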