[RLlib] DDPG refactor and Exploration API action noise classes. (#7314)

* WIP.

* WIP.

* WIP.

* WIP.

* WIP.

* Fix

* WIP.

* Add TD3 quick Pendulum regression.

* Cleanup.

* Fix.

* LINT.

* Fix.

* Sort quick_learning test cases, add TD3.

* Sort quick_learning test cases, add TD3.

* Revert test_checkpoint_restore.py (debugging) changes.

* Fix old soft_q settings in documentation and test configs.

* More doc fixes.

* Fix test case.

* Fix test case.

* Lower test load.

* WIP.
Sven Mika 2020-03-01 20:53:35 +01:00 committed by GitHub
parent 3c6b94f3f5
commit 83e06cd30a
41 changed files with 1294 additions and 777 deletions

View file

@ -45,8 +45,7 @@ Then, we can tell DQN to train using these previously generated experiences with
--config='{
"input": "/tmp/cartpole-out",
"input_evaluation": [],
"exploration_final_eps": 0,
"exploration_fraction": 0}'
"explore": false}'
**Off-policy estimation:** Since the input experiences are not from running simulations, RLlib cannot report the true policy performance during training. However, you can use ``tensorboard --logdir=~/ray_results`` to monitor training progress via other metrics such as estimated Q-value. Alternatively, `off-policy estimation <https://arxiv.org/pdf/1511.03722.pdf>`__ can be used, which requires both the source and target action probabilities to be available (i.e., the ``action_prob`` batch key). For DQN, this means enabling soft Q learning so that actions are sampled from a probability distribution:
@ -58,8 +57,10 @@ Then, we can tell DQN to train using these previously generated experiences with
--config='{
"input": "/tmp/cartpole-out",
"input_evaluation": ["is", "wis"],
"soft_q": true,
"softmax_temp": 1.0}'
"exploration_config": {
"type": "SoftQ",
"temperature": 1.0,
}'
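
The same settings can also be passed through RLlib's Python API instead of the command line; below is a minimal sketch, assuming the offline data was written to ``/tmp/cartpole-out`` as above (the single training call is only illustrative):

    import ray
    from ray.rllib.agents.dqn import DQNTrainer

    ray.init()
    trainer = DQNTrainer(
        env="CartPole-v0",
        config={
            "input": "/tmp/cartpole-out",
            "input_evaluation": ["is", "wis"],
            # "exploration_config" replaces the removed "soft_q"/"softmax_temp" keys.
            "exploration_config": {
                "type": "SoftQ",
                "temperature": 1.0,
            },
        })
    print(trainer.train())
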
This example plot shows the Q-value metric in addition to importance sampling (IS) and weighted importance sampling (WIS) gain estimates (>1.0 means there is an estimated improvement over the original policy):
@ -121,8 +122,7 @@ RLlib supports multiplexing inputs from multiple input sources, including simula
"hdfs:/archive/cartpole": 0.3,
"sampler": 0.3,
},
"exploration_final_eps": 0,
"exploration_fraction": 0}'
"explore": false}'
Scaling I/O throughput
~~~~~~~~~~~~~~~~~~~~~~

View file

@ -58,6 +58,22 @@ py_test(
# Tag: agents_dir
# --------------------------------------------------------------------
# A2CTrainer
py_test(
name = "test_a2c",
tags = ["agents_dir"],
size = "small",
srcs = ["agents/a3c/tests/test_a2c.py"]
)
# DDPGTrainer
py_test(
name = "test_ddpg",
tags = ["agents_dir"],
size = "medium",
srcs = ["agents/ddpg/tests/test_ddpg.py"]
)
# DQNTrainer
py_test(
name = "test_dqn",
@ -66,12 +82,12 @@ py_test(
srcs = ["agents/dqn/tests/test_dqn.py"]
)
# A2CTrainer
# IMPALA
py_test(
name = "test_a2c",
name = "test_vtrace",
tags = ["agents_dir"],
size = "small",
srcs = ["agents/a3c/tests/test_a2c.py"]
srcs = ["agents/impala/tests/test_vtrace.py"]
)
# PGTrainer
@ -91,12 +107,12 @@ py_test(
"agents/ppo/tests/test.py"] # TODO(sven): Move down once PR 6889 merged
)
# IMPALA
# TD3Trainer
py_test(
name = "test_vtrace",
name = "test_td3",
tags = ["agents_dir"],
size = "small",
srcs = ["agents/impala/tests/test_vtrace.py"]
size = "medium",
srcs = ["agents/ddpg/tests/test_td3.py"]
)
# --------------------------------------------------------------------
@ -255,6 +271,390 @@ py_test(
]
)
# DDPG/APEX-DDPG/TD3
py_test(
name = "test_ddpg_pendulum_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 1}'"
]
)
py_test(
name = "test_ddpg_mountaincar_continuous_v0_num_workers_0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "MountainCarContinuous-v0",
"--run", "DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 0}'"
]
)
py_test(
name = "test_ddpg_mountaincar_continuous_v0_num_workers_1",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "MountainCarContinuous-v0",
"--run", "DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 1}'"
]
)
py_test(
name = "test_apex_ddpg_pendulum_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "APEX_DDPG",
"--ray-num-cpus", "8",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "4"
]
)
py_test(
name = "test_apex_ddpg_pendulum_v0_complete_episode_batches",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "APEX_DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1, \"batch_mode\": \"complete_episodes\", \"parameter_noise\": false}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_td3_pendulum_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "TD3",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 1}'"
]
)
# DQN/APEX
py_test(
name = "test_dqn_frozenlake_v0",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "FrozenLake-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'"
]
)
py_test(
name = "test_dqn_cartpole_v0_no_dueling",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"lr\": 1e-3, \"exploration_config\": {\"epsilon_timesteps\": 10000, \"final_epsilon\": 0.02}, \"dueling\": false, \"hiddens\": [], \"model\": {\"fcnet_hiddens\": [64], \"fcnet_activation\": \"relu\"}}'"
]
)
py_test(
name = "test_dqn_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2}'",
"--ray-num-cpus", "4"
]
)
py_test(
name = "test_dqn_cartpole_v0_with_offline_input_and_softq",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train", "external_files"],
size = "small",
# Include the json data file.
data = glob(["tests/data/cartpole_small/**"]),
args = [
"--env", "CartPole-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"input\": \"tests/data/cartpole_small\", \"learning_starts\": 0, \"input_evaluation\": [\"wis\", \"is\"], \"exploration_config\": {\"type\": \"SoftQ\"}}'"
]
)
py_test(
name = "test_dqn_pong_deterministic_v4",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "PongDeterministic-v4",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"lr\": 1e-4, \"exploration_config\": {\"epsilon_timesteps\": 200000, \"final_epsilon\": 0.01}, \"buffer_size\": 10000, \"sample_batch_size\": 4, \"learning_starts\": 10000, \"target_network_update_freq\": 1000, \"gamma\": 0.99, \"prioritized_replay\": true}'"
]
)
py_test(
name = "test_apex_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "APEX",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2, \"timesteps_per_iteration\": 1000, \"num_gpus\": 0, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "4"
]
)
# ES
py_test(
name = "test_es_pendulum_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "ES",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"stepsize\": 0.01, \"episodes_per_batch\": 20, \"train_batch_size\": 100, \"num_workers\": 2}'",
"--ray-num-cpus", "4"
]
)
py_test(
name = "test_es_pong_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pong-v0",
"--run", "ES",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"stepsize\": 0.01, \"episodes_per_batch\": 20, \"train_batch_size\": 100, \"num_workers\": 2}'",
"--ray-num-cpus", "4"
]
)
# IMPALA
py_test(
name = "test_impala_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_cartpole_v0_num_aggregation_workers_2",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"num_aggregation_workers\": 2, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "5",
]
)
py_test(
name = "test_impala_cartpole_v0_lstm",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"model\": {\"use_lstm\": true}}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_buffers_2",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_data_loader_buffers\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_cartpole_v0_buffers_2_lstm",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_data_loader_buffers\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0, \"model\": {\"use_lstm\": true}}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_pong_deterministic_v4_40k_ts_1G_obj_store",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "PongDeterministic-v4",
"--run", "IMPALA",
"--stop", "'{\"timesteps_total\": 40000}'",
"--ray-object-store-memory=1000000000",
"--config", "'{\"num_workers\": 1, \"num_gpus\": 0, \"num_envs_per_worker\": 32, \"sample_batch_size\": 50, \"train_batch_size\": 50, \"learner_queue_size\": 1}'"
]
)
# From test_rollout.sh (deprecated test file).
py_test(
name = "test_impala_rollout",
main = "tests/test_rollout.py",
data = ["train.py", "rollout.py"],
tags = ["quick_train"],
srcs = ["tests/test_rollout.py"]
)
# MARWIL
py_test(
name = "test_marwil_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train", "external_files"],
size = "small",
# Include the json data file.
data = glob(["tests/data/cartpole_small/**"]),
args = [
"--env", "CartPole-v0",
"--run", "MARWIL",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"input\": \"tests/data/cartpole_small\", \"learning_starts\": 0, \"input_evaluation\": [\"wis\", \"is\"], \"shuffle_buffer_size\": 10}'"
]
)
# PG
py_test(
name = "test_pg_tf_frozenlake_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "FrozenLake-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
py_test(
name = "test_pg_torch_frozenlake_v0",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--torch",
"--env", "FrozenLake-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
py_test(
name = "test_pg_tf_cartpole_v0",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
py_test(
name = "test_pg_torch_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--torch",
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500}'"
]
)
py_test(
name = "test_pg_tf_cartpole_v0_lstm",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1, \"model\": {\"use_lstm\": true, \"max_seq_len\": 100}}'"
]
)
py_test(
name = "test_pg_tf_cartpole_v0_multi_envs_per_worker",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1, \"num_envs_per_worker\": 10}'"
]
)
py_test(
name = "test_pg_tf_pong_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pong-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
# PPO/APPO
py_test(
@ -424,378 +824,6 @@ py_test(
]
)
# ES
py_test(
name = "test_es_pendulum_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "ES",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"stepsize\": 0.01, \"episodes_per_batch\": 20, \"train_batch_size\": 100, \"num_workers\": 2}'",
"--ray-num-cpus", "4"
]
)
py_test(
name = "test_es_pong_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pong-v0",
"--run", "ES",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"stepsize\": 0.01, \"episodes_per_batch\": 20, \"train_batch_size\": 100, \"num_workers\": 2}'",
"--ray-num-cpus", "4"
]
)
# DQN/APEX
py_test(
name = "test_dqn_frozenlake_v0",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "FrozenLake-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'"
]
)
py_test(
name = "test_dqn_cartpole_v0_no_dueling",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"lr\": 1e-3, \"schedule_max_timesteps\": 100000, \"exploration_fraction\": 0.1, \"exploration_final_eps\": 0.02, \"dueling\": false, \"hiddens\": [], \"model\": {\"fcnet_hiddens\": [64], \"fcnet_activation\": \"relu\"}}'"
]
)
py_test(
name = "test_dqn_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2}'",
"--ray-num-cpus", "4"
]
)
py_test(
name = "test_dqn_cartpole_v0_with_offline_input_and_softq",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train", "external_files"],
size = "small",
# Include the json data file.
data = glob(["tests/data/cartpole_small/**"]),
args = [
"--env", "CartPole-v0",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"input\": \"tests/data/cartpole_small\", \"learning_starts\": 0, \"input_evaluation\": [\"wis\", \"is\"], \"soft_q\": true}'"
]
)
py_test(
name = "test_dqn_pong_deterministic_v4",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "PongDeterministic-v4",
"--run", "DQN",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"lr\": 1e-4, \"schedule_max_timesteps\": 2000000, \"buffer_size\": 10000, \"exploration_fraction\": 0.1, \"exploration_final_eps\": 0.01, \"sample_batch_size\": 4, \"learning_starts\": 10000, \"target_network_update_freq\": 1000, \"gamma\": 0.99, \"prioritized_replay\": true}'"
]
)
py_test(
name = "test_apex_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "APEX",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2, \"timesteps_per_iteration\": 1000, \"num_gpus\": 0, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "4"
]
)
# PG
py_test(
name = "test_pg_tf_frozenlake_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "FrozenLake-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
py_test(
name = "test_pg_torch_frozenlake_v0",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--torch",
"--env", "FrozenLake-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
py_test(
name = "test_pg_tf_cartpole_v0",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
py_test(
name = "test_pg_torch_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--torch",
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500}'"
]
)
py_test(
name = "test_pg_tf_cartpole_v0_lstm",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1, \"model\": {\"use_lstm\": true, \"max_seq_len\": 100}}'"
]
)
py_test(
name = "test_pg_tf_cartpole_v0_multi_envs_per_worker",
main = "train.py", srcs = ["train.py"],
size = "small",
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1, \"num_envs_per_worker\": 10}'"
]
)
py_test(
name = "test_pg_tf_pong_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pong-v0",
"--run", "PG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"sample_batch_size\": 500, \"num_workers\": 1}'"
]
)
# DDPG/APEX-DDPG
py_test(
name = "test_ddpg_pendulum_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 1}'"
]
)
py_test(
name = "test_ddpg_mountaincar_continuous_v0_num_workers_0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "MountainCarContinuous-v0",
"--run", "DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 0}'"
]
)
py_test(
name = "test_ddpg_mountaincar_continuous_v0_num_workers_1",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "MountainCarContinuous-v0",
"--run", "DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 1}'"
]
)
py_test(
name = "test_apex_ddpg_pendulum_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "APEX_DDPG",
"--ray-num-cpus", "8",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "4"
]
)
py_test(
name = "test_apex_ddpg_pendulum_v0_complete_episode_batches",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "Pendulum-v0",
"--run", "APEX_DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1, \"batch_mode\": \"complete_episodes\", \"parameter_noise\": false}'",
"--ray-num-cpus", "4",
]
)
# IMPALA
py_test(
name = "test_impala_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_cartpole_v0_num_aggregation_workers_2",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"num_aggregation_workers\": 2, \"min_iter_time_s\": 1}'",
"--ray-num-cpus", "5",
]
)
py_test(
name = "test_impala_cartpole_v0_lstm",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"model\": {\"use_lstm\": true}}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_buffers_2",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_data_loader_buffers\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_cartpole_v0_buffers_2_lstm",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_data_loader_buffers\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0, \"model\": {\"use_lstm\": true}}'",
"--ray-num-cpus", "4",
]
)
py_test(
name = "test_impala_pong_deterministic_v4_40k_ts_1G_obj_store",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train"],
args = [
"--env", "PongDeterministic-v4",
"--run", "IMPALA",
"--stop", "'{\"timesteps_total\": 40000}'",
"--ray-object-store-memory=1000000000",
"--config", "'{\"num_workers\": 1, \"num_gpus\": 0, \"num_envs_per_worker\": 32, \"sample_batch_size\": 50, \"train_batch_size\": 50, \"learner_queue_size\": 1}'"
]
)
# From test_rollout.sh (deprecated test file).
py_test(
name = "test_impala_rollout",
main = "tests/test_rollout.py",
data = ["train.py", "rollout.py"],
tags = ["quick_train"],
srcs = ["tests/test_rollout.py"]
)
# MARWIL
py_test(
name = "test_marwil_cartpole_v0",
main = "train.py", srcs = ["train.py"],
tags = ["quick_train", "external_files"],
size = "small",
# Include the json data file.
data = glob(["tests/data/cartpole_small/**"]),
args = [
"--env", "CartPole-v0",
"--run", "MARWIL",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"input\": \"tests/data/cartpole_small\", \"learning_starts\": 0, \"input_evaluation\": [\"wis\", \"is\"], \"shuffle_buffer_size\": 10}'"
]
)
# --------------------------------------------------------------------
# Models and Distributions
# rllib/models/

View file

@ -1,12 +1,5 @@
from ray.rllib.agents.ddpg.apex import ApexDDPGTrainer
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, DEFAULT_CONFIG
from ray.rllib.agents.ddpg.td3 import TD3Trainer
from ray.rllib.utils import renamed_agent
ApexDDPGAgent = renamed_agent(ApexDDPGTrainer)
DDPGAgent = renamed_agent(DDPGTrainer)
__all__ = [
"DDPGAgent", "ApexDDPGAgent", "DDPGTrainer", "ApexDDPGTrainer",
"TD3Trainer", "DEFAULT_CONFIG"
]
__all__ = ["ApexDDPGTrainer", "DDPGTrainer", "DEFAULT_CONFIG", "TD3Trainer"]

View file

@ -1,17 +1,18 @@
from ray.rllib.agents.dqn.apex import APEX_TRAINER_PROPERTIES
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
DEFAULT_CONFIG as DDPG_CONFIG
from ray.rllib.utils import merge_dicts
APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
APEX_DDPG_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
DDPG_CONFIG, # see also the options in ddpg.py, which are also supported
{
"optimizer": merge_dicts(
DDPG_CONFIG["optimizer"], {
"max_weight_sync_delay": 400,
"num_replay_buffer_shards": 4,
"debug": False
}),
"optimizer": {
"max_weight_sync_delay": 400,
"num_replay_buffer_shards": 4,
"debug": False
},
"exploration_config": {
"type": "PerWorkerOrnsteinUhlenbeckNoise"
},
"n_step": 3,
"num_gpus": 0,
"num_workers": 32,
@ -21,7 +22,6 @@ APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
"sample_batch_size": 50,
"target_network_update_freq": 500000,
"timesteps_per_iteration": 25000,
"per_worker_exploration": True,
"worker_side_prioritization": True,
"min_iter_time_s": 30,
},
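
With the per_worker_exploration flag gone, per-worker noise scaling is now expressed through the default exploration_config above (PerWorkerOrnsteinUhlenbeckNoise). A minimal sketch mirroring the quick-train BUILD target further up; Pendulum-v0 and the tiny worker/CPU counts are illustrative only:

    import ray
    from ray.rllib.agents.ddpg import ApexDDPGTrainer

    ray.init(num_cpus=4)
    trainer = ApexDDPGTrainer(
        env="Pendulum-v0",
        config={
            "num_workers": 2,
            "optimizer": {"num_replay_buffer_shards": 1},
            "learning_starts": 100,
            "min_iter_time_s": 1,
            # No exploration keys needed here: PerWorkerOrnsteinUhlenbeckNoise
            # is already the Ape-X DDPG default after this change.
        })
    print(trainer.train())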

View file

@ -1,7 +1,10 @@
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
from ray.rllib.agents.ddpg.ddpg_policy import DDPGTFPolicy
from ray.rllib.utils.schedules import ConstantSchedule, PiecewiseSchedule
from ray.rllib.utils.deprecation import deprecation_warning, \
DEPRECATED_VALUE
from ray.rllib.utils.exploration.per_worker_ornstein_uhlenbeck_noise import \
PerWorkerOrnsteinUhlenbeckNoise
# yapf: disable
# __sphinx_doc_begin__
@ -55,49 +58,35 @@ DEFAULT_CONFIG = with_common_config({
"n_step": 1,
# === Exploration ===
# Turns on annealing schedule for exploration noise. Exploration is
# annealed from 1.0 to exploration_final_eps over schedule_max_timesteps
# scaled by exploration_fraction. Original DDPG and TD3 papers do not
# anneal noise, so this is False by default.
"exploration_should_anneal": False,
# Max num timesteps for annealing schedules.
"schedule_max_timesteps": 100000,
"exploration_config": {
# DDPG uses OrnsteinUhlenbeck (stateful) noise to be added to NN-output
# actions (after a possible pure random phase of n timesteps).
"type": "OrnsteinUhlenbeckNoise",
# For how many timesteps should we return completely random actions,
# before we start adding (scaled) noise?
"random_timesteps": 1000,
# The OU-base scaling factor to always apply to action-added noise.
"ou_base_scale": 0.1,
# The OU theta param.
"ou_theta": 0.15,
# The OU sigma param.
"ou_sigma": 0.2,
# The initial noise scaling factor.
"initial_scale": 1.0,
# The final noise scaling factor.
"final_scale": 1.0,
# Timesteps over which to anneal scale (from initial to final values).
"scale_timesteps": 10000,
},
# Number of env steps to optimize for before returning
"timesteps_per_iteration": 1000,
# Fraction of entire training period over which the exploration rate is
# annealed
"exploration_fraction": 0.1,
# Final scaling multiplier for action noise (initial is 1.0)
"exploration_final_scale": 0.02,
# valid values: "ou" (time-correlated, like original DDPG paper),
# "gaussian" (IID, like TD3 paper)
"exploration_noise_type": "ou",
# OU-noise scale; this can be used to scale down magnitude of OU noise
# before adding to actions (requires "exploration_noise_type" to be "ou")
"exploration_ou_noise_scale": 0.1,
# theta for OU
"exploration_ou_theta": 0.15,
# sigma for OU
"exploration_ou_sigma": 0.2,
# gaussian stddev of act noise for exploration (requires
# "exploration_noise_type" to be "gaussian")
"exploration_gaussian_sigma": 0.1,
# If True parameter space noise will be used for exploration
# See https://blog.openai.com/better-exploration-with-parameter-noise/
"parameter_noise": False,
# Until this many timesteps have elapsed, the agent's policy will be
# ignored & it will instead take uniform random actions. Can be used in
# conjunction with learning_starts (which controls when the first
# optimization step happens) to decrease dependence of exploration &
# optimization on initial policy parameters. Note that this will be
# disabled when the action noise scale is set to 0 (e.g during evaluation).
"pure_exploration_steps": 1000,
# Extra configuration that disables exploration.
"evaluation_config": {
"exploration_fraction": 0,
"exploration_final_eps": 0,
"explore": False
},
# === Replay buffer ===
# Size of the replay buffer. Note that if async_updates is set, then
# each worker will have a replay buffer of this size.
@ -150,8 +139,6 @@ DEFAULT_CONFIG = with_common_config({
# to increase if your environment is particularly slow to sample, or if
# you're using the Async or Ape-X optimizers.
"num_workers": 0,
# Whether to use a distribution of epsilons across workers for exploration.
"per_worker_exploration": False,
# Whether to compute priorities on workers.
"worker_side_prioritization": False,
# Prevent iterations from going lower than this time span
@ -161,76 +148,47 @@ DEFAULT_CONFIG = with_common_config({
# yapf: enable
def make_exploration_schedule(config, worker_index):
# Modification of DQN's schedule to take into account
# `exploration_ou_noise_scale`
if config["per_worker_exploration"]:
assert config["num_workers"] > 1, "This requires multiple workers"
if worker_index >= 0:
# FIXME: what do magic constants mean? (0.4, 7)
max_index = float(config["num_workers"] - 1)
exponent = 1 + worker_index / max_index * 7
return ConstantSchedule(0.4**exponent)
else:
# local ev should have zero exploration so that eval rollouts
# run properly
return ConstantSchedule(0.0)
elif config["exploration_should_anneal"]:
return PiecewiseSchedule(
endpoints=[(0, 1.0), (int(config["exploration_fraction"] *
config["schedule_max_timesteps"]),
config["exploration_final_scale"])],
outside_value=config["exploration_final_scale"])
else:
# *always* add exploration noise
return ConstantSchedule(1.0)
def setup_ddpg_exploration(trainer):
trainer.exploration0 = make_exploration_schedule(trainer.config, -1)
trainer.explorations = [
make_exploration_schedule(trainer.config, i)
for i in range(trainer.config["num_workers"])
]
def update_worker_explorations(trainer):
global_timestep = trainer.optimizer.num_steps_sampled
exp_vals = [trainer.exploration0.value(global_timestep)]
trainer.workers.local_worker().foreach_trainable_policy(
lambda p, _: p.set_epsilon(exp_vals[0]))
for i, e in enumerate(trainer.workers.remote_workers()):
exp_val = trainer.explorations[i].value(global_timestep)
e.foreach_trainable_policy.remote(lambda p, _: p.set_epsilon(exp_val))
exp_vals.append(exp_val)
trainer.train_start_timestep = global_timestep
trainer.exploration_infos = exp_vals
def add_pure_exploration_phase(trainer):
global_timestep = trainer.optimizer.num_steps_sampled
pure_expl_steps = trainer.config["pure_exploration_steps"]
if pure_expl_steps:
# tell workers whether they should do pure exploration
only_explore = global_timestep < pure_expl_steps
trainer.workers.local_worker().foreach_trainable_policy(
lambda p, _: p.set_pure_exploration_phase(only_explore))
for e in trainer.workers.remote_workers():
e.foreach_trainable_policy.remote(
lambda p, _: p.set_pure_exploration_phase(only_explore))
update_worker_explorations(trainer)
def validate_config(config):
# PyTorch check.
if config["use_pytorch"]:
raise ValueError("DDPG does not support PyTorch yet! Use tf instead.")
# TODO(sven): Remove at some point.
# Backward compatibility of noise-based exploration config.
schedule_max_timesteps = None
if config.get("schedule_max_timesteps", DEPRECATED_VALUE) != \
DEPRECATED_VALUE:
deprecation_warning("schedule_max_timesteps",
"exploration_config.scale_timesteps")
schedule_max_timesteps = config["schedule_max_timesteps"]
if config.get("exploration_final_scale", DEPRECATED_VALUE) != \
DEPRECATED_VALUE:
deprecation_warning("exploration_final_scale",
"exploration_config.final_scale")
if isinstance(config["exploration_config"], dict):
config["exploration_config"]["final_scale"] = \
config.pop("exploration_final_scale")
if config.get("exploration_fraction", DEPRECATED_VALUE) != \
DEPRECATED_VALUE:
assert schedule_max_timesteps is not None
deprecation_warning("exploration_fraction",
"exploration_config.scale_timesteps")
if isinstance(config["exploration_config"], dict):
config["exploration_config"]["scale_timesteps"] = config.pop(
"exploration_fraction") * schedule_max_timesteps
if config.get("per_worker_exploration", DEPRECATED_VALUE) != \
DEPRECATED_VALUE:
deprecation_warning(
"per_worker_exploration",
"exploration_config.type=PerWorkerOrnsteinUhlenbeckNoise")
if isinstance(config["exploration_config"], dict):
config["exploration_config"]["type"] = \
PerWorkerOrnsteinUhlenbeckNoise
DDPGTrainer = GenericOffPolicyTrainer.with_updates(
name="DDPG",
default_config=DEFAULT_CONFIG,
default_policy=DDPGTFPolicy,
validate_config=validate_config,
before_init=setup_ddpg_exploration,
before_train_step=add_pure_exploration_phase)
)
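
The deprecation shims above keep the old flat keys working, but new configs should set the nested exploration_config directly. A minimal sketch for a local Pendulum-v0 run; the values simply mirror the deprecated flags they replace and are not tuned recommendations:

    import ray
    import ray.rllib.agents.ddpg as ddpg

    ray.init()
    config = ddpg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # sample in the trainer process
    config["exploration_config"] = {
        "type": "OrnsteinUhlenbeckNoise",
        "random_timesteps": 1000,  # was: pure_exploration_steps
        "ou_base_scale": 0.1,      # was: exploration_ou_noise_scale
        "ou_theta": 0.15,          # was: exploration_ou_theta
        "ou_sigma": 0.2,           # was: exploration_ou_sigma
        "initial_scale": 1.0,
        "final_scale": 0.02,       # was: exploration_final_scale
        "scale_timesteps": 10000,  # was: exploration_fraction * schedule_max_timesteps
    }
    trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
    print(trainer.train())
    # Inspect the exploration component's current state (e.g. the OU noise scale).
    print(trainer.get_policy().get_exploration_info())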

View file

@ -7,6 +7,7 @@ from ray.rllib.agents.dqn.dqn_policy import postprocess_nstep_and_prio
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_action_dist import Deterministic
from ray.rllib.utils.annotations import override
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.policy.policy import Policy
@ -42,20 +43,24 @@ class DDPGPostprocessing:
list(x) for x in sample_batch.columns(
[SampleBatch.CUR_OBS, SampleBatch.ACTIONS])
]
self.sess.run(self.remove_noise_op)
clean_actions = self.sess.run(
self.output_actions,
self.sess.run(self.remove_parameter_noise_op)
# TODO(sven): This won't work if exploration != Noise, which is
# probably fine as parameter_noise will soon be its own
# Exploration class.
clean_actions, cur_noise_scale = self.sess.run(
[self.output_actions,
self.exploration.get_info()],
feed_dict={
self.cur_observations: states,
self.stochastic: False,
self.noise_scale: .0,
self.pure_exploration_phase: False,
self._is_exploring: False,
})
distance_in_action_space = np.sqrt(
np.mean(np.square(clean_actions - noisy_actions)))
self.pi_distance = distance_in_action_space
if distance_in_action_space < \
self.config["exploration_ou_sigma"] * self.cur_noise_scale:
self.config["exploration_config"].get("ou_sigma", 0.2) * \
cur_noise_scale:
# multiplying the sampled OU noise by noise scale is
# equivalent to multiplying the sigma of OU by noise scale
self.parameter_noise_sigma_val *= 1.01
@ -82,14 +87,11 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
"using a Tuple action space, or the multi-agent API.")
self.config = config
self.cur_noise_scale = 1.0
self.cur_pure_exploration_phase = False
self.dim_actions = action_space.shape[0]
self.low_action = action_space.low
self.high_action = action_space.high
# create global step for counting the number of update operations
# Create global step for counting the number of update operations.
self.global_step = tf.train.get_or_create_global_step()
# Create sampling timestep placeholder.
timestep = tf.placeholder(tf.int32, (), name="timestep")
# use separate optimizers for actor & critic
self._actor_optimizer = tf.train.AdamOptimizer(
@ -97,11 +99,7 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
self._critic_optimizer = tf.train.AdamOptimizer(
learning_rate=self.config["critic_lr"])
# Action inputs
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
self.noise_scale = tf.placeholder(tf.float32, (), name="noise_scale")
self.pure_exploration_phase = tf.placeholder(
tf.bool, (), name="pure_exploration_phase")
# Observation inputs.
self.cur_observations = tf.placeholder(
tf.float32,
shape=(None, ) + observation_space.shape,
@ -118,19 +116,14 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
var for var in self.policy_vars if "LayerNorm" not in var.name
])
# Create exploration component.
self.exploration = self._create_exploration(action_space, config)
explore = tf.placeholder_with_default(True, (), name="is_exploring")
# Action outputs
with tf.variable_scope(ACTION_SCOPE):
self.output_actions = self._add_exploration_noise(
policy_out, self.stochastic, self.noise_scale,
self.pure_exploration_phase, action_space)
if self.config["smooth_target_policy"]:
self.reset_noise_op = tf.no_op()
else:
with tf.variable_scope(ACTION_SCOPE, reuse=True):
exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
self.reset_noise_op = tf.assign(exploration_sample,
self.dim_actions * [.0])
self.output_actions, _ = self.exploration.get_exploration_action(
policy_out, Deterministic, self.policy_model, explore,
timestep)
# Replay inputs
self.obs_t = tf.placeholder(
@ -294,7 +287,9 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
sampled_action=self.output_actions,
loss=self.actor_loss + self.critic_loss,
loss_inputs=self.loss_inputs,
update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops)
update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops,
explore=explore,
timestep=timestep)
self.sess.run(tf.global_variables_initializer())
# Note that this encompasses both the policy and Q-value networks and
@ -364,16 +359,6 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
+ self._critic_grads_and_vars
return grads_and_vars
@override(TFPolicy)
def extra_compute_action_feed_dict(self):
return {
# FIXME: what about turning off exploration? Isn't that a good
# idea?
self.stochastic: True,
self.noise_scale: self.cur_noise_scale,
self.pure_exploration_phase: self.cur_pure_exploration_phase,
}
@override(TFPolicy)
def extra_compute_grad_fetches(self):
return {
@ -389,19 +374,6 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
def set_weights(self, weights):
self.variables.set_weights(weights)
@override(Policy)
def get_state(self):
return [
TFPolicy.get_state(self), self.cur_noise_scale,
self.cur_pure_exploration_phase
]
@override(Policy)
def set_state(self, state):
TFPolicy.set_state(self, state[0])
self.set_epsilon(state[1])
self.set_pure_exploration_phase(state[2])
def _build_q_network(self, obs, obs_space, action_space, actions):
if self.config["use_state_preprocessor"]:
q_model = ModelCatalog.get_model({
@ -444,7 +416,7 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
action_out = tf.layers.dense(
action_out, units=hidden, activation=activation)
action_out = tf.layers.dense(
action_out, units=self.dim_actions, activation=None)
action_out, units=action_space.shape[0], activation=None)
# Use sigmoid to scale to [0,1], but also double magnitude of input to
# emulate behaviour of tanh activation used in DDPG and TD3 papers.
@ -458,81 +430,6 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
return actions, model
def _add_exploration_noise(self, deterministic_actions,
should_be_stochastic, noise_scale,
enable_pure_exploration, action_space):
noise_type = self.config["exploration_noise_type"]
action_low = action_space.low
action_high = action_space.high
action_range = action_space.high - action_low
def compute_stochastic_actions():
def make_noisy_actions():
# shape of deterministic_actions is [None, dim_action]
if noise_type == "gaussian":
# add IID Gaussian noise for exploration, TD3-style
normal_sample = noise_scale * tf.random_normal(
tf.shape(deterministic_actions),
stddev=self.config["exploration_gaussian_sigma"])
stochastic_actions = tf.clip_by_value(
deterministic_actions + normal_sample,
action_low * tf.ones_like(deterministic_actions),
action_high * tf.ones_like(deterministic_actions))
elif noise_type == "ou":
# add OU noise for exploration, DDPG-style
zero_acts = action_low.size * [.0]
exploration_sample = tf.get_variable(
name="ornstein_uhlenbeck",
dtype=tf.float32,
initializer=zero_acts,
trainable=False)
normal_sample = tf.random_normal(
shape=[action_low.size], mean=0.0, stddev=1.0)
ou_new = self.config["exploration_ou_theta"] \
* -exploration_sample \
+ self.config["exploration_ou_sigma"] * normal_sample
exploration_value = tf.assign_add(exploration_sample,
ou_new)
base_scale = self.config["exploration_ou_noise_scale"]
noise = noise_scale * base_scale \
* exploration_value * action_range
stochastic_actions = tf.clip_by_value(
deterministic_actions + noise,
action_low * tf.ones_like(deterministic_actions),
action_high * tf.ones_like(deterministic_actions))
else:
raise ValueError(
"Unknown noise type '%s' (try 'ou' or 'gaussian')" %
noise_type)
return stochastic_actions
def make_uniform_random_actions():
# pure random exploration option
uniform_random_actions = tf.random_uniform(
tf.shape(deterministic_actions))
# rescale uniform random actions according to action range
tf_range = tf.constant(action_range[None], dtype="float32")
tf_low = tf.constant(action_low[None], dtype="float32")
uniform_random_actions = uniform_random_actions * tf_range \
+ tf_low
return uniform_random_actions
stochastic_actions = tf.cond(
# need to condition on noise_scale > 0 because zeroing
# noise_scale is how a worker signals no noise should be used
# (this is ugly and should be fixed by adding an "eval_mode"
# config flag or something)
tf.logical_and(enable_pure_exploration, noise_scale > 0),
true_fn=make_uniform_random_actions,
false_fn=make_noisy_actions)
return stochastic_actions
enable_stochastic = tf.logical_and(should_be_stochastic,
not self.config["parameter_noise"])
actions = tf.cond(enable_stochastic, compute_stochastic_actions,
lambda: deterministic_actions)
return actions
def _build_actor_critic_loss(self,
q_t,
q_tp1,
@ -580,7 +477,8 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
return critic_loss, actor_loss, td_error
def _build_parameter_noise(self, pnet_params):
self.parameter_noise_sigma_val = self.config["exploration_ou_sigma"]
self.parameter_noise_sigma_val = \
self.config["exploration_config"].get("ou_sigma", 0.2)
self.parameter_noise_sigma = tf.get_variable(
initializer=tf.constant_initializer(
self.parameter_noise_sigma_val),
@ -600,7 +498,7 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
remove_noise_ops = list()
for var, var_noise in zip(pnet_params, self.parameter_noise):
remove_noise_ops.append(tf.assign_add(var, -var_noise))
self.remove_noise_op = tf.group(*tuple(remove_noise_ops))
self.remove_parameter_noise_op = tf.group(*tuple(remove_noise_ops))
generate_noise_ops = list()
for var_noise in self.parameter_noise:
generate_noise_ops.append(
@ -630,9 +528,6 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
})
return td_err
def reset_noise(self, sess):
sess.run(self.reset_noise_op)
def add_parameter_noise(self):
if self.config["parameter_noise"]:
self.sess.run(self.add_noise_op)
@ -642,13 +537,3 @@ class DDPGTFPolicy(DDPGPostprocessing, TFPolicy):
tau = tau or self.tau_value
return self.sess.run(
self.update_target_expr, feed_dict={self.tau: tau})
def set_epsilon(self, epsilon):
# set_epsilon is called by optimizer to anneal exploration as
# necessary, and to turn it off during evaluation. The "epsilon" part
# is a carry-over from DQN, which uses epsilon-greedy exploration
# rather than adding action noise to the output of a policy network.
self.cur_noise_scale = epsilon
def set_pure_exploration_phase(self, pure_exploration_phase):
self.cur_pure_exploration_phase = pure_exploration_phase

View file

@ -3,12 +3,10 @@
By default, this uses a near-identical configuration to that reported in the
TD3 paper.
"""
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
DEFAULT_CONFIG as DDPG_CONFIG
from ray.rllib.utils import merge_dicts
TD3_DEFAULT_CONFIG = merge_dicts(
TD3_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
DDPG_CONFIG,
{
# largest changes: twin Q functions, delayed policy updates, and target
@ -18,15 +16,27 @@ TD3_DEFAULT_CONFIG = merge_dicts(
"smooth_target_policy": True,
"target_noise": 0.2,
"target_noise_clip": 0.5,
"exploration_config": {
# TD3 uses simple Gaussian noise on top of deterministic NN-output
# actions (after a possible pure random phase of n timesteps).
"type": "GaussianNoise",
# For how many timesteps should we return completely random
# actions, before we start adding (scaled) noise?
"random_timesteps": 10000,
# Gaussian stddev of action noise for exploration.
"stddev": 0.1,
# Scaling settings by which the Gaussian noise is scaled before
# being added to the actions. NOTE: The scale timesteps start only
# after(!) any random steps have been finished.
# By default, do not anneal over time (fixed 1.0).
"initial_scale": 1.0,
"final_scale": 1.0,
"scale_timesteps": 1
},
# other changes & things we want to keep fixed: IID Gaussian
# exploration noise, larger actor learning rate, no l2 regularisation,
# no Huber loss, etc.
"exploration_should_anneal": False,
"exploration_noise_type": "gaussian",
"exploration_gaussian_sigma": 0.1,
# other changes & things we want to keep fixed:
# larger actor learning rate, no l2 regularisation, no Huber loss, etc.
"learning_starts": 10000,
"pure_exploration_steps": 10000,
"actor_hiddens": [400, 300],
"critic_hiddens": [400, 300],
"n_step": 1,
@ -40,14 +50,12 @@ TD3_DEFAULT_CONFIG = merge_dicts(
"target_network_update_freq": 0,
"num_workers": 0,
"num_gpus_per_worker": 0,
"per_worker_exploration": False,
"worker_side_prioritization": False,
"buffer_size": 1000000,
"prioritized_replay": False,
"clip_rewards": False,
"use_state_preprocessor": False,
},
)
})
TD3Trainer = DDPGTrainer.with_updates(
name="TD3", default_config=TD3_DEFAULT_CONFIG)

View file

@ -0,0 +1,87 @@
import numpy as np
import unittest
import ray.rllib.agents.ddpg as ddpg
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.test_utils import check
tf = try_import_tf()
class TestDDPG(unittest.TestCase):
def test_ddpg_compilation(self):
"""Test whether a DDPGTrainer can be built with both frameworks."""
config = ddpg.DEFAULT_CONFIG.copy()
config["num_workers"] = 0 # Run locally.
# Test against all frameworks.
for fw in ["tf", "eager", "torch"]:
if fw != "tf":
continue
config["eager"] = True if fw == "eager" else False
config["use_pytorch"] = True if fw == "torch" else False
trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
num_iterations = 2
for i in range(num_iterations):
results = trainer.train()
print(results)
def test_ddpg_exploration_and_with_random_prerun(self):
"""Tests DDPG's Exploration (w/ random actions for n timesteps)."""
config = ddpg.DEFAULT_CONFIG.copy()
config["num_workers"] = 0 # Run locally.
obs = np.array([0.0, 0.1, -0.1])
# Test against all frameworks.
for fw in ["tf", "eager", "torch"]:
if fw != "tf":
continue
config["eager"] = True if fw == "eager" else False
config["use_pytorch"] = True if fw == "torch" else False
# Default OUNoise setup.
trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
# Setting explore=False should always return the same action.
a_ = trainer.compute_action(obs, explore=False)
for _ in range(50):
a = trainer.compute_action(obs, explore=False)
check(a, a_)
# explore=None (default: explore) should return different actions.
actions = []
for _ in range(50):
actions.append(trainer.compute_action(obs))
check(np.std(actions), 0.0, false=True)
# Check randomness at beginning.
config["exploration_config"] = {
# Act randomly at beginning ...
"random_timesteps": 50,
# Then act very close to the deterministic actions thereafter.
"ou_base_scale": 0.001,
"initial_scale": 0.001,
"final_scale": 0.001,
}
trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
# ts=1 (get a deterministic action as per explore=False).
deterministic_action = trainer.compute_action(obs, explore=False)
# ts=2-50 (in random window).
random_a = []
for _ in range(49):
random_a.append(trainer.compute_action(obs, explore=True))
check(random_a[-1], deterministic_action, false=True)
self.assertTrue(np.std(random_a) > 0.5)
# ts > 50 (a=deterministic_action + scale * N[0,1])
for _ in range(50):
a = trainer.compute_action(obs, explore=True)
check(a, deterministic_action, rtol=0.1)
# ts >> 50 (BUT: explore=False -> expect deterministic action).
for _ in range(50):
a = trainer.compute_action(obs, explore=False)
check(a, deterministic_action)
if __name__ == "__main__":
import unittest
unittest.main(verbosity=1)

View file

@ -0,0 +1,87 @@
import numpy as np
import unittest
import ray.rllib.agents.ddpg.td3 as td3
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.test_utils import check
tf = try_import_tf()
class TestTD3(unittest.TestCase):
def test_td3_compilation(self):
"""Test whether a TD3Trainer can be built with both frameworks."""
config = td3.TD3_DEFAULT_CONFIG.copy()
config["num_workers"] = 0 # Run locally.
# Test against all frameworks.
for fw in ["tf", "eager", "torch"]:
if fw != "tf":
continue
config["eager"] = True if fw == "eager" else False
config["use_pytorch"] = True if fw == "torch" else False
trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
num_iterations = 2
for i in range(num_iterations):
results = trainer.train()
print(results)
def test_td3_exploration_and_with_random_prerun(self):
"""Tests TD3's Exploration (w/ random actions for n timesteps)."""
config = td3.TD3_DEFAULT_CONFIG.copy()
config["num_workers"] = 0 # Run locally.
obs = np.array([0.0, 0.1, -0.1])
# Test against all frameworks.
for fw in ["tf", "eager", "torch"]:
if fw != "tf":
continue
config["eager"] = True if fw == "eager" else False
config["use_pytorch"] = True if fw == "torch" else False
# Default GaussianNoise setup.
trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
# Setting explore=False should always return the same action.
a_ = trainer.compute_action(obs, explore=False)
for _ in range(50):
a = trainer.compute_action(obs, explore=False)
check(a, a_)
# explore=None (default: explore) should return different actions.
actions = []
for _ in range(50):
actions.append(trainer.compute_action(obs))
check(np.std(actions), 0.0, false=True)
# Check randomness at beginning.
config["exploration_config"] = {
# Act randomly at beginning ...
"random_timesteps": 30,
# Then act very close to the deterministic actions thereafter.
"stddev": 0.001,
"initial_scale": 0.001,
"final_scale": 0.001,
}
trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
# ts=1 (get a deterministic action as per explore=False).
deterministic_action = trainer.compute_action(obs, explore=False)
# ts=2-30 (in random window).
random_a = []
for _ in range(29):
random_a.append(trainer.compute_action(obs, explore=True))
check(random_a[-1], deterministic_action, false=True)
self.assertTrue(np.std(random_a) > 0.5)
# ts > 30 (a=deterministic_action + scale * N[0,1])
for _ in range(50):
a = trainer.compute_action(obs, explore=True)
check(a, deterministic_action, rtol=0.1)
# ts >> 30 (BUT: explore=False -> expect deterministic action).
for _ in range(50):
a = trainer.compute_action(obs, explore=False)
check(a, deterministic_action)
if __name__ == "__main__":
import unittest
unittest.main(verbosity=1)

View file

@ -211,7 +211,7 @@ def validate_config_and_setup_param_noise(config):
if config.get("soft_q", DEPRECATED_VALUE) != DEPRECATED_VALUE:
deprecation_warning(
"soft_q", "exploration_config={"
"type=StochasticSampling, temperature=[float]"
"type=SoftQ, temperature=[float]"
"}")
config["exploration_config"] = {
"type": "SoftQ",

View file

@ -15,14 +15,14 @@ class TestDQN(unittest.TestCase):
config["num_workers"] = 0 # Run locally.
# tf.
config["eager"] = True
config["eager"] = False
trainer = dqn.DQNTrainer(config=config, env="CartPole-v0")
num_iterations = 2
for i in range(num_iterations):
results = trainer.train()
print(results)
config["eager"] = False
config["eager"] = True
trainer = dqn.DQNTrainer(config=config, env="CartPole-v0")
num_iterations = 2
for i in range(num_iterations):

View file

@ -550,14 +550,11 @@ class Trainer(Trainable):
else:
self.env_creator = lambda env_config: None
# Merge the supplied config with the class default.
merged_config = copy.deepcopy(self._default_config)
merged_config = deep_update(merged_config, config,
self._allow_unknown_configs,
self._allow_unknown_subkeys,
self._override_all_subkeys_if_type_changes)
# Merge the supplied config with the class default, but store the
# user-provided one.
self.raw_user_config = config
self.config = merged_config
self.config = Trainer.merge_trainer_configs(self._default_config,
config)
if self.config["normalize_actions"]:
inner = self.env_creator
@ -767,8 +764,7 @@ class Trainer(Trainable):
preprocessed, update=False)
# Figure out the current (sample) time step and pass it into Policy.
timestep = self.optimizer.num_steps_sampled \
if self._has_policy_optimizer() else None
self.global_vars["timestep"] += 1
result = self.get_policy(policy_id).compute_single_action(
filtered_obs,
@ -778,7 +774,7 @@ class Trainer(Trainable):
info,
clip_actions=self.config["clip_actions"],
explore=explore,
timestep=timestep)
timestep=self.global_vars["timestep"])
if state or full_fetch:
return result
@ -878,6 +874,13 @@ class Trainer(Trainable):
"the DEFAULT_CONFIG defined by each agent for more info.\n\n"
"The config of this agent is: {}".format(config))
@classmethod
def merge_trainer_configs(cls, config1, config2):
config1 = copy.deepcopy(config1)
return deep_update(config1, config2, cls._allow_unknown_configs,
cls._allow_unknown_subkeys,
cls._override_all_subkeys_if_type_changes)
@staticmethod
def _validate_config(config):
if "policy_graphs" in config["multiagent"]:

View file

@ -266,7 +266,8 @@ class SquashedGaussian(TFActionDistribution):
class Deterministic(TFActionDistribution):
"""Action distribution that returns the input values directly.
This is similar to DiagGaussian with standard deviation zero.
This is similar to DiagGaussian with standard deviation zero (thus only
requiring the "mean" values as NN output).
"""
@override(ActionDistribution)

View file

@ -83,8 +83,8 @@ class OffPolicyEstimator:
"Off-policy estimation is not possible unless the inputs "
"include action probabilities (i.e., the policy is stochastic "
"and emits the 'action_prob' key). For DQN this means using "
"`soft_q: True`. You can also set `input_evaluation: []` to "
"disable estimation.")
"`exploration_config: {type: 'SoftQ'}`. You can also set "
"`input_evaluation: []` to disable estimation.")
@DeveloperAPI
def get_metrics(self):

View file

@ -37,7 +37,7 @@ def _convert_to_numpy(x):
if x is None:
return None
try:
return x.numpy()
return tf.nest.map_structure(lambda component: component.numpy(), x)
except AttributeError:
raise TypeError(
("Object of type {} has no method to convert to numpy.").format(
@ -402,6 +402,10 @@ def build_eager_tf_policy(name,
zip([(tf.convert_to_tensor(g) if g is not None else None)
for g in gradients], self.model.trainable_variables()))
@override(Policy)
def get_exploration_info(self):
return _convert_to_numpy(self.exploration.get_info())
@override(Policy)
def get_weights(self):
variables = self.variables()
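
The eager policy's numpy conversion now handles nested structures (e.g. a dict returned by Exploration.get_info()), not only single tensors. A tiny standalone sketch of the idea, assuming TF eager execution; the key names are made up for illustration:

    import tensorflow as tf  # assumes eager mode (TF2, or TF1 with eager enabled)

    # A nested info structure, as an Exploration component might return.
    info = {"cur_scale": tf.constant(0.1), "last_noise": tf.constant([0.0, 0.5])}

    # Old helper: info.numpy() would raise AttributeError on a dict.
    # New helper: convert every leaf tensor of the structure.
    info_np = tf.nest.map_structure(lambda t: t.numpy(), info)
    print(info_np)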

View file

@ -13,7 +13,6 @@ from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.utils.annotations import override, DeveloperAPI
from ray.rllib.utils.debug import summarize
from ray.rllib.utils.exploration.exploration import Exploration
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.schedules import ConstantSchedule, PiecewiseSchedule
from ray.rllib.utils.tf_run_builder import TFRunBuilder
@ -332,8 +331,7 @@ class TFPolicy(Policy):
@override(Policy)
def get_exploration_info(self):
if isinstance(self.exploration, Exploration):
return self._sess.run(self.exploration_info)
return self._sess.run(self.exploration_info)
@override(Policy)
def get_weights(self):

View file

@ -20,16 +20,22 @@ def get_mean_action(alg, obs):
ray.init(num_cpus=10, object_store_memory=1e9)
CONFIGS = {
"SAC": {},
"SAC": {
"explore": False,
},
"ES": {
"explore": False,
"episodes_per_batch": 10,
"train_batch_size": 100,
"num_workers": 2,
"noise_size": 2500000,
"observation_filter": "MeanStdFilter"
},
"DQN": {},
"DQN": {
"explore": False
},
"APEX_DDPG": {
"explore": False,
"observation_filter": "MeanStdFilter",
"num_workers": 2,
"min_iter_time_s": 1,
@ -38,19 +44,21 @@ CONFIGS = {
},
},
"DDPG": {
"pure_exploration_steps": 0,
"exploration_ou_noise_scale": 0.0,
"explore": False,
"timesteps_per_iteration": 100
},
"PPO": {
"explore": False,
"num_sgd_iter": 5,
"train_batch_size": 1000,
"num_workers": 2
},
"A3C": {
"explore": False,
"num_workers": 1
},
"ARS": {
"explore": False,
"num_rollouts": 10,
"num_workers": 2,
"noise_size": 2500000,
@ -70,7 +78,7 @@ def test_ckpt_restore(use_object_store, alg_name, failures):
alg2 = cls(config=CONFIGS[name], env="CartPole-v0")
env = gym.make("CartPole-v0")
for _ in range(3):
for _ in range(2):
res = alg1.train()
print("current status: " + str(res))

View file

@ -4,6 +4,8 @@ import unittest
import ray
import ray.rllib.agents.a3c as a3c
import ray.rllib.agents.ddpg as ddpg
import ray.rllib.agents.ddpg.td3 as td3
import ray.rllib.agents.dqn as dqn
import ray.rllib.agents.impala as impala
import ray.rllib.agents.pg as pg
@ -27,9 +29,12 @@ def test_explorations(run,
# Test all frameworks.
for fw in ["torch", "eager", "tf"]:
if fw == "torch" and \
run in [dqn.DQNTrainer, dqn.SimpleQTrainer,
impala.ImpalaTrainer, sac.SACTrainer]:
run in [ddpg.DDPGTrainer, dqn.DQNTrainer, dqn.SimpleQTrainer,
impala.ImpalaTrainer, sac.SACTrainer, td3.TD3Trainer]:
continue
elif fw == "eager" and run in [ddpg.DDPGTrainer, td3.TD3Trainer]:
continue
print("Testing {} in framework={}".format(run, fw))
config["eager"] = (fw == "eager")
config["use_pytorch"] = (fw == "torch")
@ -38,9 +43,8 @@ def test_explorations(run,
# exploration class.
for exploration in [None, "Random"]:
if exploration == "Random":
# TODO(sven): Random doesn't work for cont. action spaces
# or IMPALA yet.
if env == "Pendulum-v0" or run is impala.ImpalaTrainer:
# TODO(sven): Random doesn't work for IMPALA yet.
if run is impala.ImpalaTrainer:
continue
config["exploration_config"] = {"type": "Random"}
print("exploration={}".format(exploration or "default"))
@ -108,6 +112,14 @@ class TestExplorations(unittest.TestCase):
np.array([0.0, 0.1, 0.0, 0.0]),
prev_a=np.array(1))
def test_ddpg(self):
test_explorations(
ddpg.DDPGTrainer,
"Pendulum-v0",
ddpg.DEFAULT_CONFIG,
np.array([0.0, 0.1, 0.0]),
expected_mean_action=0.0)
def test_simple_dqn(self):
test_explorations(dqn.SimpleQTrainer, "CartPole-v0",
dqn.DEFAULT_CONFIG, np.array([0.0, 0.1, 0.0, 0.0]))
@ -157,6 +169,14 @@ class TestExplorations(unittest.TestCase):
np.array([0.0, 0.1, 0.0]),
expected_mean_action=0.0)
def test_td3(self):
test_explorations(
td3.TD3Trainer,
"Pendulum-v0",
td3.TD3_DEFAULT_CONFIG,
np.array([0.0, 0.1, 0.0]),
expected_mean_action=0.0)
if __name__ == "__main__":
unittest.main(verbosity=2)

View file

@ -176,7 +176,9 @@ class ModelSupportedSpaces(unittest.TestCase):
def test_ddpg(self):
check_support(
"DDPG", {
"exploration_ou_noise_scale": 100.0,
"exploration_config": {
"ou_base_scale": 100.0
},
"timesteps_per_iteration": 1,
"use_state_preprocessor": True,
},

View file

@ -15,14 +15,15 @@ halfcheetah-ddpg:
env_config: {}
# === Exploration ===
exploration_should_anneal: True
schedule_max_timesteps: 100000
exploration_config:
initial_scale: 1.0
final_scale: 0.02
scale_timesteps: 10000
ou_base_scale: 0.1
ou_theta: 0.15
ou_sigma: 0.2
timesteps_per_iteration: 1000
exploration_fraction: 0.1
exploration_final_scale: 0.02
exploration_ou_noise_scale: 0.1
exploration_ou_theta: 0.15
exploration_ou_sigma: 0.2
target_network_update_freq: 0
tau: 0.001
@ -47,7 +48,6 @@ halfcheetah-ddpg:
# === Parallelism ===
num_workers: 0
num_gpus_per_worker: 0
per_worker_exploration: False
worker_side_prioritization: False
# === Evaluation ===

View file

@ -15,7 +15,8 @@ invertedpendulum-td3:
# === Exploration ===
learning_starts: 1000
pure_exploration_steps: 1000
exploration_config:
random_timesteps: 1000
# === Evaluation ===
evaluation_interval: 1

View file

@ -7,7 +7,8 @@ mountaincarcontinuous-apex-ddpg:
config:
clip_rewards: False
num_workers: 16
exploration_ou_noise_scale: 1.0
exploration_config:
ou_base_scale: 1.0
n_step: 3
target_network_update_freq: 50000
tau: 1.0

View file

@ -15,14 +15,16 @@ mountaincarcontinuous-ddpg:
env_config: {}
# === Exploration ===
exploration_should_anneal: True
schedule_max_timesteps: 100000
exploration_config:
initial_scale: 1.0
final_scale: 0.02
scale_timesteps: 40000
ou_base_scale: 0.75
ou_theta: 0.15
ou_sigma: 0.2
timesteps_per_iteration: 1000
exploration_fraction: 0.4
exploration_final_scale: 0.02
exploration_ou_noise_scale: 0.75
exploration_ou_theta: 0.15
exploration_ou_sigma: 0.2
target_network_update_freq: 0
tau: 0.01

View file

@ -17,7 +17,8 @@ mujoco-td3:
config:
# === Exploration ===
learning_starts: 10000
pure_exploration_steps: 10000
exploration_config:
random_timesteps: 10000
# === Evaluation ===
evaluation_interval: 5

View file

@ -15,14 +15,16 @@ pendulum-ddpg:
env_config: {}
# === Exploration ===
exploration_should_anneal: True
schedule_max_timesteps: 100000
exploration_config:
type: "OrnsteinUhlenbeckNoise"
scale_timesteps: 10000
initial_scale: 1.0
final_scale: 0.02
ou_base_scale: 0.1
ou_theta: 0.15
ou_sigma: 0.2
timesteps_per_iteration: 600
exploration_fraction: 0.1
exploration_final_scale: 0.02
exploration_ou_noise_scale: 0.1
exploration_ou_theta: 0.15
exploration_ou_sigma: 0.2
target_network_update_freq: 0
tau: 0.001
@ -47,7 +49,7 @@ pendulum-ddpg:
# === Parallelism ===
num_workers: 0
num_gpus_per_worker: 0
per_worker_exploration: False
#per_worker_exploration: False
worker_side_prioritization: False
# === Evaluation ===
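As a reference for the old-to-new key mapping in the hunks above, a minimal Python sketch of the equivalent trainer config dict; the per-key mapping comments are inferred from the removed YAML keys and are illustrative only:

import ray.rllib.agents.ddpg as ddpg

config = ddpg.DEFAULT_CONFIG.copy()
# The old flat keys (exploration_ou_noise_scale, exploration_ou_theta, ...)
# are now grouped under a single `exploration_config` dict.
config["exploration_config"] = {
    "type": "OrnsteinUhlenbeckNoise",
    "scale_timesteps": 10000,  # roughly exploration_fraction * schedule_max_timesteps
    "initial_scale": 1.0,
    "final_scale": 0.02,       # was: exploration_final_scale
    "ou_base_scale": 0.1,      # was: exploration_ou_noise_scale
    "ou_theta": 0.15,          # was: exploration_ou_theta
    "ou_sigma": 0.2,           # was: exploration_ou_sigma
}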

View file

@ -12,7 +12,8 @@ pendulum-ddpg:
# === Exploration ===
learning_starts: 5000
pure_exploration_steps: 5000
exploration_config:
random_timesteps: 5000
# === Evaluation ===
evaluation_interval: 1

View file

@ -7,4 +7,3 @@ pendulum-ddpg:
config:
use_huber: True
clip_rewards: False
exploration_fraction: 0.1

View file

@ -0,0 +1,6 @@
pendulum-td3:
env: Pendulum-v0
run: TD3
stop:
episode_reward_mean: -900
timesteps_total: 100000

View file

@ -1,7 +1,14 @@
from ray.rllib.utils.exploration.exploration import Exploration
from ray.rllib.utils.exploration.epsilon_greedy import EpsilonGreedy
from ray.rllib.utils.exploration.gaussian_noise import GaussianNoise
from ray.rllib.utils.exploration.ornstein_uhlenbeck_noise import \
OrnsteinUhlenbeckNoise
from ray.rllib.utils.exploration.per_worker_epsilon_greedy import \
PerWorkerEpsilonGreedy
from ray.rllib.utils.exploration.per_worker_gaussian_noise import \
PerWorkerGaussianNoise
from ray.rllib.utils.exploration.per_worker_ornstein_uhlenbeck_noise import \
PerWorkerOrnsteinUhlenbeckNoise
from ray.rllib.utils.exploration.random import Random
from ray.rllib.utils.exploration.soft_q import SoftQ
from ray.rllib.utils.exploration.stochastic_sampling import \
@ -10,7 +17,11 @@ from ray.rllib.utils.exploration.stochastic_sampling import \
__all__ = [
"Exploration",
"EpsilonGreedy",
"GaussianNoise",
"OrnsteinUhlenbeckNoise",
"PerWorkerEpsilonGreedy",
"PerWorkerGaussianNoise",
"PerWorkerOrnsteinUhlenbeckNoise",
"Random",
"SoftQ",
"StochasticSampling",

View file

@ -23,10 +23,9 @@ class EpsilonGreedy(Exploration):
initial_epsilon=1.0,
final_epsilon=0.05,
epsilon_timesteps=int(1e5),
num_workers=None,
worker_index=None,
epsilon_schedule=None,
framework="tf"):
framework="tf",
**kwargs):
"""
Args:
@ -35,21 +34,13 @@ class EpsilonGreedy(Exploration):
final_epsilon (float): The final epsilon value to use.
epsilon_timesteps (int): The time step after which epsilon should
always be `final_epsilon`.
num_workers (Optional[int]): The overall number of workers used.
worker_index (Optional[int]): The index of the Worker using this
Exploration.
epsilon_schedule (Optional[Schedule]): An optional Schedule object
to use (instead of constructing one from the given parameters).
framework (Optional[str]): One of None, "tf", "torch".
"""
# For now, require Discrete action space (may loosen this restriction
# in the future).
assert framework is not None
super().__init__(
action_space=action_space,
num_workers=num_workers,
worker_index=worker_index,
framework=framework)
action_space=action_space, framework=framework, **kwargs)
self.epsilon_schedule = epsilon_schedule or PiecewiseSchedule(
endpoints=[(0, initial_epsilon),
@ -85,16 +76,15 @@ class EpsilonGreedy(Exploration):
Returns:
tf.Tensor: The tf exploration-action op.
"""
epsilon = tf.convert_to_tensor(
self.epsilon_schedule(timestep if timestep is not None else
self.last_timestep))
epsilon = self.epsilon_schedule(timestep if timestep is not None else
self.last_timestep)
# Get the exploit action as the one with the highest logit value.
exploit_action = tf.argmax(q_values, axis=1)
batch_size = tf.shape(q_values)[0]
# Mask out actions with q-value=-inf so that we don't
# even consider them for exploration.
# Mask out actions with q-value=-inf so that we don't even consider
# them for exploration.
random_valid_action_logits = tf.where(
tf.equal(q_values, tf.float32.min),
tf.ones_like(q_values) * tf.float32.min, tf.ones_like(q_values))
@ -130,7 +120,7 @@ class EpsilonGreedy(Exploration):
Returns:
torch.Tensor: The exploration-action.
"""
# Set last time step or (if not given) increase by one.
# Set last timestep or (if not given) increase by one.
self.last_timestep = timestep if timestep is not None else \
self.last_timestep + 1
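For reference, the logic shared by the two code paths above (annealed epsilon, masking of -inf q-values, argmax exploit) can be written framework-free; this NumPy sketch is illustrative and mirrors, not replaces, the implementation above:

import numpy as np

def annealed_epsilon(t, initial=1.0, final=0.05, epsilon_timesteps=int(1e5)):
    # Linear anneal from `initial` to `final`, then stay at `final`.
    frac = min(t / float(epsilon_timesteps), 1.0)
    return initial + frac * (final - initial)

def epsilon_greedy(q_values, epsilon, rng=np.random):
    # q_values: [batch, num_actions]; returns one action index per row.
    exploit_actions = q_values.argmax(axis=1)
    # Actions whose q-value equals float32-min are masked out and never
    # picked, not even randomly.
    valid = q_values > np.finfo(np.float32).min
    random_actions = np.array(
        [rng.choice(np.flatnonzero(row)) for row in valid])
    use_random = rng.random(q_values.shape[0]) < epsilon
    return np.where(use_random, random_actions, exploit_actions)

q = np.array([[1.0, 2.0, np.finfo(np.float32).min]], dtype=np.float32)
print(epsilon_greedy(q, epsilon=annealed_epsilon(50000)))  # -> [1] or [0], never [2]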

View file

@ -13,6 +13,7 @@ class Exploration:
def __init__(self,
action_space=None,
*,
num_workers=None,
worker_index=None,
framework="tf"):

View file

@ -0,0 +1,165 @@
from ray.rllib.utils.annotations import override
from ray.rllib.utils.exploration.exploration import Exploration
from ray.rllib.utils.exploration.random import Random
from ray.rllib.utils.framework import try_import_tf, try_import_torch, \
get_variable
from ray.rllib.utils.schedules.piecewise_schedule import PiecewiseSchedule
tf = try_import_tf()
torch, _ = try_import_torch()
class GaussianNoise(Exploration):
"""An exploration that adds white noise to continuous actions.
If explore=True, returns the deterministic action plus `scale` (annealed
over time) times Gaussian noise. Optionally, a completely random phase can
precede this for the first `random_timesteps` timesteps.
If explore=False, returns the deterministic action.
"""
def __init__(self,
action_space,
*,
random_timesteps=1000,
stddev=0.1,
initial_scale=1.0,
final_scale=0.02,
scale_timesteps=10000,
scale_schedule=None,
framework="tf",
**kwargs):
"""Initializes a GaussianNoise Exploration object.
Args:
action_space (Space): The gym action space used by the environment.
random_timesteps (int): The number of timesteps for which to act
completely randomly. Only after this number of timesteps, the
`self.scale` annealing process will start (see below).
stddev (float): The stddev (sigma) to use for the
Gaussian noise to be added to the actions.
initial_scale (float): The initial scaling weight to multiply
the noise with.
final_scale (float): The final scaling weight to multiply
the noise with.
scale_timesteps (int): The timesteps over which to linearly anneal
the scaling factor (after(!) having used random actions for
`random_timesteps` steps).
scale_schedule (Optional[Schedule]): An optional Schedule object
to use (instead of constructing one from the given parameters).
framework (Optional[str]): One of None, "tf", "torch".
"""
assert framework is not None
super().__init__(action_space, framework=framework, **kwargs)
self.random_timesteps = random_timesteps
self.random_exploration = Random(
action_space, framework=self.framework)
self.stddev = stddev
# The `scale` annealing schedule.
self.scale_schedule = scale_schedule or PiecewiseSchedule(
endpoints=[(random_timesteps, initial_scale),
(random_timesteps + scale_timesteps, final_scale)],
outside_value=final_scale,
framework=self.framework)
# The current timestep value (tf-var or python int).
self.last_timestep = get_variable(
0, framework=self.framework, tf_name="timestep")
@override(Exploration)
def get_exploration_action(self,
distribution_inputs,
action_dist_class,
model=None,
explore=True,
timestep=None):
# Adds IID Gaussian noise for exploration, TD3-style.
action_dist = action_dist_class(distribution_inputs, model)
if self.framework == "torch":
return self._get_torch_exploration_action(action_dist, explore,
timestep)
else:
return self._get_tf_exploration_action_op(action_dist, explore,
timestep)
def _get_tf_exploration_action_op(self, action_dist, explore, timestep):
ts = timestep if timestep is not None else self.last_timestep
# The deterministic actions (if explore=False).
deterministic_actions = action_dist.deterministic_sample()
# Take a Gaussian sample with our stddev (mean=0.0) and scale it.
gaussian_sample = self.scale_schedule(ts) * tf.random_normal(
tf.shape(deterministic_actions), stddev=self.stddev)
# Stochastic actions could either be: random OR action + noise.
random_actions, _ = \
self.random_exploration.get_tf_exploration_action_op(
action_dist, explore)
stochastic_actions = tf.cond(
pred=ts <= self.random_timesteps,
true_fn=lambda: random_actions,
false_fn=lambda: tf.clip_by_value(
deterministic_actions + gaussian_sample,
self.action_space.low * tf.ones_like(deterministic_actions),
self.action_space.high * tf.ones_like(deterministic_actions))
)
# Choose by `explore` (the main exploration switch).
batch_size = tf.shape(deterministic_actions)[0]
action = tf.cond(
pred=tf.constant(explore, dtype=tf.bool)
if isinstance(explore, bool) else explore,
true_fn=lambda: stochastic_actions,
false_fn=lambda: deterministic_actions)
# Logp=always zero.
logp = tf.zeros(shape=(batch_size, ), dtype=tf.float32)
# Increment `last_timestep` by 1 (or set to `timestep`).
assign_op = \
tf.assign_add(self.last_timestep, 1) if timestep is None else \
tf.assign(self.last_timestep, timestep)
with tf.control_dependencies([assign_op]):
return action, logp
def _get_torch_exploration_action(self, action_dist, explore, timestep):
# Set last timestep or (if not given) increase by one.
self.last_timestep = timestep if timestep is not None else \
self.last_timestep + 1
# Apply exploration.
if explore:
# Random exploration phase.
if self.last_timestep <= self.random_timesteps:
action, _ = \
self.random_exploration.get_torch_exploration_action(
action_dist, True)
# Take a Gaussian sample with our stddev (mean=0.0) and scale it.
else:
det_actions = action_dist.deterministic_sample()
scale = self.scale_schedule(self.last_timestep)
gaussian_sample = scale * torch.normal(
mean=0.0, std=self.stddev, size=det_actions.size())
action = torch.clamp(
det_actions + gaussian_sample,
self.action_space.low * torch.ones_like(det_actions),
self.action_space.high * torch.ones_like(det_actions))
# No exploration -> Return deterministic actions.
else:
action = action_dist.deterministic_sample()
# Logp=always zero.
logp = torch.zeros((action.size()[0], ), dtype=torch.float32)
return action, logp
@override(Exploration)
def get_info(self):
"""Returns the current scale value.
Returns:
Union[float,tf.Tensor[float]]: The current scale value.
"""
return self.scale_schedule(self.last_timestep)
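As a framework-free reference for the behavior implemented above (pure random phase, then annealed Gaussian noise around the deterministic action, clipped to the action bounds), a short NumPy sketch; parameter names follow the constructor above and the values are illustrative:

import numpy as np

def gaussian_noise_action(det_action, t, low, high,
                          random_timesteps=1000, stddev=0.1,
                          initial_scale=1.0, final_scale=0.02,
                          scale_timesteps=10000, rng=np.random):
    # Mirror of the explore=True branch above, in plain NumPy.
    det_action = np.asarray(det_action, dtype=np.float32)
    if t <= random_timesteps:
        # Purely random phase at the beginning.
        return rng.uniform(low, high, size=det_action.shape)
    # Linearly anneal `scale` from initial_scale to final_scale.
    frac = min((t - random_timesteps) / float(scale_timesteps), 1.0)
    scale = initial_scale + frac * (final_scale - initial_scale)
    noise = scale * rng.normal(0.0, stddev, size=det_action.shape)
    return np.clip(det_action + noise, low, high)

# Example with a Pendulum-like 1-D action space [-2, 2].
print(gaussian_noise_action([0.5], t=5000, low=-2.0, high=2.0))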

View file

@ -0,0 +1,165 @@
from ray.rllib.utils.annotations import override
from ray.rllib.utils.exploration.gaussian_noise import GaussianNoise
from ray.rllib.utils.framework import try_import_tf, try_import_torch, \
get_variable
tf = try_import_tf()
torch, _ = try_import_torch()
class OrnsteinUhlenbeckNoise(GaussianNoise):
"""An exploration that adds Ornstein-Uhlenbeck noise to continuous actions.
If explore=True, returns the deterministic action plus a noise term derived
from an OU state X, which evolves according to:
X_{t+1} = (1 - theta) * X_t + sigma * N(0, stddev), where theta, sigma, and
stddev are constants. Optionally, a completely random phase can precede this
for the first `random_timesteps` timesteps.
If explore=False, returns the deterministic action.
"""
def __init__(self,
action_space,
*,
ou_theta=0.15,
ou_sigma=0.2,
ou_base_scale=0.1,
random_timesteps=1000,
initial_scale=1.0,
final_scale=0.02,
scale_timesteps=10000,
scale_schedule=None,
framework="tf",
**kwargs):
"""Initializes an Ornstein-Uhlenbeck Exploration object.
Args:
action_space (Space): The gym action space used by the environment.
ou_theta (float): The theta parameter of the Ornstein-Uhlenbeck
process.
ou_sigma (float): The sigma parameter of the Ornstein-Uhlenbeck
process.
ou_base_scale (float): A fixed scaling factor, by which all OU-
noise is multiplied. NOTE: This is on top of the parent
GaussianNoise's scaling.
random_timesteps (int): The number of timesteps for which to act
completely randomly. Only after this number of timesteps, the
`self.scale` annealing process will start (see below).
initial_scale (float): The initial scaling weight to multiply
the noise with.
final_scale (float): The final scaling weight to multiply
the noise with.
scale_timesteps (int): The timesteps over which to linearly anneal
the scaling factor (after(!) having used random actions for
`random_timesteps` steps).
scale_schedule (Optional[Schedule]): An optional Schedule object
to use (instead of constructing one from the given parameters).
framework (Optional[str]): One of None, "tf", "torch".
"""
super().__init__(
action_space,
random_timesteps=random_timesteps,
initial_scale=initial_scale,
final_scale=final_scale,
scale_timesteps=scale_timesteps,
scale_schedule=scale_schedule,
stddev=1.0, # Force `self.stddev` to 1.0.
framework=framework,
**kwargs)
self.ou_theta = ou_theta
self.ou_sigma = ou_sigma
self.ou_base_scale = ou_base_scale
# The current OU-state value (gets updated each time an exploration
# action is computed).
self.ou_state = get_variable(
self.action_space.low.size * [.0],
framework=self.framework,
tf_name="ou_state")
@override(GaussianNoise)
def _get_tf_exploration_action_op(self, action_dist, explore, timestep):
ts = timestep if timestep is not None else self.last_timestep
scale = self.scale_schedule(ts)
# The deterministic actions (if explore=False).
deterministic_actions = action_dist.deterministic_sample()
# Apply base-scaled and time-annealed scaled OU-noise to
# deterministic actions.
gaussian_sample = tf.random_normal(
shape=[self.action_space.low.size], stddev=self.stddev)
ou_new = self.ou_theta * -self.ou_state + \
self.ou_sigma * gaussian_sample
ou_state_new = tf.assign_add(self.ou_state, ou_new)
noise = scale * self.ou_base_scale * ou_state_new * \
(self.action_space.high - self.action_space.low)
stochastic_actions = tf.clip_by_value(
deterministic_actions + noise,
self.action_space.low * tf.ones_like(deterministic_actions),
self.action_space.high * tf.ones_like(deterministic_actions))
# Stochastic actions could either be: random OR action + noise.
random_actions, _ = \
self.random_exploration.get_tf_exploration_action_op(
action_dist, explore)
exploration_actions = tf.cond(
pred=ts <= self.random_timesteps,
true_fn=lambda: random_actions,
false_fn=lambda: stochastic_actions)
# Choose by `explore` (the main exploration switch).
action = tf.cond(
pred=tf.constant(explore, dtype=tf.bool)
if isinstance(explore, bool) else explore,
true_fn=lambda: exploration_actions,
false_fn=lambda: deterministic_actions)
# Logp=always zero.
batch_size = tf.shape(deterministic_actions)[0]
logp = tf.zeros(shape=(batch_size, ), dtype=tf.float32)
# Increment `last_timestep` by 1 (or set to `timestep`).
assign_op = \
tf.assign_add(self.last_timestep, 1) if timestep is None else \
tf.assign(self.last_timestep, timestep)
with tf.control_dependencies([assign_op, ou_state_new]):
return action, logp
@override(GaussianNoise)
def _get_torch_exploration_action(self, action_dist, explore, timestep):
# Set last timestep or (if not given) increase by one.
self.last_timestep = timestep if timestep is not None else \
self.last_timestep + 1
# Apply exploration.
if explore:
# Random exploration phase.
if self.last_timestep <= self.random_timesteps:
action, _ = self.random_exploration.get_torch_exploration_action(
action_dist, True)
# Apply base-scaled and time-annealed scaled OU-noise to
# deterministic actions.
else:
det_actions = action_dist.deterministic_sample()
scale = self.scale_schedule(self.last_timestep)
gaussian_sample = scale * torch.normal(
mean=0.0, std=1.0, size=det_actions.size())
ou_new = self.ou_theta * -self.ou_state + \
self.ou_sigma * gaussian_sample
self.ou_state += ou_new
noise = scale * self.ou_base_scale * self.ou_state * \
(self.action_space.high - self.action_space.low)
action = torch.clamp(
det_actions + noise,
self.action_space.low * torch.ones_like(det_actions),
self.action_space.high * torch.ones_like(det_actions))
# No exploration -> Return deterministic actions.
else:
action = action_dist.deterministic_sample()
# Logp=always zero.
logp = torch.zeros((action.size()[0], ), dtype=torch.float32)
return action, logp
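To make the recursion above concrete, a plain-Python sketch of the OU state update and of how the state is turned into action noise (scaled by the annealed `scale`, `ou_base_scale`, and the action range); the values are illustrative:

import numpy as np

def ou_step(ou_state, theta=0.15, sigma=0.2, rng=np.random):
    # X_{t+1} = (1 - theta) * X_t + sigma * N(0, 1)
    return ou_state + theta * -ou_state + sigma * rng.normal(size=ou_state.shape)

def ou_action_noise(ou_state, scale, ou_base_scale, low, high):
    # The noise actually added to the deterministic action before clipping.
    return scale * ou_base_scale * ou_state * (high - low)

state = np.zeros(1)  # 1-D action space, e.g. Pendulum-v0's [-2, 2]
low, high = np.array([-2.0]), np.array([2.0])
for _ in range(3):
    state = ou_step(state)
    print(ou_action_noise(state, scale=1.0, ou_base_scale=0.1, low=low, high=high))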

View file

@ -12,19 +12,14 @@ class PerWorkerEpsilonGreedy(EpsilonGreedy):
def __init__(self,
action_space,
initial_epsilon=1.0,
final_epsilon=0.1,
epsilon_timesteps=int(1e5),
*,
num_workers=0,
worker_index=0,
framework="tf"):
framework="tf",
**kwargs):
"""
Args:
action_space (Space): The gym action space used by the environment.
initial_epsilon (float): The initial epsilon value to use.
final_epsilon (float): The final epsilon value to use.
epsilon_timesteps (int): The time step after which epsilon should
always be `final_epsilon`.
num_workers (Optional[int]): The overall number of workers used.
worker_index (Optional[int]): The index of the Worker using this
Exploration.
@ -42,11 +37,7 @@ class PerWorkerEpsilonGreedy(EpsilonGreedy):
epsilon_schedule = ConstantSchedule(0.0)
super().__init__(
action_space=action_space,
initial_epsilon=initial_epsilon,
final_epsilon=final_epsilon,
epsilon_timesteps=epsilon_timesteps,
num_workers=num_workers,
worker_index=worker_index,
action_space,
epsilon_schedule=epsilon_schedule,
framework=framework,
epsilon_schedule=epsilon_schedule)
**kwargs)

View file

@ -0,0 +1,43 @@
from ray.rllib.utils.exploration.gaussian_noise import GaussianNoise
from ray.rllib.utils.schedules import ConstantSchedule
class PerWorkerGaussianNoise(GaussianNoise):
"""A per-worker Gaussian noise class for distributed algorithms.
Sets the `scale` schedules of individual workers to a constant:
0.4 ^ (1 + [worker-index] / float([num-workers] - 1) * 7)
See Ape-X paper.
"""
def __init__(self,
action_space,
*,
num_workers=0,
worker_index=0,
framework="tf",
**kwargs):
"""
Args:
action_space (Space): The gym action space used by the environment.
num_workers (Optional[int]): The overall number of workers used.
worker_index (Optional[int]): The index of the Worker using this
Exploration.
framework (Optional[str]): One of None, "tf", "torch".
"""
scale_schedule = None
# Use a fixed, different scale per worker. See: Ape-X paper.
if num_workers > 0:
if worker_index >= 0:
exponent = (1 + worker_index / float(num_workers - 1) * 7)
scale_schedule = ConstantSchedule(0.4**exponent)
# Local worker should have zero exploration so that eval
# rollouts run properly.
else:
scale_schedule = ConstantSchedule(0.0)
super().__init__(
action_space,
scale_schedule=scale_schedule,
framework=framework,
**kwargs)
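The per-worker constant derived above spreads exploration geometrically across workers (Ape-X style); a small sketch of the resulting scales, mirroring the branching above, for an assumed 8-worker setup:

def per_worker_scale(worker_index, num_workers):
    # Mirrors the scale_schedule selection in the constructor above.
    if num_workers <= 0:
        return None  # keep the default (annealed) scale schedule
    if worker_index >= 0:
        exponent = 1 + worker_index / float(num_workers - 1) * 7
        return 0.4 ** exponent
    return 0.0  # zero exploration (evaluation rollouts)

print([round(per_worker_scale(i, 8), 4) for i in range(8)])
# -> [0.4, 0.16, 0.064, 0.0256, 0.0102, 0.0041, 0.0016, 0.0007]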

View file

@ -0,0 +1,44 @@
from ray.rllib.utils.exploration.ornstein_uhlenbeck_noise import \
OrnsteinUhlenbeckNoise
from ray.rllib.utils.schedules import ConstantSchedule
class PerWorkerOrnsteinUhlenbeckNoise(OrnsteinUhlenbeckNoise):
"""A per-worker Ornstein Uhlenbeck noise class for distributed algorithms.
Sets the Gaussian `scale` schedules of individual workers to a constant:
0.4 ^ (1 + [worker-index] / float([num-workers] - 1) * 7)
See Ape-X paper.
"""
def __init__(self,
action_space,
*,
num_workers=0,
worker_index=0,
framework="tf",
**kwargs):
"""
Args:
action_space (Space): The gym action space used by the environment.
num_workers (Optional[int]): The overall number of workers used.
worker_index (Optional[int]): The index of the Worker using this
Exploration.
framework (Optional[str]): One of None, "tf", "torch".
"""
scale_schedule = None
# Use a fixed, different scale per worker. See: Ape-X paper.
if num_workers > 0:
if worker_index >= 0:
exponent = (1 + worker_index / float(num_workers - 1) * 7)
scale_schedule = ConstantSchedule(0.4**exponent)
# Local worker should have zero exploration so that eval
# rollouts run properly.
else:
scale_schedule = ConstantSchedule(0.0)
super().__init__(
action_space,
scale_schedule=scale_schedule,
framework=framework,
**kwargs)

View file

@ -1,4 +1,4 @@
from gym.spaces import Discrete
from gym.spaces import Discrete, MultiDiscrete, Tuple
from ray.rllib.utils.annotations import override
from ray.rllib.utils.exploration.exploration import Exploration
@ -18,17 +18,24 @@ class Random(Exploration):
If explore=False, returns the greedy/max-likelihood action.
"""
def __init__(self, action_space, framework="tf", **kwargs):
def __init__(self, action_space, *, framework="tf", **kwargs):
"""Initialize a Random Exploration object.
Args:
action_space (Space): The gym action space used by the environment.
framework (Optional[str]): One of None, "tf", "torch".
"""
assert isinstance(action_space, Discrete)
super().__init__(
action_space=action_space, framework=framework, **kwargs)
# Determine py_func types, depending on our action-space.
if isinstance(self.action_space, (Discrete, MultiDiscrete)) or \
(isinstance(self.action_space, Tuple) and
isinstance(self.action_space[0], (Discrete, MultiDiscrete))):
self.dtype_sample, self.dtype = (tf.int64, tf.int32)
else:
self.dtype_sample, self.dtype = (tf.float64, tf.float32)
@override(Exploration)
def get_exploration_action(self,
distribution_inputs,
@ -38,23 +45,22 @@ class Random(Exploration):
timestep=None):
# Instantiate the distribution object.
action_dist = action_dist_class(distribution_inputs, model)
if self.framework == "tf":
return self._get_tf_exploration_action_op(action_dist, explore,
timestep)
return self.get_tf_exploration_action_op(action_dist, explore)
else:
return self._get_torch_exploration_action(action_dist, explore,
timestep)
return self.get_torch_exploration_action(action_dist, explore)
@tf_function(tf)
def _get_tf_exploration_action_op(self, action_dist, explore, timestep):
def get_tf_exploration_action_op(self, action_dist, explore):
if explore:
action = tf.py_function(self.action_space.sample, [], tf.int64)
action = tf.py_function(self.action_space.sample, [],
self.dtype_sample)
# Will be unnecessary, once we support batch/time-aware Spaces.
action = tf.expand_dims(tf.cast(action, dtype=tf.int32), 0)
action = tf.expand_dims(tf.cast(action, dtype=self.dtype), 0)
else:
action = tf.cast(
action_dist.deterministic_sample(), dtype=tf.int32)
action_dist.deterministic_sample(), dtype=self.dtype)
# TODO(sven): Move into (deterministic_)sample(logp=True|False)
if isinstance(action, TupleActions):
batch_size = tf.shape(action[0][0])[0]
@ -63,12 +69,15 @@ class Random(Exploration):
logp = tf.zeros(shape=(batch_size, ), dtype=tf.float32)
return action, logp
def _get_torch_exploration_action(self, action_dist, explore, timestep):
def get_torch_exploration_action(self, action_dist, explore):
tensor_fn = torch.LongTensor if \
type(self.action_space) in [Discrete, MultiDiscrete] else \
torch.FloatTensor
if explore:
# Unsqueeze will be unnecessary, once we support batch/time-aware
# Spaces.
action = torch.LongTensor(self.action_space.sample()).unsqueeze(0)
action = tensor_fn(self.action_space.sample()).unsqueeze(0)
else:
action = torch.LongTensor(action_dist.deterministic_sample())
action = tensor_fn(action_dist.deterministic_sample())
logp = torch.zeros((action.size()[0], ), dtype=torch.float32)
return action, logp
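The dtype branching above only distinguishes integer-valued spaces (Discrete/MultiDiscrete, or Tuples of them) from float-valued ones; a tiny gym sketch of the two sample types (assuming gym is installed):

from gym.spaces import Box, Discrete

print(type(Discrete(4).sample()))                  # integer sample -> int64/int32 path
print(Box(-1.0, 1.0, shape=(3,)).sample().dtype)   # float32 array -> float64/float32 path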

View file

@ -10,7 +10,11 @@ class SoftQ(StochasticSampling):
output divided by the temperature. Returns the argmax iff explore=False.
"""
def __init__(self, action_space, temperature=1.0, framework="tf",
def __init__(self,
action_space,
*,
temperature=1.0,
framework="tf",
**kwargs):
"""Initializes a SoftQ Exploration object.
@ -19,11 +23,10 @@ class SoftQ(StochasticSampling):
temperature (Schedule): The temperature to divide model outputs by
before creating the Categorical distribution to sample from.
framework (Optional[str]): One of None, "tf", "torch".
kwargs (dict): Passed on to super constructor.
"""
assert isinstance(action_space, Discrete)
super().__init__(
action_space=action_space,
action_space,
static_params=dict(temperature=temperature),
framework=framework,
**kwargs)
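A compact NumPy illustration of what temperature-scaled soft-Q sampling does to the action distribution; the logits and temperature values here are made up:

import numpy as np

def soft_q_probs(q_values, temperature=1.0):
    # Softmax over q_values / temperature (the explore=True path).
    z = np.asarray(q_values, dtype=np.float64) / temperature
    z -= z.max()  # numerical stability
    e = np.exp(z)
    return e / e.sum()

q = [1.0, 2.0, 3.0]
print(soft_q_probs(q, temperature=1.0))   # clearly favors the best action
print(soft_q_probs(q, temperature=10.0))  # nearly uniform
print(int(np.argmax(q)))                  # explore=False: plain argmax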

View file

@ -18,24 +18,24 @@ class StochasticSampling(Exploration):
def __init__(self,
action_space,
framework="tf",
*,
static_params=None,
time_dependent_params=None,
framework="tf",
**kwargs):
"""Initializes a StochasticSampling Exploration object.
Args:
action_space (Space): The gym action space used by the environment.
framework (Optional[str]): One of None, "tf", "torch".
static_params (Optional[dict]): Parameters to be passed as-is into
the action distribution class' constructor.
time_dependent_params (dict): Parameters to be evaluated based on
`timestep` and then passed into the action distribution
class' constructor.
framework (Optional[str]): One of None, "tf", "torch".
"""
assert framework is not None
super().__init__(
action_space=action_space, framework=framework, **kwargs)
super().__init__(action_space, framework=framework, **kwargs)
self.static_params = static_params or {}

View file

@ -47,6 +47,6 @@ class PiecewiseSchedule(Schedule):
alpha = float(t - l_t) / (r_t - l_t)
return self.interpolation(l, r, alpha)
# t does not belong to any of the pieces, so doom.
# t does not belong to any of the pieces, return `self.outside_value`.
assert self.outside_value is not None
return self.outside_value
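For reference, a self-contained sketch of the lookup above under the usual linear-interpolation assumption; the endpoints and queried timesteps are illustrative:

def piecewise_value(t, endpoints, outside_value=None):
    # Linear interpolation between (t, value) endpoints, as sketched above.
    for (l_t, l), (r_t, r) in zip(endpoints[:-1], endpoints[1:]):
        if l_t <= t < r_t:
            alpha = float(t - l_t) / (r_t - l_t)
            return l + alpha * (r - l)
    # t does not belong to any of the pieces: fall back to `outside_value`.
    assert outside_value is not None
    return outside_value

# E.g. an epsilon schedule annealing from 1.0 to 0.05 over 100k timesteps.
endpoints = [(0, 1.0), (100000, 0.05)]
print(piecewise_value(50000, endpoints, outside_value=0.05))   # -> 0.525
print(piecewise_value(250000, endpoints, outside_value=0.05))  # -> 0.05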

View file

@ -39,11 +39,11 @@ class Schedule(metaclass=ABCMeta):
raise NotImplementedError
def value(self, t):
if self.framework == "tf" and tf.executing_eagerly() is False:
if self.framework == "tf":
return tf.cast(
tf.py_func(self._value, [t], tf.float64),
tf.py_function(self._value, [t], tf.float64),
tf.float32,
name="schedule-value")
name="schedule_value")
return self._value(t)
def __call__(self, t):