import numpy as np
import unittest

import ray
import ray.rllib.algorithms.ddpg as ddpg
import ray.rllib.algorithms.dqn as dqn
from ray.rllib.utils.test_utils import check, framework_iterator


class TestParameterNoise(unittest.TestCase):
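    """Tests ParameterNoise exploration with DQN and DDPG."""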

    @classmethod
    def setUpClass(cls) -> None:
        ray.init(num_cpus=4)

    @classmethod
    def tearDownClass(cls) -> None:
        ray.shutdown()

    def test_ddpg_parameter_noise(self):
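        """DDPG with ParameterNoise exploration on Pendulum-v1."""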
        self.do_test_parameter_noise_exploration(
            ddpg.DDPG,
            ddpg.DEFAULT_CONFIG,
            "Pendulum-v1",
            {},
            np.array([1.0, 0.0, -1.0]),
        )

    def test_dqn_parameter_noise(self):
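        """DQN with ParameterNoise exploration on FrozenLake-v1."""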
        self.do_test_parameter_noise_exploration(
            dqn.DQN,
            dqn.DEFAULT_CONFIG,
            "FrozenLake-v1",
            {"is_slippery": False, "map_name": "4x4"},
            np.array(0),
        )

    def do_test_parameter_noise_exploration(
        self, trainer_cls, config, env, env_config, obs
    ):
        """Tests whether an Agent works with ParameterNoise."""
        core_config = config.copy()
        core_config["num_workers"] = 0  # Run locally.
        core_config["env_config"] = env_config

        for fw in framework_iterator(core_config):
            config = core_config.copy()

            # Algo with ParameterNoise exploration (config["explore"]=True).
            # ----
            config["exploration_config"] = {"type": "ParameterNoise"}
            config["explore"] = True

            trainer = trainer_cls(config=config, env=env)
            policy = trainer.get_policy()
            pol_sess = policy.get_session()
            # Remove noise that has been added during policy initialization
            # (exploration.postprocess_trajectory does add noise to measure
            # the delta).
            policy.exploration._remove_noise(tf_sess=pol_sess)

            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            noise_before = self._get_current_noise(policy, fw)
            check(noise_before, 0.0)
            initial_weights = self._get_current_weight(policy, fw)

            # Pseudo-start an episode and compare the weights before and after.
            policy.exploration.on_episode_start(policy, tf_sess=pol_sess)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            noise_after_ep_start = self._get_current_noise(policy, fw)
            weights_after_ep_start = self._get_current_weight(policy, fw)
            # Should be the same, as we don't do anything at the beginning of
            # the episode, only one step later.
            check(noise_after_ep_start, noise_before)
            check(initial_weights, weights_after_ep_start)

            # Setting explore=False should always return the same action.
            a_ = trainer.compute_single_action(obs, explore=False)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            noise = self._get_current_noise(policy, fw)
            # We sampled the first noise (not zero anymore).
            check(noise, 0.0, false=True)
            # But still not applied b/c explore=False.
            check(self._get_current_weight(policy, fw), initial_weights)
            for _ in range(10):
                a = trainer.compute_single_action(obs, explore=False)
                check(a, a_)
                # Noise never gets applied.
                check(self._get_current_weight(policy, fw), initial_weights)
                self.assertFalse(policy.exploration.weights_are_currently_noisy)

            # Explore=None (default: True) should return different actions.
            # However, this is only due to the underlying sub-exploration
            # (epsilon-greedy for DQN, OU noise for DDPG).
            actions = []
            current_weight = None
            for _ in range(10):
                actions.append(trainer.compute_single_action(obs))
                self.assertTrue(policy.exploration.weights_are_currently_noisy)
                # Now, noise actually got applied (explore=True).
                current_weight = self._get_current_weight(policy, fw)
                check(current_weight, initial_weights, false=True)
                check(current_weight, initial_weights + noise)
            check(np.std(actions), 0.0, false=True)

            # Pseudo-end the episode and compare weights again.
            # Make sure they are the original ones.
            policy.exploration.on_episode_end(policy, tf_sess=pol_sess)
            weights_after_ep_end = self._get_current_weight(policy, fw)
            check(current_weight - noise, weights_after_ep_end, decimals=5)
            trainer.stop()

            # Algo with ParameterNoise exploration (config["explore"]=False).
            # ----
            config = core_config.copy()
            config["exploration_config"] = {"type": "ParameterNoise"}
            config["explore"] = False
            trainer = trainer_cls(config=config, env=env)
            policy = trainer.get_policy()
            pol_sess = policy.get_session()
            # Remove noise that has been added during policy initialization
            # (exploration.postprocess_trajectory does add noise to measure
            # the delta).
            policy.exploration._remove_noise(tf_sess=pol_sess)

            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            initial_weights = self._get_current_weight(policy, fw)

            # Noise before anything (should be 0.0, no episode started yet).
            noise = self._get_current_noise(policy, fw)
            check(noise, 0.0)

            # Pseudo-start an episode and compare the weights before and after
            # (they should be the same).
            policy.exploration.on_episode_start(policy, tf_sess=pol_sess)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)

            # Should be the same, as we don't do anything at the beginning of
            # the episode, only one step later.
            noise = self._get_current_noise(policy, fw)
            check(noise, 0.0)
            noisy_weights = self._get_current_weight(policy, fw)
            check(initial_weights, noisy_weights)

            # Setting explore=False or None should always return the same
            # action.
            a_ = trainer.compute_single_action(obs, explore=False)
            # Now we have re-sampled.
            noise = self._get_current_noise(policy, fw)
            check(noise, 0.0, false=True)
            for _ in range(5):
                a = trainer.compute_single_action(obs, explore=None)
                check(a, a_)
                a = trainer.compute_single_action(obs, explore=False)
                check(a, a_)

            # Pseudo-end the episode and compare weights again.
            # Make sure they are the original ones (no noise permanently
            # applied throughout the episode).
            policy.exploration.on_episode_end(policy, tf_sess=pol_sess)
            weights_after_episode_end = self._get_current_weight(policy, fw)
            check(initial_weights, weights_after_episode_end)
            # Noise should still be the same (re-sampling only happens at
            # beginning of episode).
            noise_after = self._get_current_noise(policy, fw)
            check(noise, noise_after)
            trainer.stop()

            # Switch off underlying exploration entirely.
            # ----
            config = core_config.copy()
            if trainer_cls is dqn.DQN:
                sub_config = {
                    "type": "EpsilonGreedy",
                    "initial_epsilon": 0.0,  # <- no randomness whatsoever
                    "final_epsilon": 0.0,
                }
            else:
                sub_config = {
                    "type": "OrnsteinUhlenbeckNoise",
                    "initial_scale": 0.0,  # <- no randomness whatsoever
                    "final_scale": 0.0,
                    "random_timesteps": 0,
                }
            config["exploration_config"] = {
                "type": "ParameterNoise",
                "sub_exploration": sub_config,
            }
            config["explore"] = True
            trainer = trainer_cls(config=config, env=env)
            # Now, when we act - even with explore=True - we would expect
            # the same action for the same input (parameter noise is
            # deterministic).
            policy = trainer.get_policy()
            # Re-fetch the session for the freshly created policy (the old
            # trainer has been stopped).
            pol_sess = policy.get_session()
            policy.exploration.on_episode_start(policy, tf_sess=pol_sess)
            a_ = trainer.compute_single_action(obs)
            for _ in range(10):
                a = trainer.compute_single_action(obs, explore=True)
                check(a, a_)
            trainer.stop()

    def _get_current_noise(self, policy, fw):
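        """Returns one scalar sample of the policy's current noise tensor."""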
        # If noise not even created yet, return 0.0.
        if policy.exploration.noise is None:
            return 0.0

        noise = policy.exploration.noise[0][0][0]
        if fw == "tf":
            noise = policy.get_session().run(noise)
        elif fw == "torch":
            noise = noise.detach().cpu().numpy()
        else:
            noise = noise.numpy()
        return noise

    def _get_current_weight(self, policy, fw):
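        """Returns one scalar weight from the model's first layer."""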
        weights = policy.get_weights()
        if fw == "torch":
            # DQN model.
            if "_hidden_layers.0._model.0.weight" in weights:
                return weights["_hidden_layers.0._model.0.weight"][0][0]
            # DDPG model.
            else:
                return weights["policy_model.action_0._model.0.weight"][0][0]
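        # For tf2/eager, `get_weights()` returns a list of arrays (index 0);
        # for graph-mode tf, a dict keyed by variable name (take first key).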
        key = 0 if fw in ["tf2", "tfe"] else list(weights.keys())[0]
        return weights[key][0][0]


if __name__ == "__main__":
    import pytest
    import sys

    sys.exit(pytest.main(["-v", __file__]))