import numpy as np
import unittest

import ray
import ray.rllib.algorithms.ddpg as ddpg
import ray.rllib.algorithms.dqn as dqn
from ray.rllib.utils.test_utils import check, framework_iterator


class TestParameterNoise(unittest.TestCase):
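    """Tests ParameterNoise exploration with DQN and DDPG."""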

    @classmethod
    def setUpClass(cls) -> None:
        ray.init(num_cpus=4)

    @classmethod
    def tearDownClass(cls) -> None:
        ray.shutdown()

    def test_ddpg_parameter_noise(self):
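        """DDPG with ParameterNoise exploration on Pendulum-v1."""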
        self.do_test_parameter_noise_exploration(
            ddpg.DDPG,
            ddpg.DEFAULT_CONFIG,
            "Pendulum-v1",
            {},
            np.array([1.0, 0.0, -1.0]),
        )

    def test_dqn_parameter_noise(self):
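        """DQN with ParameterNoise exploration on FrozenLake-v1."""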
        self.do_test_parameter_noise_exploration(
            dqn.DQN,
            dqn.DEFAULT_CONFIG,
            "FrozenLake-v1",
            {"is_slippery": False, "map_name": "4x4"},
            np.array(0),
        )

    def do_test_parameter_noise_exploration(
        self, trainer_cls, config, env, env_config, obs
    ):
        """Tests whether an Agent works with ParameterNoise."""
        core_config = config.copy()
        core_config["num_workers"] = 0  # Run locally.
        core_config["env_config"] = env_config

        for fw in framework_iterator(core_config):
            config = core_config.copy()

            # Algo with ParameterNoise exploration (config["explore"]=True).
            # ----
            config["exploration_config"] = {"type": "ParameterNoise"}
            config["explore"] = True

            trainer = trainer_cls(config=config, env=env)
            policy = trainer.get_policy()
            pol_sess = policy.get_session()
            # Remove noise that has been added during policy initialization
            # (exploration.postprocess_trajectory does add noise to measure
            # the delta).
            policy.exploration._remove_noise(tf_sess=pol_sess)

            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            noise_before = self._get_current_noise(policy, fw)
            check(noise_before, 0.0)
            initial_weights = self._get_current_weight(policy, fw)

            # Pseudo-start an episode and compare the weights before and after.
            policy.exploration.on_episode_start(policy, tf_sess=pol_sess)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            noise_after_ep_start = self._get_current_noise(policy, fw)
            weights_after_ep_start = self._get_current_weight(policy, fw)
            # Should be the same, as we don't do anything at the beginning of
            # the episode, only one step later.
            check(noise_after_ep_start, noise_before)
            check(initial_weights, weights_after_ep_start)

            # Setting explore=False should always return the same action.
            a_ = trainer.compute_single_action(obs, explore=False)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            noise = self._get_current_noise(policy, fw)
            # We sampled the first noise (not zero anymore).
            check(noise, 0.0, false=True)
            # But still not applied b/c explore=False.
            check(self._get_current_weight(policy, fw), initial_weights)
            for _ in range(10):
                a = trainer.compute_single_action(obs, explore=False)
                check(a, a_)
                # Noise never gets applied.
                check(self._get_current_weight(policy, fw), initial_weights)
                self.assertFalse(policy.exploration.weights_are_currently_noisy)

            # Explore=None (default: True) should return different actions.
            # However, this is only due to the underlying sub-exploration
            # (epsilon-greedy for DQN, OU noise for DDPG).
            actions = []
            current_weight = None
            for _ in range(10):
                actions.append(trainer.compute_single_action(obs))
                self.assertTrue(policy.exploration.weights_are_currently_noisy)
                # Now, noise actually got applied (explore=True).
                current_weight = self._get_current_weight(policy, fw)
                check(current_weight, initial_weights, false=True)
                check(current_weight, initial_weights + noise)
            check(np.std(actions), 0.0, false=True)

            # Pseudo-end the episode and compare weights again.
            # Make sure they are the original ones.
            policy.exploration.on_episode_end(policy, tf_sess=pol_sess)
            weights_after_ep_end = self._get_current_weight(policy, fw)
            check(current_weight - noise, weights_after_ep_end, decimals=5)
            trainer.stop()

            # Algo with ParameterNoise exploration (config["explore"]=False).
            # ----
            config = core_config.copy()
            config["exploration_config"] = {"type": "ParameterNoise"}
            config["explore"] = False
            trainer = trainer_cls(config=config, env=env)
            policy = trainer.get_policy()
            pol_sess = policy.get_session()
            # Remove noise that has been added during policy initialization
            # (exploration.postprocess_trajectory does add noise to measure
            # the delta).
            policy.exploration._remove_noise(tf_sess=pol_sess)

            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            initial_weights = self._get_current_weight(policy, fw)

            # Noise before anything (should be 0.0, no episode started yet).
            noise = self._get_current_noise(policy, fw)
            check(noise, 0.0)

            # Pseudo-start an episode and compare the weights before and after
            # (they should be the same).
            policy.exploration.on_episode_start(policy, tf_sess=pol_sess)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)

            # Should be the same, as we don't do anything at the beginning of
            # the episode, only one step later.
            noise = self._get_current_noise(policy, fw)
            check(noise, 0.0)
            noisy_weights = self._get_current_weight(policy, fw)
            check(initial_weights, noisy_weights)

            # Setting explore=False or None should always return the same
            # action.
            a_ = trainer.compute_single_action(obs, explore=False)
            # Now we have re-sampled.
            noise = self._get_current_noise(policy, fw)
            check(noise, 0.0, false=True)
            for _ in range(5):
                a = trainer.compute_single_action(obs, explore=None)
                check(a, a_)
                a = trainer.compute_single_action(obs, explore=False)
                check(a, a_)

            # Pseudo-end the episode and compare weights again.
            # Make sure they are the original ones (no noise permanently
            # applied throughout the episode).
            policy.exploration.on_episode_end(policy, tf_sess=pol_sess)
            weights_after_episode_end = self._get_current_weight(policy, fw)
            check(initial_weights, weights_after_episode_end)
            # Noise should still be the same (re-sampling only happens at
            # beginning of episode).
            noise_after = self._get_current_noise(policy, fw)
            check(noise, noise_after)
            trainer.stop()

            # Switch off underlying exploration entirely.
            # ----
            config = core_config.copy()
            if trainer_cls is dqn.DQN:
                sub_config = {
                    "type": "EpsilonGreedy",
                    "initial_epsilon": 0.0,  # <- no randomness whatsoever
                    "final_epsilon": 0.0,
                }
            else:
                sub_config = {
                    "type": "OrnsteinUhlenbeckNoise",
                    "initial_scale": 0.0,  # <- no randomness whatsoever
                    "final_scale": 0.0,
                    "random_timesteps": 0,
                }
            config["exploration_config"] = {
                "type": "ParameterNoise",
                "sub_exploration": sub_config,
            }
            config["explore"] = True
            trainer = trainer_cls(config=config, env=env)
            # Now, when we act - even with explore=True - we would expect
            # the same action for the same input (parameter noise is
            # deterministic).
            policy = trainer.get_policy()
            # Re-fetch the session for the freshly created policy (the old
            # trainer has been stopped).
            pol_sess = policy.get_session()
            policy.exploration.on_episode_start(policy, tf_sess=pol_sess)
            a_ = trainer.compute_single_action(obs)
            for _ in range(10):
                a = trainer.compute_single_action(obs, explore=True)
                check(a, a_)
            trainer.stop()

    def _get_current_noise(self, policy, fw):
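        """Returns one scalar sample of the policy's current noise tensor."""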
        # If noise not even created yet, return 0.0.
        if policy.exploration.noise is None:
            return 0.0

        noise = policy.exploration.noise[0][0][0]
        if fw == "tf":
            noise = policy.get_session().run(noise)
        elif fw == "torch":
            noise = noise.detach().cpu().numpy()
        else:
            noise = noise.numpy()
        return noise

    def _get_current_weight(self, policy, fw):
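        """Returns one scalar weight from the model's first layer."""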
        weights = policy.get_weights()
        if fw == "torch":
            # DQN model.
            if "_hidden_layers.0._model.0.weight" in weights:
                return weights["_hidden_layers.0._model.0.weight"][0][0]
            # DDPG model.
            else:
                return weights["policy_model.action_0._model.0.weight"][0][0]
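        # For tf2/eager, `get_weights()` returns a list of arrays (index 0);
        # for graph-mode tf, a dict keyed by variable name (take first key).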
        key = 0 if fw in ["tf2", "tfe"] else list(weights.keys())[0]
        return weights[key][0][0]


if __name__ == "__main__":
    import pytest
    import sys

    sys.exit(pytest.main(["-v", __file__]))