ray/rllib/policy/tests/test_compute_log_likelihoods.py

import numpy as np
from scipy.stats import norm
import unittest

import ray.rllib.agents.dqn as dqn
import ray.rllib.agents.ppo as ppo
import ray.rllib.agents.sac as sac
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.test_utils import check
from ray.rllib.utils.numpy import one_hot, fc, MIN_LOG_NN_OUTPUT, \
    MAX_LOG_NN_OUTPUT

tf = try_import_tf()


def _manual_mean_logstd(fw, weights, obs_batch, layer_key):
    """Manually forward-passes `obs_batch` through two dense layers.

    Args:
        fw (str): Framework name. "tf" and "eager" use the tf weight
            names/indices, anything else the torch weight names.
        weights: Return value of `policy.get_weights()`; either a list
            (indexed via `layer_key[1]`) or a dict keyed by variable name.
        obs_batch (np.ndarray): Observations to push through the layers.
        layer_key (tuple): `(name, (hidden_idx, out_idx))`, see
            `test_log_likelihood`.

    Returns:
        The result of `fc(fc(obs_batch, hidden_kernel), out_kernel)`.
    """
    if fw in ("tf", "eager"):
        if isinstance(weights, list):
            hidden_idx, out_idx = layer_key[1]
            hidden_kernel = weights[hidden_idx]
            out_kernel = weights[out_idx]
        else:
            hidden_kernel = weights["default_policy/{}_1/kernel".format(
                layer_key[0])]
            out_kernel = weights["default_policy/{}_out/kernel".format(
                layer_key[0])]
    else:
        hidden_kernel = weights["_hidden_layers.0._model.0.weight"]
        out_kernel = weights["_logits._model.0.weight"]
    return fc(fc(obs_batch, hidden_kernel), out_kernel)


def _policy_logp(policy, action, obs_batch, prev_a, prev_r):
    """Returns `policy`'s log-likelihood for one `action` given `obs_batch`."""
    # NOTE(review): with prev_a=None (and hence prev_r=None) these become
    # object arrays holding `None`; presumably policies without prev-action/
    # prev-reward inputs ignore them - confirm.
    return policy.compute_log_likelihoods(
        np.array([action]),
        obs_batch,
        prev_action_batch=np.array([prev_a]),
        prev_reward_batch=np.array([prev_r]))


def test_log_likelihood(run,
                        config,
                        prev_a=None,
                        continuous=False,
                        layer_key=("fc", (0, 4)),
                        logp_func=None):
    """Checks `Policy.compute_log_likelihoods()` of a Trainer's policy.

    For each framework ("tf", "eager", "torch"; "torch" is skipped for DQN
    and SAC) a trainer is built, 500 actions are sampled with `explore=True`
    for one fixed observation, and the policy's log-likelihoods are compared
    against expected values:
    - continuous: for the first 50 samples, against a Gaussian log-density
      (or `logp_func`) whose mean/log-std come from a manual forward pass
      through the policy's weights.
    - discrete: for each of the actions 0..3, against the log of its
      empirical frequency among the sampled actions.

    Args:
        run: Trainer class; instantiated as `run(config=config, env=env)`.
        config (dict): Trainer config. It is shallow-copied; only top-level
            keys are set on the copy.
        prev_a: Previous action passed to `compute_action()` and
            `compute_log_likelihoods()`. If None, the previous reward is
            None as well, otherwise it is 0.0.
        continuous (bool): If True, use "Pendulum-v0", otherwise
            "FrozenLake-v0" (4x4, not slippery).
        layer_key (tuple): `(name, (hidden_idx, out_idx))`. The indices
            select the two kernels if the weights are a list; `name` is
            used to build the variable names if they are a dict.
        logp_func: Optional callable `(mean, log_std, action) -> logp` that
            replaces the plain Gaussian log-density (continuous case only).
    """
    config = config.copy()
    # Run locally.
    config["num_workers"] = 0
    # Env setup.
    if continuous:
        env = "Pendulum-v0"
        obs_batch = preprocessed_obs_batch = np.array([[0.0, 0.1, -0.1]])
    else:
        env = "FrozenLake-v0"
        config["env_config"] = {"is_slippery": False, "map_name": "4x4"}
        obs_batch = np.array([0])
        preprocessed_obs_batch = one_hot(obs_batch, depth=16)

    # Use Soft-Q for DQNs.
    if run is dqn.DQNTrainer:
        config["exploration_config"] = {"type": "SoftQ", "temperature": 0.5}

    prev_r = None if prev_a is None else np.array(0.0)

    # Test against all frameworks.
    for fw in ["tf", "eager", "torch"]:
        if run in [dqn.DQNTrainer, sac.SACTrainer] and fw == "torch":
            continue
        print("Testing {} with framework={}".format(run, fw))
        config["eager"] = fw == "eager"
        config["use_pytorch"] = fw == "torch"

        trainer = run(config=config, env=env)
        try:
            policy = trainer.get_policy()
            weights = policy.get_weights()
            # Sample n actions, then roughly check their logp against their
            # counts.
            num_actions = 500
            actions = [
                # Single action from single obs.
                trainer.compute_action(
                    obs_batch[0],
                    prev_action=prev_a,
                    prev_reward=prev_r,
                    explore=True) for _ in range(num_actions)
            ]

            # Test 50 actions for their log-likelihoods vs expected values.
            if continuous:
                # The distribution parameters only depend on the weights and
                # the (fixed) observation, so compute them once.
                expected_mean_logstd = _manual_mean_logstd(
                    fw, weights, obs_batch, layer_key)
                mean, log_std = np.split(expected_mean_logstd, 2, axis=-1)
                for a in actions[:50]:
                    if logp_func is None:
                        # logpdf (instead of log(pdf)) does not underflow
                        # to -inf for very unlikely actions.
                        expected_logp = norm.logpdf(a, mean, np.exp(log_std))
                    else:
                        expected_logp = logp_func(mean, log_std, a)
                    logp = _policy_logp(policy, a, preprocessed_obs_batch,
                                        prev_a, prev_r)
                    check(logp, expected_logp[0], rtol=0.2)
            # Test all available actions for their logp values.
            else:
                for a in [0, 1, 2, 3]:
                    count = actions.count(a)
                    expected_logp = np.log(count / num_actions)
                    logp = _policy_logp(policy, a, preprocessed_obs_batch,
                                        prev_a, prev_r)
                    check(logp, expected_logp, rtol=0.3)
        finally:
            # Release this trainer before the next framework builds its own.
            trainer.stop()


# This is a shared helper, not a test case: without this flag pytest would
# collect it (the name matches `test_*`) and fail on the unknown fixtures
# `run` and `config`.
test_log_likelihood.__test__ = False


class TestComputeLogLikelihood(unittest.TestCase):
    """Runs `test_log_likelihood` against DQN, PPO and SAC trainers."""

    def test_dqn(self):
        """Tests, whether DQN correctly computes logp in soft-q mode."""
        test_log_likelihood(dqn.DQNTrainer, dqn.DEFAULT_CONFIG)

    def test_ppo_cont(self):
        """Tests PPO's (cont. actions) compute_log_likelihoods method."""
        config = ppo.DEFAULT_CONFIG.copy()
        # `copy()` is shallow: replace the nested dict with a modified copy
        # instead of editing it in place, which would also change
        # `ppo.DEFAULT_CONFIG["model"]` for all tests that run afterwards.
        config["model"] = dict(
            config["model"], fcnet_hiddens=[10], fcnet_activation="linear")
        prev_a = np.array([0.0])
        test_log_likelihood(ppo.PPOTrainer, config, prev_a, continuous=True)

    def test_ppo_discr(self):
        """Tests PPO's (discr. actions) compute_log_likelihoods method."""
        prev_a = np.array(0)
        test_log_likelihood(ppo.PPOTrainer, ppo.DEFAULT_CONFIG, prev_a)

    def test_sac(self):
        """Tests SAC's compute_log_likelihoods method."""
        config = sac.DEFAULT_CONFIG.copy()
        # Same as in `test_ppo_cont`: do not mutate the shared default
        # `policy_model` dict through the shallow copy.
        config["policy_model"] = dict(
            config["policy_model"],
            hidden_layer_sizes=[10],
            hidden_activation="linear")
        prev_a = np.array([0.0])

        def logp_func(means, log_stds, values, low=-1.0, high=1.0):
            """Returns the log-prob of `values` under a tanh-squashed normal.

            `values` are expected within (`low`, `high`); they are rescaled
            to (-1, 1) and un-squashed via arctanh before being scored.
            """
            stds = np.exp(
                np.clip(log_stds, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT))
            unsquashed_values = np.arctanh((values - low) /
                                           (high - low) * 2.0 - 1.0)
            # logpdf (instead of log(pdf)) does not underflow to -inf.
            log_prob_unsquashed = \
                np.sum(norm.logpdf(unsquashed_values, means, stds), -1)
            # Subtract the log-derivative of the tanh squashing.
            return log_prob_unsquashed - \
                np.sum(np.log(1 - np.tanh(unsquashed_values) ** 2),
                       axis=-1)

        test_log_likelihood(
            sac.SACTrainer,
            config,
            prev_a,
            continuous=True,
            layer_key=("sequential/action", (0, 2)),
            logp_func=logp_func)
[RLlib] Policy.compute_log_likelihoods() and SAC refactor. (issue #7107) (#7124) * Exploration API (+EpsilonGreedy sub-class). * Exploration API (+EpsilonGreedy sub-class). * Cleanup/LINT. * Add `deterministic` to generic Trainer config (NOTE: this is still ignored by most Agents). * Add `error` option to deprecation_warning(). * WIP. * Bug fix: Get exploration-info for tf framework. Bug fix: Properly deprecate some DQN config keys. * WIP. * LINT. * WIP. * Split PerWorkerEpsilonGreedy out of EpsilonGreedy. Docstrings. * Fix bug in sampler.py in case Policy has self.exploration = None * Update rllib/agents/dqn/dqn.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * WIP. * Update rllib/agents/trainer.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * WIP. * Change requests. * LINT * In tune/utils/util.py::deep_update() Only keep deep_updat'ing if both original and value are dicts. If value is not a dict, set * Completely obsolete syn_replay_optimizer.py's parameters schedule_max_timesteps AND beta_annealing_fraction (replaced with prioritized_replay_beta_annealing_timesteps). * Update rllib/evaluation/worker_set.py Co-Authored-By: Eric Liang <ekhliang@gmail.com> * Review fixes. * Fix default value for DQN's exploration spec. * LINT * Fix recursion bug (wrong parent c'tor). * Do not pass timestep to get_exploration_info. * Update tf_policy.py * Fix some remaining issues with test cases and remove more deprecated DQN/APEX exploration configs. * Bug fix tf-action-dist * DDPG incompatibility bug fix with new DQN exploration handling (which is imported by DDPG). * Switch off exploration when getting action probs from off-policy-estimator's policy. * LINT * Fix test_checkpoint_restore.py. * Deprecate all SAC exploration (unused) configs. * Properly use `model.last_output()` everywhere. Instead of `model._last_output`. * WIP. * Take out set_epsilon from multi-agent-env test (not needed, decays anyway). * WIP. * Trigger re-test (flaky checkpoint-restore test). * WIP. 
* WIP. * Add test case for deterministic action sampling in PPO. * bug fix. * Added deterministic test cases for different Agents. * Fix problem with TupleActions in dynamic-tf-policy. * Separate supported_spaces tests so they can be run separately for easier debugging. * LINT. * Fix autoregressive_action_dist.py test case. * Re-test. * Fix. * Remove duplicate py_test rule from bazel. * LINT. * WIP. * WIP. * SAC fix. * SAC fix. * WIP. * WIP. * WIP. * FIX 2 examples tests. * WIP. * WIP. * WIP. * WIP. * WIP. * Fix. * LINT. * Renamed test file. * WIP. * Add unittest.main. * Make action_dist_class mandatory. * fix * FIX. * WIP. * WIP. * Fix. * Fix. * Fix explorations test case (contextlib cannot find its own nullcontext??). * Force torch to be installed for QMIX. * LINT. * Fix determine_tests_to_run.py. * Fix determine_tests_to_run.py. * WIP * Add Random exploration component to tests (fixed issue with "static-graph randomness" via py_function). * Add Random exploration component to tests (fixed issue with "static-graph randomness" via py_function). * Rename some stuff. * Rename some stuff. * WIP. * WIP. * Fix SAC. * Fix SAC. * Fix strange tf-error in ray core tests. * Fix strange ray-core tf-error in test_memory_scheduling test case. * Fix test_io.py. * LINT. * Update SAC yaml files' config. Co-authored-by: Eric Liang <ekhliang@gmail.com> 2020-02-22 23:19:49 +01:00			`import numpy as np`
			`from scipy.stats import norm`
			`import unittest`

			`import ray.rllib.agents.dqn as dqn`
			`import ray.rllib.agents.ppo as ppo`
			`import ray.rllib.agents.sac as sac`
			`from ray.rllib.utils.framework import try_import_tf`
			`from ray.rllib.utils.test_utils import check`
			`from ray.rllib.utils.numpy import one_hot, fc, MIN_LOG_NN_OUTPUT, \`
			`MAX_LOG_NN_OUTPUT`

			`tf = try_import_tf()`


			`def test_log_likelihood(run,`
			`config,`
			`prev_a=None,`
			`continuous=False,`
			`layer_key=("fc", (0, 4)),`
			`logp_func=None):`
			`config = config.copy()`
			`# Run locally.`
			`config["num_workers"] = 0`
			`# Env setup.`
			`if continuous:`
			`env = "Pendulum-v0"`
			`obs_batch = preprocessed_obs_batch = np.array([[0.0, 0.1, -0.1]])`
			`else:`
			`env = "FrozenLake-v0"`
			`config["env_config"] = {"is_slippery": False, "map_name": "4x4"}`
			`obs_batch = np.array([0])`
			`preprocessed_obs_batch = one_hot(obs_batch, depth=16)`

			`# Use Soft-Q for DQNs.`
			`if run is dqn.DQNTrainer:`
			`config["exploration_config"] = {"type": "SoftQ", "temperature": 0.5}`

			`prev_r = None if prev_a is None else np.array(0.0)`

			`# Test against all frameworks.`
			`for fw in ["tf", "eager", "torch"]:`
			`if run in [dqn.DQNTrainer, sac.SACTrainer] and fw == "torch":`
			`continue`
			`print("Testing {} with framework={}".format(run, fw))`
			`config["eager"] = True if fw == "eager" else False`
			`config["use_pytorch"] = True if fw == "torch" else False`

			`trainer = run(config=config, env=env)`
			`policy = trainer.get_policy()`
			`vars = policy.get_weights()`
			`# Sample n actions, then roughly check their logp against their`
			`# counts.`
			`num_actions = 500`
			`actions = []`
			`for _ in range(num_actions):`
			`# Single action from single obs.`
			`actions.append(`
			`trainer.compute_action(`
			`obs_batch[0],`
			`prev_action=prev_a,`
			`prev_reward=prev_r,`
			`explore=True))`

			`# Test 50 actions for their log-likelihoods vs expected values.`
			`if continuous:`
			`for idx in range(50):`
			`a = actions[idx]`
			`if fw == "tf" or fw == "eager":`
			`if isinstance(vars, list):`
			`expected_mean_logstd = fc(`
			`fc(obs_batch, vars[layer_key[1][0]]),`
			`vars[layer_key[1][1]])`
			`else:`
			`expected_mean_logstd = fc(`
			`fc(`
			`obs_batch,`
			`vars["default_policy/{}_1/kernel".format(`
			`layer_key[0])]),`
			`vars["default_policy/{}_out/kernel".format(`
			`layer_key[0])])`
			`else:`
			`expected_mean_logstd = fc(`
			`fc(obs_batch,`
			`vars["_hidden_layers.0._model.0.weight"]),`
			`vars["_logits._model.0.weight"])`
			`mean, log_std = np.split(expected_mean_logstd, 2, axis=-1)`
			`if logp_func is None:`
			`expected_logp = np.log(norm.pdf(a, mean, np.exp(log_std)))`
			`else:`
			`expected_logp = logp_func(mean, log_std, a)`
			`logp = policy.compute_log_likelihoods(`
			`np.array([a]),`
			`preprocessed_obs_batch,`
			`prev_action_batch=np.array([prev_a]),`
			`prev_reward_batch=np.array([prev_r]))`
			`check(logp, expected_logp[0], rtol=0.2)`
			`# Test all available actions for their logp values.`
			`else:`
			`for a in [0, 1, 2, 3]:`
			`count = actions.count(a)`
			`expected_logp = np.log(count / num_actions)`
			`logp = policy.compute_log_likelihoods(`
			`np.array([a]),`
			`preprocessed_obs_batch,`
			`prev_action_batch=np.array([prev_a]),`
			`prev_reward_batch=np.array([prev_r]))`
			`check(logp, expected_logp, rtol=0.3)`


			`class TestComputeLogLikelihood(unittest.TestCase):`
			`def test_dqn(self):`
			`"""Tests, whether DQN correctly computes logp in soft-q mode."""`
			`test_log_likelihood(dqn.DQNTrainer, dqn.DEFAULT_CONFIG)`

			`def test_ppo_cont(self):`
			`"""Tests PPO's (cont. actions) compute_log_likelihoods method."""`
			`config = ppo.DEFAULT_CONFIG.copy()`
			`config["model"]["fcnet_hiddens"] = [10]`
			`config["model"]["fcnet_activation"] = "linear"`
			`prev_a = np.array([0.0])`
			`test_log_likelihood(ppo.PPOTrainer, config, prev_a, continuous=True)`

			`def test_ppo_discr(self):`
			`"""Tests PPO's (discr. actions) compute_log_likelihoods method."""`
			`prev_a = np.array(0)`
			`test_log_likelihood(ppo.PPOTrainer, ppo.DEFAULT_CONFIG, prev_a)`

			`def test_sac(self):`
			`"""Tests SAC's compute_log_likelihoods method."""`
			`config = sac.DEFAULT_CONFIG.copy()`
			`config["policy_model"]["hidden_layer_sizes"] = [10]`
			`config["policy_model"]["hidden_activation"] = "linear"`
			`prev_a = np.array([0.0])`

			`def logp_func(means, log_stds, values, low=-1.0, high=1.0):`
			`stds = np.exp(`
			`np.clip(log_stds, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT))`
			`unsquashed_values = np.arctanh((values - low) /`
			`(high - low) * 2.0 - 1.0)`
			`log_prob_unsquashed = \`
			`np.sum(np.log(norm.pdf(unsquashed_values, means, stds)), -1)`
			`return log_prob_unsquashed - \`
			`np.sum(np.log(1 - np.tanh(unsquashed_values) ** 2),`
			`axis=-1)`

			`test_log_likelihood(`
			`sac.SACTrainer,`
			`config,`
			`prev_a,`
			`continuous=True,`
			`layer_key=("sequential/action", (0, 2)),`
			`logp_func=logp_func)`