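"""Unit tests for RLlib's trajectory view API.

Covers the ViewRequirements returned by Policy/Model (with and without LSTM),
a speed comparison of PPO with `_use_trajectory_view_api` on vs. off, and
LSTM sample collection via a RolloutWorker.
"""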
import copy
import time
import unittest

from gym.spaces import Box, Discrete

import ray
import ray.rllib.agents.ppo as ppo
from ray.rllib.examples.env.debug_counter_env import MultiAgentDebugCounterEnv
from ray.rllib.evaluation.rollout_worker import RolloutWorker
from ray.rllib.examples.policy.episode_env_aware_policy import \
    EpisodeEnvAwarePolicy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.test_utils import framework_iterator


class TestTrajectoryViewAPI(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        ray.init()

    @classmethod
    def tearDownClass(cls) -> None:
        ray.shutdown()

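    # The tests below inspect `ViewRequirement` objects. Roughly, each view
    # column declares which underlying data column it is built from
    # (`data_col`, None meaning the column itself) and at which time offset
    # (`shift`), e.g. NEXT_OBS is OBS shifted by +1, PREV_ACTIONS is ACTIONS
    # shifted by -1.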
    def test_traj_view_normal_case(self):
        """Tests whether Model and Policy return the correct ViewRequirements.
        """
        config = ppo.DEFAULT_CONFIG.copy()
        for _ in framework_iterator(config, frameworks="torch"):
            trainer = ppo.PPOTrainer(config, env="CartPole-v0")
            policy = trainer.get_policy()
            view_req_model = policy.model.inference_view_requirements
            view_req_policy = policy.training_view_requirements
            assert len(view_req_model) == 1
            assert len(view_req_policy) == 10
            for key in [
                    SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.REWARDS,
                    SampleBatch.DONES, SampleBatch.NEXT_OBS,
                    SampleBatch.VF_PREDS, "advantages", "value_targets",
                    SampleBatch.ACTION_DIST_INPUTS, SampleBatch.ACTION_LOGP
            ]:
                assert key in view_req_policy
                # None of the view cols has a special underlying data_col,
                # except next-obs.
                if key != SampleBatch.NEXT_OBS:
                    assert view_req_policy[key].data_col is None
                else:
                    assert view_req_policy[key].data_col == SampleBatch.OBS
                    assert view_req_policy[key].shift == 1
            trainer.stop()

    def test_traj_view_lstm_prev_actions_and_rewards(self):
        """Tests whether Policy/Model return correct LSTM ViewRequirements.
        """
        config = ppo.DEFAULT_CONFIG.copy()
        config["model"] = config["model"].copy()
        # Activate LSTM + prev-action + rewards.
        config["model"]["use_lstm"] = True
        config["model"]["lstm_use_prev_action_reward"] = True

        for _ in framework_iterator(config, frameworks="torch"):
            trainer = ppo.PPOTrainer(config, env="CartPole-v0")
            policy = trainer.get_policy()
            view_req_model = policy.model.inference_view_requirements
            view_req_policy = policy.training_view_requirements
            assert len(view_req_model) == 7  # obs, prev_a, prev_r, 4x states
            assert len(view_req_policy) == 16
            for key in [
                    SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.REWARDS,
                    SampleBatch.DONES, SampleBatch.NEXT_OBS,
                    SampleBatch.VF_PREDS, SampleBatch.PREV_ACTIONS,
                    SampleBatch.PREV_REWARDS, "advantages", "value_targets",
                    SampleBatch.ACTION_DIST_INPUTS, SampleBatch.ACTION_LOGP
            ]:
                assert key in view_req_policy

                if key == SampleBatch.PREV_ACTIONS:
                    assert view_req_policy[key].data_col == SampleBatch.ACTIONS
                    assert view_req_policy[key].shift == -1
                elif key == SampleBatch.PREV_REWARDS:
                    assert view_req_policy[key].data_col == SampleBatch.REWARDS
                    assert view_req_policy[key].shift == -1
                elif key not in [
                        SampleBatch.NEXT_OBS, SampleBatch.PREV_ACTIONS,
                        SampleBatch.PREV_REWARDS
                ]:
                    assert view_req_policy[key].data_col is None
                else:
                    assert view_req_policy[key].data_col == SampleBatch.OBS
                    assert view_req_policy[key].shift == 1
            trainer.stop()

    def test_traj_view_lstm_performance(self):
        """Tests whether PPOTrainer runs faster w/ `_use_trajectory_view_api`.
        """
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        action_space = Discrete(2)
        obs_space = Box(-1.0, 1.0, shape=(700, ))

        from ray.rllib.examples.env.random_env import RandomMultiAgentEnv
        from ray.tune import register_env

        register_env("ma_env", lambda c: RandomMultiAgentEnv({
            "num_agents": 2,
            "p_done": 0.01,
            "action_space": action_space,
            "observation_space": obs_space
        }))

        config["num_workers"] = 3
        config["num_envs_per_worker"] = 8
        config["num_sgd_iter"] = 6
        config["model"]["use_lstm"] = True
        config["model"]["lstm_use_prev_action_reward"] = True
        config["model"]["max_seq_len"] = 100

        policies = {
            "pol0": (None, obs_space, action_space, {}),
        }

        def policy_fn(agent_id):
            return "pol0"

        config["multiagent"] = {
            "policies": policies,
            "policy_mapping_fn": policy_fn,
        }
        num_iterations = 1
        # Only works in torch so far.
        for _ in framework_iterator(config, frameworks="torch"):
            print("w/ traj. view API (and time-major)")
            config["_use_trajectory_view_api"] = True
            config["model"]["_time_major"] = True
            trainer = ppo.PPOTrainer(config=config, env="ma_env")
            learn_time_w = 0.0
            sampler_perf = {}
            start = time.time()
            for i in range(num_iterations):
                out = trainer.train()
                sampler_perf_ = out["sampler_perf"]
                # Accumulate per-iteration sampler stats.
                sampler_perf = {
                    k: sampler_perf.get(k, 0.0) + sampler_perf_[k]
                    for k, v in sampler_perf_.items()
                }
                delta = out["timers"]["learn_time_ms"] / 1000
                learn_time_w += delta
                print("{}={}s".format(i, delta))
            # Divide accumulated "mean_" stats by the number of iterations.
            sampler_perf = {
                k: sampler_perf[k] / (num_iterations if "mean_" in k else 1)
                for k, v in sampler_perf.items()
            }
            duration_w = time.time() - start
            print("Duration: {}s "
                  "sampler-perf.={} learn-time/iter={}s".format(
                      duration_w, sampler_perf, learn_time_w / num_iterations))
            trainer.stop()

            print("w/o traj. view API (and w/o time-major)")
            config["_use_trajectory_view_api"] = False
            config["model"]["_time_major"] = False
            trainer = ppo.PPOTrainer(config=config, env="ma_env")
            learn_time_wo = 0.0
            sampler_perf = {}
            start = time.time()
            for i in range(num_iterations):
                out = trainer.train()
                sampler_perf_ = out["sampler_perf"]
                sampler_perf = {
                    k: sampler_perf.get(k, 0.0) + sampler_perf_[k]
                    for k, v in sampler_perf_.items()
                }
                delta = out["timers"]["learn_time_ms"] / 1000
                learn_time_wo += delta
                print("{}={}s".format(i, delta))
            sampler_perf = {
                k: sampler_perf[k] / (num_iterations if "mean_" in k else 1)
                for k, v in sampler_perf.items()
            }
            duration_wo = time.time() - start
            print("Duration: {}s "
                  "sampler-perf.={} learn-time/iter={}s".format(
                      duration_wo, sampler_perf,
                      learn_time_wo / num_iterations))
            trainer.stop()

            # Assert `_use_trajectory_view_api` is much faster.
            self.assertLess(duration_w, duration_wo)
            self.assertLess(learn_time_w, learn_time_wo * 0.6)

    def test_traj_view_lstm_functionality(self):
        """Tests LSTM sample collection under the trajectory view API.
        """
        action_space = Box(-float("inf"), float("inf"), shape=(2, ))
        obs_space = Box(float("-inf"), float("inf"), (4, ))
        max_seq_len = 50
        policies = {
            "pol0": (EpisodeEnvAwarePolicy, obs_space, action_space, {}),
        }

        def policy_fn(agent_id):
            return "pol0"

        rollout_worker = RolloutWorker(
            env_creator=lambda _: MultiAgentDebugCounterEnv({"num_agents": 4}),
            policy_config={
                "multiagent": {
                    "policies": policies,
                    "policy_mapping_fn": policy_fn,
                },
                "_use_trajectory_view_api": True,
                "model": {
                    "use_lstm": True,
                    "_time_major": True,
                    "max_seq_len": max_seq_len,
                },
            },
            policy=policies,
            policy_mapping_fn=policy_fn,
            num_envs=1,
        )
        for i in range(100):
            pc = rollout_worker.sampler.sample_collector. \
                policy_sample_collectors["pol0"]
            sample_batch_offset_before = pc.sample_batch_offset
            buffers = pc.buffers
            result = rollout_worker.sample()
            pol_batch = result.policy_batches["pol0"]

            self.assertTrue(result.count == 100)
            self.assertTrue(pol_batch.count >= 100)
            self.assertFalse(0 in pol_batch.seq_lens)
            # Check prev_reward/action, next_obs consistency.
            for t in range(max_seq_len):
                obs_t = pol_batch["obs"][t]
                r_t = pol_batch["rewards"][t]
                if t > 0:
                    next_obs_t_m_1 = pol_batch["new_obs"][t - 1]
                    self.assertTrue((obs_t == next_obs_t_m_1).all())
                if t < max_seq_len - 1:
                    prev_rewards_t_p_1 = pol_batch["prev_rewards"][t + 1]
                    self.assertTrue((r_t == prev_rewards_t_p_1).all())

            # Check the sanity of all the buffers in the underlying
            # per-policy collector.
            for sample_batch_slot, agent_slot in enumerate(
                    range(sample_batch_offset_before, pc.sample_batch_offset)):
                t_buf = buffers["t"][:, agent_slot]
                obs_buf = buffers["obs"][:, agent_slot]
                # Skip empty seqs at end (these won't be part of the batch
                # and have been copied to new agent-slots (even if seq-len=0)).
                if sample_batch_slot < len(pol_batch.seq_lens):
                    seq_len = pol_batch.seq_lens[sample_batch_slot]
                    # Make sure timesteps are always increasing within the seq.
                    assert all(t_buf[1] + j == n + 1
                               for j, n in enumerate(t_buf)
                               if j < seq_len and j != 0)
                    # Make sure all obs within seq are non-0.0.
                    assert all(
                        any(obs_buf[j] != 0.0) for j in range(1, seq_len + 1))

            # Check seq-lens.
            for agent_slot, seq_len in enumerate(pol_batch.seq_lens):
                if seq_len < max_seq_len - 1:
                    # At least in the beginning, the next slots should always
                    # be empty (once all agent slots have been used once, these
                    # may be filled with "old" values (from longer sequences)).
                    if i < 10:
                        self.assertTrue(
                            (pol_batch["obs"][seq_len +
                                              1][agent_slot] == 0.0).all())
                    self.assertFalse(
                        (pol_batch["obs"][seq_len][agent_slot] == 0.0).all())
if __name__ == "__main__":
|
||
|
import pytest
|
||
|
import sys
|
||
|
sys.exit(pytest.main(["-v", __file__]))
|