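"""Tests for RLlib's Curiosity (ICM-based) exploration module.

Two PPO learning tests: a sparse-reward 8x8 FrozenLake and a partially
observable MiniGrid environment, both with horizons short enough that a
purely random (non-curious) policy is unlikely to reach the goal.
"""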
from collections import deque
import gym
import gym_minigrid
import numpy as np
import sys
import unittest

import ray
from ray import tune
from ray.rllib.agents.callbacks import DefaultCallbacks
import ray.rllib.agents.ppo as ppo
from ray.rllib.utils.test_utils import check_learning_achieved, framework_iterator
from ray.rllib.utils.numpy import one_hot
from ray.tune import register_env

class MyCallBack(DefaultCallbacks):
    def __init__(self):
        super().__init__()
        self.deltas = []

    def on_postprocess_trajectory(
        self,
        *,
        worker,
        episode,
        agent_id,
        policy_id,
        policies,
        postprocessed_batch,
        original_batches,
        **kwargs
    ):
        # FrozenLake observations arrive one-hot encoded, so argmax recovers
        # the discrete cell index, which we decode into (x, y) on the 8x8 map.
        pos = np.argmax(postprocessed_batch["obs"], -1)
        x, y = pos % 8, pos // 8
        self.deltas.extend((x ** 2 + y ** 2) ** 0.5)

    def on_sample_end(self, *, worker, samples, **kwargs):
        print("Mean distance from origin={}".format(np.mean(self.deltas)))
        self.deltas = []

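# MyCallBack is hooked into the PPO config further below (via
# `config["callbacks"] = MyCallBack`), so the average distance travelled from
# the start state is printed after every sampling round as a rough measure of
# exploration progress.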
class OneHotWrapper(gym.core.ObservationWrapper):
    def __init__(self, env, vector_index, framestack):
        super().__init__(env)
        self.framestack = framestack
        # 49=7x7 field of vision; 11=object types; 6=colors; 3=state types.
        # +4: Direction.
        self.single_frame_dim = 49 * (11 + 6 + 3) + 4
        self.init_x = None
        self.init_y = None
        self.x_positions = []
        self.y_positions = []
        self.x_y_delta_buffer = deque(maxlen=100)
        self.vector_index = vector_index
        self.frame_buffer = deque(maxlen=self.framestack)
        for _ in range(self.framestack):
            self.frame_buffer.append(np.zeros((self.single_frame_dim,)))

        self.observation_space = gym.spaces.Box(
            0.0, 1.0, shape=(self.single_frame_dim * self.framestack,), dtype=np.float32
        )

    def observation(self, obs):
        # Debug output: max. distance travelled per episode, to watch
        # exploration progress.
        if self.step_count == 0:
            for _ in range(self.framestack):
                self.frame_buffer.append(np.zeros((self.single_frame_dim,)))
            if self.vector_index == 0:
                if self.x_positions:
                    max_diff = max(
                        np.sqrt(
                            (np.array(self.x_positions) - self.init_x) ** 2
                            + (np.array(self.y_positions) - self.init_y) ** 2
                        )
                    )
                    self.x_y_delta_buffer.append(max_diff)
                    print(
                        "Max. distance travelled (avg. over last 100 episodes)={}".format(
                            np.mean(self.x_y_delta_buffer)
                        )
                    )
                    self.x_positions = []
                    self.y_positions = []
                self.init_x = self.agent_pos[0]
                self.init_y = self.agent_pos[1]

        # Are we carrying the key?
        # if self.carrying is not None:
        #     print("Carrying KEY!!")

        self.x_positions.append(self.agent_pos[0])
        self.y_positions.append(self.agent_pos[1])

        # One-hot the last dim into 11, 6, 3 one-hot vectors, then flatten.
        objects = one_hot(obs[:, :, 0], depth=11)
        colors = one_hot(obs[:, :, 1], depth=6)
        states = one_hot(obs[:, :, 2], depth=3)
        # Is the door we see open?
        # for x in range(7):
        #     for y in range(7):
        #         if objects[x, y, 4] == 1.0 and states[x, y, 0] == 1.0:
        #             print("Door OPEN!!")

        all_ = np.concatenate([objects, colors, states], -1)
        all_flat = np.reshape(all_, (-1,))
        direction = one_hot(np.array(self.agent_dir), depth=4).astype(np.float32)
        single_frame = np.concatenate([all_flat, direction])
        self.frame_buffer.append(single_frame)
        return np.concatenate(self.frame_buffer)

def env_maker(config):
    name = config.get("name", "MiniGrid-Empty-5x5-v0")
    framestack = config.get("framestack", 4)
    env = gym.make(name)
    # Only use image portion of observation (discard goal and direction).
    env = gym_minigrid.wrappers.ImgObsWrapper(env)
    env = OneHotWrapper(
        env,
        config.vector_index if hasattr(config, "vector_index") else 0,
        framestack=framestack,
    )
    return env


register_env("mini-grid", env_maker)
CONV_FILTERS = [[16, [11, 11], 3], [32, [9, 9], 3], [64, [5, 5], 3]]

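# Illustrative sketch: building the wrapped env directly and checking the
# flattened observation size (assumes gym_minigrid registers
# "MiniGrid-Empty-8x8-v0" upon import):
# env = env_maker({"name": "MiniGrid-Empty-8x8-v0", "framestack": 1})
# obs = env.reset()
# assert obs.shape == (49 * (11 + 6 + 3) + 4,)  # -> (984,) for a single frame.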
class TestCuriosity(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        ray.init(num_cpus=3)

    @classmethod
    def tearDownClass(cls):
        ray.shutdown()

    def test_curiosity_on_frozen_lake(self):
        config = ppo.DEFAULT_CONFIG.copy()
        # A very large frozen lake that's hard for a random policy to solve
        # due to its sparse (all 0.0) reward feedback.
        config["env"] = "FrozenLake-v1"
        config["env_config"] = {
            "desc": [
                "SFFFFFFF",
                "FFFFFFFF",
                "FFFFFFFF",
                "FFFFFFFF",
                "FFFFFFFF",
                "FFFFFFFF",
                "FFFFFFFF",
                "FFFFFFFG",
            ],
            "is_slippery": False,
        }
        # Print out observations to see how far we already get inside the Env.
        config["callbacks"] = MyCallBack
        # Limit the horizon to make it really hard for a non-curious agent to
        # reach the goal state.
        config["horizon"] = 16
        # Local only.
        config["num_workers"] = 0
        config["lr"] = 0.001

        num_iterations = 10
        for _ in framework_iterator(config, frameworks=("tf", "torch")):
            # W/ Curiosity. Expect to learn something.
            config["exploration_config"] = {
                "type": "Curiosity",
                # Weight of the intrinsic (curiosity) reward.
                "eta": 0.2,
                # Learning rate of the curiosity (ICM) module.
                "lr": 0.001,
                # Dimensionality of the learnt feature vectors.
                "feature_dim": 128,
                "feature_net_config": {
                    "fcnet_hiddens": [],
                    "fcnet_activation": "relu",
                },
                # Underlying exploration used for the actual action sampling.
                "sub_exploration": {
                    "type": "StochasticSampling",
                },
            }
            trainer = ppo.PPOTrainer(config=config)
            learnt = False
            for i in range(num_iterations):
                result = trainer.train()
                print(result)
                if result["episode_reward_max"] > 0.0:
                    print("Reached goal after {} iters!".format(i))
                    learnt = True
                    break
            trainer.stop()
            self.assertTrue(learnt)

            # Disabled for now: adds too much flakiness to the test.
            # if fw == "tf":
            #     # W/o Curiosity. Expect to learn nothing.
            #     print("Trying w/o curiosity (not expected to learn).")
            #     config["exploration_config"] = {
            #         "type": "StochasticSampling",
            #     }
            #     trainer = ppo.PPOTrainer(config=config)
            #     rewards_wo = 0.0
            #     for _ in range(num_iterations):
            #         result = trainer.train()
            #         rewards_wo += result["episode_reward_mean"]
            #         print(result)
            #     trainer.stop()
            #     self.assertTrue(rewards_wo == 0.0)
            #     print("Did not reach goal w/o curiosity!")

    def test_curiosity_on_partially_observable_domain(self):
        config = ppo.DEFAULT_CONFIG.copy()
        config["env"] = "mini-grid"
        config["env_config"] = {
            # Also works with:
            # - MiniGrid-MultiRoom-N4-S5-v0
            # - MiniGrid-MultiRoom-N2-S4-v0
            "name": "MiniGrid-Empty-8x8-v0",
            "framestack": 1,  # seems to work even w/o framestacking
        }
        config["horizon"] = 15  # Make it impossible to reach goal by chance.
        config["num_envs_per_worker"] = 4
        config["model"]["fcnet_hiddens"] = [256, 256]
        config["model"]["fcnet_activation"] = "relu"
        config["num_sgd_iter"] = 8
        config["num_workers"] = 0

        config["exploration_config"] = {
            "type": "Curiosity",
            # For the feature NN, use a non-LSTM fcnet (same as the one
            # in the policy model).
            "eta": 0.1,
            "lr": 0.0003,  # 0.0003 or 0.0005 seem to work fine as well.
            "feature_dim": 64,
            # No actual feature net: map directly from observations to feature
            # vector (linearly).
            "feature_net_config": {
                "fcnet_hiddens": [],
                "fcnet_activation": "relu",
            },
            "sub_exploration": {
                "type": "StochasticSampling",
            },
        }

        min_reward = 0.001
        stop = {
            "training_iteration": 25,
            "episode_reward_mean": min_reward,
        }
        for _ in framework_iterator(config, frameworks="torch"):
            # To replay:
            # trainer = ppo.PPOTrainer(config=config)
            # trainer.restore("[checkpoint file]")
            # env = env_maker(config["env_config"])
            # s = env.reset()
            # for _ in range(10000):
            #     s, r, d, _ = env.step(trainer.compute_single_action(s))
            #     if d:
            #         s = env.reset()
            #     env.render()

            results = tune.run("PPO", config=config, stop=stop, verbose=1)
            check_learning_achieved(results, min_reward)
            iters = results.trials[0].last_result["training_iteration"]
            print("Reached goal in {} iterations.".format(iters))

            # config_wo = config.copy()
            # config_wo["exploration_config"] = {"type": "StochasticSampling"}
            # stop_wo = stop.copy()
            # stop_wo["training_iteration"] = iters
            # results = tune.run(
            #     "PPO", config=config_wo, stop=stop_wo, verbose=1)
            # try:
            #     check_learning_achieved(results, min_reward)
            # except ValueError:
            #     print("Did not learn w/o curiosity (expected).")
            # else:
            #     raise ValueError("Learnt w/o curiosity (not expected)!")

if __name__ == "__main__":
    import pytest

    sys.exit(pytest.main(["-v", __file__]))