ray/rllib/utils/exploration/tests/test_curiosity.py
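"""Tests RLlib's Curiosity exploration module on sparse-reward environments.

Runs PPO with the "Curiosity" exploration config on a deterministic 8x8
FrozenLake and on a partially observable MiniGrid env, and checks that the
goal is reached (i.e. a positive reward is achieved) within a small number
of training iterations.
"""
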
from collections import deque
import gym
import gym_minigrid
import numpy as np
import sys
import unittest

import ray
from ray import tune
from ray.rllib.agents.callbacks import DefaultCallbacks
import ray.rllib.agents.ppo as ppo
from ray.rllib.utils.test_utils import check_learning_achieved, framework_iterator
from ray.rllib.utils.numpy import one_hot
from ray.tune import register_env
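

# Callback that tracks how far the agent gets from the start state on the
# 8x8 FrozenLake grid: per-step Euclidean distances from the origin are
# collected after each trajectory and their mean is printed (and reset) at
# the end of every sample batch, making exploration progress visible in the
# logs.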
class MyCallBack(DefaultCallbacks):
def __init__(self):
super().__init__()
self.deltas = []

    def on_postprocess_trajectory(
self,
*,
worker,
episode,
agent_id,
policy_id,
policies,
postprocessed_batch,
original_batches,
**kwargs
):
pos = np.argmax(postprocessed_batch["obs"], -1)
x, y = pos % 8, pos // 8
self.deltas.extend((x ** 2 + y ** 2) ** 0.5)

    def on_sample_end(self, *, worker, samples, **kwargs):
        print("Mean distance from origin={}".format(np.mean(self.deltas)))
        self.deltas = []
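

# Observation wrapper for MiniGrid: one-hot encodes the 7x7x3 image (11 object
# types, 6 colors, 3 state types) plus the agent's direction (4) into a single
# flat float vector and stacks the last `framestack` frames. For the first
# sub-env (vector_index 0), it also logs a running average of the per-episode
# max. distance travelled to monitor exploration.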
class OneHotWrapper(gym.core.ObservationWrapper):
def __init__(self, env, vector_index, framestack):
super().__init__(env)
self.framestack = framestack
# 49=7x7 field of vision; 11=object types; 6=colors; 3=state types.
# +4: Direction.
self.single_frame_dim = 49 * (11 + 6 + 3) + 4
self.init_x = None
self.init_y = None
self.x_positions = []
self.y_positions = []
self.x_y_delta_buffer = deque(maxlen=100)
self.vector_index = vector_index
self.frame_buffer = deque(maxlen=self.framestack)
for _ in range(self.framestack):
self.frame_buffer.append(np.zeros((self.single_frame_dim,)))
self.observation_space = gym.spaces.Box(
0.0, 1.0, shape=(self.single_frame_dim * self.framestack,), dtype=np.float32
)

    def observation(self, obs):
# Debug output: max-x/y positions to watch exploration progress.
if self.step_count == 0:
for _ in range(self.framestack):
self.frame_buffer.append(np.zeros((self.single_frame_dim,)))
if self.vector_index == 0:
if self.x_positions:
max_diff = max(
np.sqrt(
(np.array(self.x_positions) - self.init_x) ** 2
+ (np.array(self.y_positions) - self.init_y) ** 2
)
)
self.x_y_delta_buffer.append(max_diff)
print(
"100-average dist travelled={}".format(
np.mean(self.x_y_delta_buffer)
)
)
self.x_positions = []
self.y_positions = []
self.init_x = self.agent_pos[0]
self.init_y = self.agent_pos[1]
# Are we carrying the key?
# if self.carrying is not None:
# print("Carrying KEY!!")
self.x_positions.append(self.agent_pos[0])
self.y_positions.append(self.agent_pos[1])
# One-hot the last dim into 11, 6, 3 one-hot vectors, then flatten.
objects = one_hot(obs[:, :, 0], depth=11)
colors = one_hot(obs[:, :, 1], depth=6)
states = one_hot(obs[:, :, 2], depth=3)
# Is the door we see open?
# for x in range(7):
# for y in range(7):
# if objects[x, y, 4] == 1.0 and states[x, y, 0] == 1.0:
# print("Door OPEN!!")
all_ = np.concatenate([objects, colors, states], -1)
all_flat = np.reshape(all_, (-1,))
direction = one_hot(np.array(self.agent_dir), depth=4).astype(np.float32)
single_frame = np.concatenate([all_flat, direction])
self.frame_buffer.append(single_frame)
return np.concatenate(self.frame_buffer)
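

# Env factory: builds the MiniGrid env given by config["name"], keeps only the
# image part of the observation, and applies the one-hot/framestack wrapper
# defined above.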
def env_maker(config):
name = config.get("name", "MiniGrid-Empty-5x5-v0")
framestack = config.get("framestack", 4)
env = gym.make(name)
    # Use only the image portion of the observation (discard the mission
    # string and direction).
env = gym_minigrid.wrappers.ImgObsWrapper(env)
env = OneHotWrapper(
env,
config.vector_index if hasattr(config, "vector_index") else 0,
framestack=framestack,
)
return env
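

# Register the wrapped env under the name "mini-grid" so it can be referenced
# via config["env"] in the tests below.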
register_env("mini-grid", env_maker)
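
# Conv filter spec for image observations; not referenced by the tests in this
# file.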
CONV_FILTERS = [[16, [11, 11], 3], [32, [9, 9], 3], [64, [5, 5], 3]]


class TestCuriosity(unittest.TestCase):
@classmethod
def setUpClass(cls):
ray.init(num_cpus=3)

    @classmethod
def tearDownClass(cls):
ray.shutdown()

    def test_curiosity_on_frozen_lake(self):
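        """Tests Curiosity on a sparse-reward, deterministic 8x8 FrozenLake.

        With curiosity enabled, PPO is expected to reach the goal (and thus a
        positive episode reward) within a handful of training iterations.
        """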
config = ppo.DEFAULT_CONFIG.copy()
        # An 8x8 FrozenLake (larger than the default 4x4) that's hard for a
        # random policy to solve due to the sparse (all 0.0) reward feedback.
config["env"] = "FrozenLake-v1"
config["env_config"] = {
"desc": [
"SFFFFFFF",
"FFFFFFFF",
"FFFFFFFF",
"FFFFFFFF",
"FFFFFFFF",
"FFFFFFFF",
"FFFFFFFF",
"FFFFFFFG",
],
"is_slippery": False,
}
        # Use a callback that prints the (mean) distance from the start state
        # to see how far we already get inside the env.
config["callbacks"] = MyCallBack
        # Limit the horizon to make it really hard for a non-curious agent to
        # reach the goal state.
config["horizon"] = 16
# Local only.
config["num_workers"] = 0
config["lr"] = 0.001
num_iterations = 10
for _ in framework_iterator(config, frameworks=("tf", "torch")):
# W/ Curiosity. Expect to learn something.
config["exploration_config"] = {
"type": "Curiosity",
"eta": 0.2,
"lr": 0.001,
"feature_dim": 128,
"feature_net_config": {
"fcnet_hiddens": [],
"fcnet_activation": "relu",
},
"sub_exploration": {
"type": "StochasticSampling",
},
}
trainer = ppo.PPOTrainer(config=config)
learnt = False
for i in range(num_iterations):
result = trainer.train()
print(result)
if result["episode_reward_max"] > 0.0:
print("Reached goal after {} iters!".format(i))
learnt = True
break
trainer.stop()
self.assertTrue(learnt)
            # Disabled this check for now: it adds too much flakiness to the test.
# if fw == "tf":
# # W/o Curiosity. Expect to learn nothing.
# print("Trying w/o curiosity (not expected to learn).")
# config["exploration_config"] = {
# "type": "StochasticSampling",
# }
# trainer = ppo.PPOTrainer(config=config)
# rewards_wo = 0.0
# for _ in range(num_iterations):
# result = trainer.train()
# rewards_wo += result["episode_reward_mean"]
# print(result)
# trainer.stop()
# self.assertTrue(rewards_wo == 0.0)
# print("Did not reach goal w/o curiosity!")

    def test_curiosity_on_partially_observable_domain(self):
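        """Tests Curiosity on a partially observable MiniGrid env.

        Uses one-hot (optionally framestacked) image observations and a short
        horizon so that reaching the goal purely by chance is very unlikely.
        """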
config = ppo.DEFAULT_CONFIG.copy()
config["env"] = "mini-grid"
config["env_config"] = {
# Also works with:
# - MiniGrid-MultiRoom-N4-S5-v0
# - MiniGrid-MultiRoom-N2-S4-v0
"name": "MiniGrid-Empty-8x8-v0",
"framestack": 1, # seems to work even w/o framestacking
}
config["horizon"] = 15 # Make it impossible to reach goal by chance.
config["num_envs_per_worker"] = 4
config["model"]["fcnet_hiddens"] = [256, 256]
config["model"]["fcnet_activation"] = "relu"
config["num_sgd_iter"] = 8
config["num_workers"] = 0
config["exploration_config"] = {
"type": "Curiosity",
            "eta": 0.1,  # Weight of the intrinsic (curiosity) reward.
            "lr": 0.0003,  # 0.0003 or 0.0005 seem to work fine as well.
            "feature_dim": 64,  # Dimensionality of the learned feature vectors.
            # For the feature NN, use a non-LSTM fcnet (same as the one in the
            # policy model). With no hidden layers, observations are mapped
            # directly (linearly) to the feature vectors.
"feature_net_config": {
"fcnet_hiddens": [],
"fcnet_activation": "relu",
},
"sub_exploration": {
"type": "StochasticSampling",
},
}
min_reward = 0.001
stop = {
"training_iteration": 25,
"episode_reward_mean": min_reward,
}
for _ in framework_iterator(config, frameworks="torch"):
# To replay:
# trainer = ppo.PPOTrainer(config=config)
# trainer.restore("[checkpoint file]")
# env = env_maker(config["env_config"])
# s = env.reset()
# for _ in range(10000):
# s, r, d, _ = env.step(trainer.compute_single_action(s))
# if d:
# s = env.reset()
# env.render()
results = tune.run("PPO", config=config, stop=stop, verbose=1)
check_learning_achieved(results, min_reward)
iters = results.trials[0].last_result["training_iteration"]
print("Reached in {} iterations.".format(iters))
# config_wo = config.copy()
# config_wo["exploration_config"] = {"type": "StochasticSampling"}
# stop_wo = stop.copy()
# stop_wo["training_iteration"] = iters
# results = tune.run(
# "PPO", config=config_wo, stop=stop_wo, verbose=1)
# try:
# check_learning_achieved(results, min_reward)
# except ValueError:
# print("Did not learn w/o curiosity (expected).")
# else:
# raise ValueError("Learnt w/o curiosity (not expected)!")
if __name__ == "__main__":
import pytest
sys.exit(pytest.main(["-v", __file__]))