Updated pettingzoo env to accommodate API changes and fixes (#11873)

* Updated pettingzoo env to accommodate API changes and fixes

* fixed test failure

* fixed linting issue

* fixed test failure
Benjamin Black 2020-11-09 19:09:49 -05:00 committed by GitHub
parent a9cf0141a0
commit 1999266bba
5 changed files with 89 additions and 167 deletions


@@ -217,11 +217,11 @@ PettingZoo Multi-Agent Environments
from ray.tune.registry import register_env
# import the pettingzoo environment
from pettingzoo.butterfly import prison_v1
from pettingzoo.butterfly import prison_v2
# import rllib pettingzoo interface
from ray.rllib.env import PettingZooEnv
# define how to make the environment. This way takes an optional environment config, num_floors
env_creator = lambda config: prison_v1.env(num_floors=config.get("num_floors", 4))
env_creator = lambda config: prison_v2.env(num_floors=config.get("num_floors", 4))
# register that way to make the environment under an rllib name
register_env('prison', lambda config: PettingZooEnv(env_creator(config)))
# now you can use `prison` as an environment
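
For context, the registered name can then be handed straight to a trainer. A minimal sketch, not part of this change, assuming the `prison` registration above and Ray Tune's standard `tune.run` API:

import ray
from ray import tune

ray.init()
# "prison" is the name registered above; env_config is forwarded to env_creator,
# which passes num_floors on to prison_v2.env()
tune.run(
    "PPO",
    stop={"training_iteration": 5},
    config={
        "env": "prison",
        "env_config": {"num_floors": 4},
        "num_workers": 1,
    },
)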


@@ -56,7 +56,7 @@ mypy
networkx
numba
openpyxl
pettingzoo>=1.3.2
pettingzoo>=1.4.0
Pillow; platform_system != "Windows"
pygments
pytest==5.4.3


@@ -10,70 +10,70 @@ class PettingZooEnv(MultiAgentEnv):
(agent-environment-cycle) game from the PettingZoo project via the
MultiAgentEnv public API.
It reduces the class of AEC games to Partially Observable Markov (POM)
games by imposing the following important restrictions onto an AEC
environment:
Note that the wrapper has some important limitations:
1. Each agent steps in order specified in agents list (unless they are
done, in which case, they should be skipped).
2. Agents act simultaneously (-> No hard-turn games like chess).
3. All agents have the same action_spaces and observation_spaces.
1. All agents have the same action_spaces and observation_spaces.
Note: If, within your aec game, agents do not have homogeneous action /
observation spaces, apply SuperSuit wrappers
to apply padding functionality: https://github.com/PettingZoo-Team/
SuperSuit#built-in-multi-agent-only-functions
4. Environments are positive sum games (-> Agents are expected to cooperate
2. Environments are positive sum games (-> Agents are expected to cooperate
to maximize reward). This isn't a hard restriction, it's just that
standard algorithms aren't expected to work well in highly competitive
games.
Examples:
>>> from pettingzoo.gamma import prison_v0
>>> env = POMGameEnv(env_creator=prison_v0})
>>> from pettingzoo.butterfly import prison_v2
>>> env = PettingZooEnv(prison_v2.env())
>>> obs = env.reset()
>>> print(obs)
{
"0": [110, 119],
"1": [105, 102],
"2": [99, 95],
}
>>> obs, rewards, dones, infos = env.step(
action_dict={
"0": 1, "1": 0, "2": 2,
})
# only returns the observation for the agent which should be stepping
{
'prisoner_0': array([[[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
...,
[0, 0, 0],
[0, 0, 0],
[0, 0, 0]]], dtype=uint8)
}
>>> obs, rewards, dones, infos = env.step({
... "prisoner_0": 1
... })
# only returns the observation, reward, info, etc., for
# the agent whose turn is next.
>>> print(obs)
{
'prisoner_1': array([[[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
...,
[0, 0, 0],
[0, 0, 0],
[0, 0, 0]]], dtype=uint8)
}
>>> print(rewards)
{
"0": 0,
"1": 1,
"2": 0,
}
{
'prisoner_1': 0
}
>>> print(dones)
{
"0": False, # agent 0 is still running
"1": True, # agent 1 is done
"__all__": False, # the env is not done
}
{
'prisoner_1': False, '__all__': False
}
>>> print(infos)
{
"0": {}, # info for agent 0
"1": {}, # info for agent 1
}
{
'prisoner_1': {'map_tuple': (1, 0)}
}
"""
def __init__(self, env):
"""
Parameters:
-----------
env: AECenv object.
"""
self.aec_env = env
self.env = env
# agent idx list
self.agents = self.aec_env.agents
self.agents = self.env.possible_agents
# Get dictionaries of obs_spaces and act_spaces
self.observation_spaces = self.aec_env.observation_spaces
self.action_spaces = self.aec_env.action_spaces
self.observation_spaces = self.env.observation_spaces
self.action_spaces = self.env.action_spaces
# Get first observation space, assuming all agents have equal space
self.observation_space = self.observation_spaces[self.agents[0]]
@@ -83,135 +83,64 @@ class PettingZooEnv(MultiAgentEnv):
assert all(obs_space == self.observation_space
for obs_space
in self.aec_env.observation_spaces.values()), \
in self.env.observation_spaces.values()), \
"Observation spaces for all agents must be identical. Perhaps " \
"SuperSuit's pad_observations wrapper can help (usage: " \
"`supersuit.aec_wrappers.pad_observations(env)`)"
assert all(act_space == self.action_space
for act_space in self.aec_env.action_spaces.values()), \
for act_space in self.env.action_spaces.values()), \
"Action spaces for all agents must be identical. Perhaps " \
"SuperSuit's pad_action_space wrapper can help (usage: " \
"`supersuit.aec_wrappers.pad_action_space(env)`)"
self.rewards = {}
self.dones = {}
self.obs = {}
self.infos = {}
_ = self.reset()
def _init_dicts(self):
# initialize with zero
self.rewards = dict(zip(self.agents, [0 for _ in self.agents]))
# initialize with False
self.dones = dict(zip(self.agents, [False for _ in self.agents]))
self.dones["__all__"] = False
# initialize with None info object
self.infos = dict(zip(self.agents, [{} for _ in self.agents]))
# initialize empty observations
self.obs = dict(zip(self.agents, [None for _ in self.agents]))
self.reset()
def reset(self):
"""
Resets the env and returns observations from ready agents.
self.env.reset()
return {
self.env.agent_selection: self.env.observe(
self.env.agent_selection)
}
Returns:
obs (dict): New observations for each ready agent.
"""
# 1. Reset environment; agent pointer points to first agent.
self.aec_env.reset()
def step(self, action):
self.env.step(action[self.env.agent_selection])
obs_d = {}
rew_d = {}
done_d = {}
info_d = {}
while self.env.agents:
obs, rew, done, info = self.env.last()
a = self.env.agent_selection
obs_d[a] = obs
rew_d[a] = rew
done_d[a] = done
info_d[a] = info
if self.env.dones[self.env.agent_selection]:
self.env.step(None)
else:
break
# 2. Copy agents from environment
self.agents = self.aec_env.agents
all_done = not self.env.agents
done_d["__all__"] = all_done
# 3. Reset dictionaries
self._init_dicts()
# 4. Get initial observations
for agent in self.agents:
# For each agent get initial observations
self.obs[agent] = self.aec_env.observe(agent)
return self.obs
def step(self, action_dict):
"""
Executes input actions from RL agents and returns observations from
environment agents.
The returns are dicts mapping from agent_id strings to values. The
number of agents in the env can vary over time.
Returns
-------
obs (dict): New observations for each ready agent.
rewards (dict): Reward values for each ready agent. If the
episode is just started, the value will be None.
dones (dict): Done values for each ready agent. The special key
"__all__" (required) is used to indicate env termination.
infos (dict): Optional info values for each agent id.
"""
stepped_agents = set()
while (self.aec_env.agent_selection not in stepped_agents
and self.aec_env.dones[self.aec_env.agent_selection]):
agent = self.aec_env.agent_selection
self.aec_env.step(None)
stepped_agents.add(agent)
stepped_agents = set()
# print(action_dict)
while (self.aec_env.agent_selection not in stepped_agents):
agent = self.aec_env.agent_selection
assert agent in action_dict or self.aec_env.dones[agent], \
"Live environment agent is not in actions dictionary"
self.aec_env.step(action_dict[agent])
stepped_agents.add(agent)
# print(self.aec_env.dones)
# print(stepped_agents)
assert all(agent in stepped_agents or self.aec_env.dones[agent]
for agent in action_dict), \
"environment has a nontrivial ordering, and cannot be used with"\
" the POMGameEnv wrapper"
self.obs = {}
self.rewards = {}
self.dones = {}
self.infos = {}
# update self.agents
self.agents = list(action_dict.keys())
for agent in self.agents:
self.obs[agent] = self.aec_env.observe(agent)
self.dones[agent] = self.aec_env.dones[agent]
self.rewards[agent] = self.aec_env.rewards[agent]
self.infos[agent] = self.aec_env.infos[agent]
self.dones["__all__"] = all(self.aec_env.dones.values())
return self.obs, self.rewards, self.dones, self.infos
def render(self, mode="human"):
return self.aec_env.render(mode=mode)
return obs_d, rew_d, done_d, info_d
def close(self):
self.aec_env.close()
self.env.close()
def seed(self, seed=None):
self.aec_env.seed(seed)
self.env.seed(seed)
def with_agent_groups(self, groups, obs_space=None, act_space=None):
raise NotImplementedError
def render(self, mode="human"):
return self.env.render(mode)
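
To make the new reset/step contract concrete: reset() now returns only the observation of the agent whose turn it is, and step() expects an action for that agent and hands back dicts for whichever agent acts next. A small driver loop as a sketch (random actions stand in for a policy; prison_v2 is borrowed from the docs change above):

from pettingzoo.butterfly import prison_v2
from ray.rllib.env import PettingZooEnv

env = PettingZooEnv(prison_v2.env())
obs = env.reset()  # {agent_id: observation} for the first agent to act

done = False
while not done:
    # supply actions only for the agent(s) returned by the previous call
    actions = {agent_id: env.action_space.sample() for agent_id in obs}
    obs, rewards, dones, infos = env.step(actions)
    done = dones["__all__"]
env.close()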
class ParallelPettingZooEnv(MultiAgentEnv):
def __init__(self, env):
self.par_env = env
# agent idx list
self.agents = self.par_env.agents
self.agents = self.par_env.possible_agents
# Get dictionaries of obs_spaces and act_spaces
self.observation_spaces = self.par_env.observation_spaces
@@ -242,17 +171,8 @@ class ParallelPettingZooEnv(MultiAgentEnv):
return self.par_env.reset()
def step(self, action_dict):
aobs, arew, adones, ainfo = self.par_env.step(action_dict)
obss = {}
rews = {}
dones = {}
infos = {}
for agent in action_dict:
obss[agent] = aobs[agent]
rews[agent] = arew[agent]
dones[agent] = adones[agent]
infos[agent] = ainfo[agent]
dones["__all__"] = all(adones.values())
obss, rews, dones, infos = self.par_env.step(action_dict)
dones["__all__"] = all(dones.values())
return obss, rews, dones, infos
def close(self):

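The assertions in PettingZooEnv.__init__ above point to SuperSuit's padding wrappers when agents have heterogeneous spaces. A hedged sketch of that pattern; simple_world_comm_v2 is chosen here only because its agents have differing spaces, and the wrapper names below follow SuperSuit 2.x (the older supersuit.aec_wrappers path quoted in the error messages may only apply to earlier releases):

from pettingzoo.mpe import simple_world_comm_v2
from supersuit import pad_action_space_v0, pad_observations_v0
from ray.rllib.env import PettingZooEnv

env = simple_world_comm_v2.env()
env = pad_observations_v0(env)   # pad observations to one common shape
env = pad_action_space_v0(env)   # pad action spaces to one common space
env = PettingZooEnv(env)         # the homogeneity assertions now pass
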

@@ -1,12 +1,13 @@
from copy import deepcopy
from numpy import float32
import os
from pettingzoo.butterfly import pistonball_v0
from supersuit import normalize_obs_v0, dtype_v0, color_reduction_v0
import ray
from ray.rllib.agents.registry import get_agent_class
from ray.rllib.env import PettingZooEnv
from pettingzoo.butterfly import pistonball_v1
from ray.tune.registry import register_env
if __name__ == "__main__":
@@ -22,7 +23,7 @@ if __name__ == "__main__":
# function that outputs the environment you wish to register.
def env_creator(config):
env = pistonball_v0.env(local_ratio=config.get("local_ratio", 0.2))
env = pistonball_v1.env(local_ratio=config.get("local_ratio", 0.2))
env = dtype_v0(env, dtype=float32)
env = color_reduction_v0(env, mode="R")
env = normalize_obs_v0(env)
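
A sketch of how such an env_creator is then typically wired up; the registration name and trainer setup below are illustrative rather than a claim about the rest of this example file:

ray.init()
# register the preprocessed env under a name RLlib can look up
register_env("pistonball",
             lambda config: PettingZooEnv(env_creator(config)))
trainer = get_agent_class("PPO")(
    env="pistonball", config={"env_config": {"local_ratio": 0.2}})
trainer.train()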


@@ -6,7 +6,7 @@ from ray.tune.registry import register_env
from ray.rllib.env import PettingZooEnv
from ray.rllib.agents.registry import get_agent_class
from pettingzoo.mpe import simple_spread_v1
from pettingzoo.mpe import simple_spread_v2
class TestPettingZooEnv(unittest.TestCase):
@@ -17,13 +17,14 @@ class TestPettingZooEnv(unittest.TestCase):
ray.shutdown()
def test_pettingzoo_env(self):
register_env("prison", lambda _: PettingZooEnv(simple_spread_v1.env()))
register_env("simple_spread",
lambda _: PettingZooEnv(simple_spread_v2.env()))
agent_class = get_agent_class("PPO")
config = deepcopy(agent_class._default_config)
test_env = PettingZooEnv(simple_spread_v1.env())
test_env = PettingZooEnv(simple_spread_v2.env())
obs_space = test_env.observation_space
act_space = test_env.action_space
test_env.close()
@@ -43,7 +44,7 @@ class TestPettingZooEnv(unittest.TestCase):
config["horizon"] = 200 # After n steps, force reset simulation
config["no_done_at_end"] = False
agent = agent_class(env="prison", config=config)
agent = agent_class(env="simple_spread", config=config)
agent.train()
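
For reference, a test like this also has to map the PettingZoo agent ids onto an RLlib policy before the agent is built. A sketch of the usual shared-policy setup (the policy name is illustrative), added to config ahead of the agent_class(...) call:

# route every PettingZoo agent through one shared policy
config["multiagent"] = {
    "policies": {
        # (policy_cls=None -> default policy, obs_space, act_space, extra config)
        "shared_policy": (None, obs_space, act_space, {}),
    },
    "policy_mapping_fn": lambda agent_id: "shared_policy",
}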