Updated pettingzoo env to accommodate API changes and fixes (#11873)
* Updated pettingzoo env to accommodate API changes and fixes
* fixed test failure
* fixed linting issue
* fixed test failure
This commit is contained in:
parent: a9cf0141a0
commit: 1999266bba
5 changed files with 89 additions and 167 deletions
@@ -217,11 +217,11 @@ PettingZoo Multi-Agent Environments
 from ray.tune.registry import register_env
 # import the pettingzoo environment
-from pettingzoo.butterfly import prison_v1
+from pettingzoo.butterfly import prison_v2
 # import rllib pettingzoo interface
 from ray.rllib.env import PettingZooEnv
 # define how to make the environment. This way takes an optional environment config, num_floors
-env_creator = lambda config: prison_v1.env(num_floors=config.get("num_floors", 4))
+env_creator = lambda config: prison_v2.env(num_floors=config.get("num_floors", 4))
 # register that way to make the environment under an rllib name
 register_env('prison', lambda config: PettingZooEnv(env_creator(config)))
 # now you can use `prison` as an environment
 
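For orientation: once registered this way, the name works like any built-in env id. A minimal sketch of driving it with Tune, assuming ray 1.x-era APIs; the stop criterion and config values are illustrative only:

    import ray
    from ray import tune
    from ray.tune.registry import register_env
    from ray.rllib.env import PettingZooEnv
    from pettingzoo.butterfly import prison_v2

    env_creator = lambda config: prison_v2.env(num_floors=config.get("num_floors", 4))
    register_env("prison", lambda config: PettingZooEnv(env_creator(config)))

    ray.init()
    # one PPO training iteration on the registered name
    tune.run("PPO",
             stop={"training_iteration": 1},
             config={"env": "prison", "env_config": {"num_floors": 4}})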
@@ -56,7 +56,7 @@ mypy
 networkx
 numba
 openpyxl
-pettingzoo>=1.3.2
+pettingzoo>=1.4.0
 Pillow; platform_system != "Windows"
 pygments
 pytest==5.4.3
rllib/env/pettingzoo_env.py (vendored, 236 lines changed)
@@ -10,70 +10,70 @@ class PettingZooEnv(MultiAgentEnv):
     (actor-environment-cycle) game from the PettingZoo project via the
     MultiAgentEnv public API.
 
-    It reduces the class of AEC games to Partially Observable Markov (POM)
-    games by imposing the following important restrictions onto an AEC
-    environment:
+    Note that the wrapper has some important limitations:
 
-    1. Each agent steps in order specified in agents list (unless they are
-       done, in which case, they should be skipped).
-    2. Agents act simultaneously (-> No hard-turn games like chess).
-    3. All agents have the same action_spaces and observation_spaces.
+    1. All agents have the same action_spaces and observation_spaces.
        Note: If, within your aec game, agents do not have homogeneous action /
        observation spaces, apply SuperSuit wrappers
        to apply padding functionality: https://github.com/PettingZoo-Team/
       SuperSuit#built-in-multi-agent-only-functions
-    4. Environments are positive sum games (-> Agents are expected to cooperate
+    2. Environments are positive sum games (-> Agents are expected to cooperate
        to maximize reward). This isn't a hard restriction, it's just that
        standard algorithms aren't expected to work well in highly competitive
        games.
 
     Examples:
-        >>> from pettingzoo.gamma import prison_v0
-        >>> env = POMGameEnv(env_creator=prison_v0})
+        >>> from pettingzoo.butterfly import prison_v2
+        >>> env = PettingZooEnv(prison_v2.env())
         >>> obs = env.reset()
         >>> print(obs)
-        {
-            "0": [110, 119],
-            "1": [105, 102],
-            "2": [99, 95],
-        }
-        >>> obs, rewards, dones, infos = env.step(
-            action_dict={
-                "0": 1, "1": 0, "2": 2,
-            })
+        # only returns the observation for the agent which should be stepping
+        {
+            'prisoner_0': array([[[0, 0, 0],
+                [0, 0, 0],
+                [0, 0, 0],
+                ...,
+                [0, 0, 0],
+                [0, 0, 0],
+                [0, 0, 0]]], dtype=uint8)
+        }
+        >>> obs, rewards, dones, infos = env.step({
+        ...     "prisoner_0": 1
+        ... })
+        # only returns the observation, reward, info, etc., for
+        # the agent whose turn is next.
+        >>> print(obs)
+        {
+            'prisoner_1': array([[[0, 0, 0],
+                [0, 0, 0],
+                [0, 0, 0],
+                ...,
+                [0, 0, 0],
+                [0, 0, 0],
+                [0, 0, 0]]], dtype=uint8)
+        }
         >>> print(rewards)
-        {
-            "0": 0,
-            "1": 1,
-            "2": 0,
-        }
+        {
+            'prisoner_1': 0
+        }
         >>> print(dones)
-        {
-            "0": False,        # agent 0 is still running
-            "1": True,         # agent 1 is done
-            "__all__": False,  # the env is not done
-        }
+        {
+            'prisoner_1': False, '__all__': False
+        }
         >>> print(infos)
-        {
-            "0": {},  # info for agent 0
-            "1": {},  # info for agent 1
-        }
+        {
+            'prisoner_1': {'map_tuple': (1, 0)}
+        }
     """
 
     def __init__(self, env):
-        """
-        Parameters:
-        -----------
-        env: AECenv object.
-        """
-        self.aec_env = env
+        self.env = env
         # agent idx list
-        self.agents = self.aec_env.agents
+        self.agents = self.env.possible_agents
 
         # Get dictionaries of obs_spaces and act_spaces
-        self.observation_spaces = self.aec_env.observation_spaces
-        self.action_spaces = self.aec_env.action_spaces
+        self.observation_spaces = self.env.observation_spaces
+        self.action_spaces = self.env.action_spaces
 
         # Get first observation space, assuming all agents have equal space
         self.observation_space = self.observation_spaces[self.agents[0]]
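The agents-list change above tracks the PettingZoo >= 1.4 API split that this commit accommodates: env.agents now shrinks to the currently live agents, while env.possible_agents stays fixed, which is what a space lookup at construction time needs. A hedged sketch of the distinction:

    from pettingzoo.butterfly import prison_v2

    env = prison_v2.env(num_floors=4)
    env.reset()
    print(env.possible_agents)  # fixed list of every agent that can ever appear
    print(env.agents)           # currently live agents; shrinks as agents finish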
@@ -83,135 +83,64 @@ class PettingZooEnv(MultiAgentEnv):
 
         assert all(obs_space == self.observation_space
                    for obs_space
-                   in self.aec_env.observation_spaces.values()), \
+                   in self.env.observation_spaces.values()), \
             "Observation spaces for all agents must be identical. Perhaps " \
             "SuperSuit's pad_observations wrapper can help (usage: " \
             "`supersuit.aec_wrappers.pad_observations(env)`"
 
         assert all(act_space == self.action_space
-                   for act_space in self.aec_env.action_spaces.values()), \
+                   for act_space in self.env.action_spaces.values()), \
             "Action spaces for all agents must be identical. Perhaps " \
             "SuperSuit's pad_action_space wrapper can help (usage: " \
             "`supersuit.aec_wrappers.pad_action_space(env)`"
 
-        self.rewards = {}
-        self.dones = {}
-        self.obs = {}
-        self.infos = {}
-
-        _ = self.reset()
-
-    def _init_dicts(self):
-        # initialize with zero
-        self.rewards = dict(zip(self.agents, [0 for _ in self.agents]))
-        # initialize with False
-        self.dones = dict(zip(self.agents, [False for _ in self.agents]))
-        self.dones["__all__"] = False
-
-        # initialize with None info object
-        self.infos = dict(zip(self.agents, [{} for _ in self.agents]))
-
-        # initialize empty observations
-        self.obs = dict(zip(self.agents, [None for _ in self.agents]))
+        self.reset()
 
     def reset(self):
-        """
-        Resets the env and returns observations from ready agents.
+        self.env.reset()
+        return {
+            self.env.agent_selection: self.env.observe(
+                self.env.agent_selection)
+        }
 
-        Returns:
-            obs (dict): New observations for each ready agent.
-        """
-        # 1. Reset environment; agent pointer points to first agent.
-        self.aec_env.reset()
+    def step(self, action):
+        self.env.step(action[self.env.agent_selection])
+        obs_d = {}
+        rew_d = {}
+        done_d = {}
+        info_d = {}
+        while self.env.agents:
+            obs, rew, done, info = self.env.last()
+            a = self.env.agent_selection
+            obs_d[a] = obs
+            rew_d[a] = rew
+            done_d[a] = done
+            info_d[a] = info
+            if self.env.dones[self.env.agent_selection]:
+                self.env.step(None)
+            else:
+                break
 
-        # 2. Copy agents from environment
-        self.agents = self.aec_env.agents
+        all_done = not self.env.agents
+        done_d["__all__"] = all_done
 
-        # 3. Reset dictionaries
-        self._init_dicts()
-
-        # 4. Get initial observations
-        for agent in self.agents:
-
-            # For each agent get initial observations
-            self.obs[agent] = self.aec_env.observe(agent)
-
-        return self.obs
-
-    def step(self, action_dict):
-        """
-        Executes input actions from RL agents and returns observations from
-        environment agents.
-
-        The returns are dicts mapping from agent_id strings to values. The
-        number of agents in the env can vary over time.
-
-        Returns
-        -------
-        obs (dict): New observations for each ready agent.
-        rewards (dict): Reward values for each ready agent. If the
-            episode is just started, the value will be None.
-        dones (dict): Done values for each ready agent. The special key
-            "__all__" (required) is used to indicate env termination.
-        infos (dict): Optional info values for each agent id.
-        """
-        stepped_agents = set()
-        while (self.aec_env.agent_selection not in stepped_agents
-               and self.aec_env.dones[self.aec_env.agent_selection]):
-            agent = self.aec_env.agent_selection
-            self.aec_env.step(None)
-            stepped_agents.add(agent)
-        stepped_agents = set()
-        # print(action_dict)
-        while (self.aec_env.agent_selection not in stepped_agents):
-            agent = self.aec_env.agent_selection
-            assert agent in action_dict or self.aec_env.dones[agent], \
-                "Live environment agent is not in actions dictionary"
-            self.aec_env.step(action_dict[agent])
-            stepped_agents.add(agent)
-        # print(self.aec_env.dones)
-        # print(stepped_agents)
-        assert all(agent in stepped_agents or self.aec_env.dones[agent]
-                   for agent in action_dict), \
-            "environment has a nontrivial ordering, and cannot be used with"\
-            " the POMGameEnv wrapper"
-
-        self.obs = {}
-        self.rewards = {}
-        self.dones = {}
-        self.infos = {}
-
-        # update self.agents
-        self.agents = list(action_dict.keys())
-
-        for agent in self.agents:
-            self.obs[agent] = self.aec_env.observe(agent)
-            self.dones[agent] = self.aec_env.dones[agent]
-            self.rewards[agent] = self.aec_env.rewards[agent]
-            self.infos[agent] = self.aec_env.infos[agent]
-
-        self.dones["__all__"] = all(self.aec_env.dones.values())
-
-        return self.obs, self.rewards, self.dones, self.infos
-
-    def render(self, mode="human"):
-        return self.aec_env.render(mode=mode)
+        return obs_d, rew_d, done_d, info_d
 
     def close(self):
-        self.aec_env.close()
+        self.env.close()
 
     def seed(self, seed=None):
-        self.aec_env.seed(seed)
+        self.env.seed(seed)
 
-    def with_agent_groups(self, groups, obs_space=None, act_space=None):
-        raise NotImplementedError
+    def render(self, mode="human"):
+        return self.env.render(mode)
 
 
 class ParallelPettingZooEnv(MultiAgentEnv):
     def __init__(self, env):
         self.par_env = env
         # agent idx list
-        self.agents = self.par_env.agents
+        self.agents = self.par_env.possible_agents
 
         # Get dictionaries of obs_spaces and act_spaces
         self.observation_spaces = self.par_env.observation_spaces
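The rewritten step() advances the AEC game one live agent at a time, auto-stepping already-done agents with None, so the caller only ever supplies the action for the agent whose turn it is. A hedged interaction sketch against the new wrapper (random actions stand in for a real policy):

    from ray.rllib.env.pettingzoo_env import PettingZooEnv
    from pettingzoo.butterfly import prison_v2

    env = PettingZooEnv(prison_v2.env(num_floors=4))
    obs = env.reset()      # a single entry: the first agent to act
    agent = next(iter(obs))
    while True:
        action = env.action_space.sample()  # stand-in for a real policy
        obs, rewards, dones, infos = env.step({agent: action})
        if dones["__all__"]:
            break
        # the agent owing the next action is the one not yet flagged done
        agent = next(a for a in obs if not dones[a])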
@@ -242,17 +171,8 @@ class ParallelPettingZooEnv(MultiAgentEnv):
         return self.par_env.reset()
 
     def step(self, action_dict):
-        aobs, arew, adones, ainfo = self.par_env.step(action_dict)
-        obss = {}
-        rews = {}
-        dones = {}
-        infos = {}
-        for agent in action_dict:
-            obss[agent] = aobs[agent]
-            rews[agent] = arew[agent]
-            dones[agent] = adones[agent]
-            infos[agent] = ainfo[agent]
-        dones["__all__"] = all(adones.values())
+        obss, rews, dones, infos = self.par_env.step(action_dict)
+        dones["__all__"] = all(dones.values())
         return obss, rews, dones, infos
 
     def close(self):
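This simplification works because PettingZoo's parallel API already returns per-agent dicts in the MultiAgentEnv shape, so the wrapper only needs to add the "__all__" key. A hedged usage sketch; the parallel_env() constructor and the action_space attribute (mirroring PettingZooEnv) are assumed here:

    from ray.rllib.env.pettingzoo_env import ParallelPettingZooEnv
    from pettingzoo.butterfly import pistonball_v1

    env = ParallelPettingZooEnv(pistonball_v1.parallel_env())
    obs = env.reset()  # {agent_id: obs} for every live agent at once
    actions = {agent: env.action_space.sample() for agent in obs}
    obs, rews, dones, infos = env.step(actions)
    print(dones["__all__"])  # True only once every agent is done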
@ -1,12 +1,13 @@
|
|||
from copy import deepcopy
|
||||
from numpy import float32
|
||||
import os
|
||||
from pettingzoo.butterfly import pistonball_v0
|
||||
from supersuit import normalize_obs_v0, dtype_v0, color_reduction_v0
|
||||
|
||||
import ray
|
||||
from ray.rllib.agents.registry import get_agent_class
|
||||
from ray.rllib.env import PettingZooEnv
|
||||
from pettingzoo.butterfly import pistonball_v1
|
||||
|
||||
from ray.tune.registry import register_env
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -22,7 +23,7 @@ if __name__ == "__main__":
|
|||
|
||||
# function that outputs the environment you wish to register.
|
||||
def env_creator(config):
|
||||
env = pistonball_v0.env(local_ratio=config.get("local_ratio", 0.2))
|
||||
env = pistonball_v1.env(local_ratio=config.get("local_ratio", 0.2))
|
||||
env = dtype_v0(env, dtype=float32)
|
||||
env = color_reduction_v0(env, mode="R")
|
||||
env = normalize_obs_v0(env)
|
||||
|
|
|
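To complete the example, the creator above is registered the same way as in the docs snippet earlier. A minimal sketch; the name "pistonball" is illustrative:

    from ray.tune.registry import register_env
    from ray.rllib.env import PettingZooEnv

    # env_creator is the function defined in the example above
    register_env("pistonball", lambda config: PettingZooEnv(env_creator(config)))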
@@ -6,7 +6,7 @@ from ray.tune.registry import register_env
 from ray.rllib.env import PettingZooEnv
 from ray.rllib.agents.registry import get_agent_class
 
-from pettingzoo.mpe import simple_spread_v1
+from pettingzoo.mpe import simple_spread_v2
 
 
 class TestPettingZooEnv(unittest.TestCase):
@@ -17,13 +17,14 @@ class TestPettingZooEnv(unittest.TestCase):
         ray.shutdown()
 
     def test_pettingzoo_env(self):
-        register_env("prison", lambda _: PettingZooEnv(simple_spread_v1.env()))
+        register_env("simple_spread",
+                     lambda _: PettingZooEnv(simple_spread_v2.env()))
 
         agent_class = get_agent_class("PPO")
 
         config = deepcopy(agent_class._default_config)
 
-        test_env = PettingZooEnv(simple_spread_v1.env())
+        test_env = PettingZooEnv(simple_spread_v2.env())
         obs_space = test_env.observation_space
         act_space = test_env.action_space
         test_env.close()
@@ -43,7 +44,7 @@ class TestPettingZooEnv(unittest.TestCase):
         config["horizon"] = 200  # After n steps, force reset simulation
         config["no_done_at_end"] = False
 
-        agent = agent_class(env="prison", config=config)
+        agent = agent_class(env="simple_spread", config=config)
         agent.train()
 
 