Updated pettingzoo env to accommodate API changes and fixes (#11873)

* Updated pettingzoo env to accommodate API changes and fixes

* fixed test failure

* fixed linting issue

* fixed test failure
Benjamin Black 2020-11-09 19:09:49 -05:00 committed by GitHub
parent a9cf0141a0
commit 1999266bba
5 changed files with 89 additions and 167 deletions


@@ -217,11 +217,11 @@ PettingZoo Multi-Agent Environments
from ray.tune.registry import register_env
# import the pettingzoo environment
from pettingzoo.butterfly import prison_v1
from pettingzoo.butterfly import prison_v2
# import rllib pettingzoo interface
from ray.rllib.env import PettingZooEnv
# define how to make the environment. This way takes an optional environment config, num_floors
env_creator = lambda config: prison_v1.env(num_floors=config.get("num_floors", 4))
env_creator = lambda config: prison_v2.env(num_floors=config.get("num_floors", 4))
# register that way to make the environment under an rllib name
register_env('prison', lambda config: PettingZooEnv(env_creator(config)))
# now you can use `prison` as an environment
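
For context, the registered name can then be handed straight to a trainer. A minimal sketch, not part of this change, assuming the `prison` registration above and Ray Tune's standard `tune.run` API:

import ray
from ray import tune

ray.init()
# "prison" is the name registered above; env_config is forwarded to env_creator,
# which passes num_floors on to prison_v2.env()
tune.run(
    "PPO",
    stop={"training_iteration": 5},
    config={
        "env": "prison",
        "env_config": {"num_floors": 4},
        "num_workers": 1,
    },
)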


@@ -56,7 +56,7 @@ mypy
networkx
numba
openpyxl
pettingzoo>=1.3.2
pettingzoo>=1.4.0
Pillow; platform_system != "Windows"
pygments
pytest==5.4.3


@@ -10,70 +10,70 @@ class PettingZooEnv(MultiAgentEnv):
(agent-environment-cycle) game from the PettingZoo project via the
MultiAgentEnv public API.
It reduces the class of AEC games to Partially Observable Markov (POM)
games by imposing the following important restrictions onto an AEC
environment:
Note that the wrapper has some important limitations:
1. Each agent steps in order specified in agents list (unless they are
done, in which case, they should be skipped).
2. Agents act simultaneously (-> No hard-turn games like chess).
3. All agents have the same action_spaces and observation_spaces.
1. All agents have the same action_spaces and observation_spaces.
Note: If, within your aec game, agents do not have homogeneous action /
observation spaces, apply SuperSuit wrappers
to apply padding functionality: https://github.com/PettingZoo-Team/
SuperSuit#built-in-multi-agent-only-functions
4. Environments are positive sum games (-> Agents are expected to cooperate
2. Environments are positive sum games (-> Agents are expected to cooperate
to maximize reward). This isn't a hard restriction, it's just that
standard algorithms aren't expected to work well in highly competitive
games.
Examples:
>>> from pettingzoo.gamma import prison_v0
>>> env = POMGameEnv(env_creator=prison_v0})
>>> from pettingzoo.butterfly import prison_v2
>>> env = PettingZooEnv(prison_v2.env())
>>> obs = env.reset()
>>> print(obs)
{
"0": [110, 119],
"1": [105, 102],
"2": [99, 95],
}
>>> obs, rewards, dones, infos = env.step(
action_dict={
"0": 1, "1": 0, "2": 2,
})
# only returns the observation for the agent which should be stepping
{
'prisoner_0': array([[[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
...,
[0, 0, 0],
[0, 0, 0],
[0, 0, 0]]], dtype=uint8)
}
>>> obs, rewards, dones, infos = env.step({
... "prisoner_0": 1
... })
# only returns the observation, reward, info, etc., for
# the agent whose turn is next.
>>> print(obs)
{
'prisoner_1': array([[[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
...,
[0, 0, 0],
[0, 0, 0],
[0, 0, 0]]], dtype=uint8)
}
>>> print(rewards)
{
"0": 0,
"1": 1,
"2": 0,
}
{
'prisoner_1': 0
}
>>> print(dones)
{
"0": False, # agent 0 is still running
"1": True, # agent 1 is done
"__all__": False, # the env is not done
}
{
'prisoner_1': False, '__all__': False
}
>>> print(infos)
{
"0": {}, # info for agent 0
"1": {}, # info for agent 1
}
{
'prisoner_1': {'map_tuple': (1, 0)}
}
"""
def __init__(self, env):
"""
Parameters:
-----------
env: AECenv object.
"""
self.aec_env = env
self.env = env
# agent idx list
self.agents = self.aec_env.agents
self.agents = self.env.possible_agents
# Get dictionaries of obs_spaces and act_spaces
self.observation_spaces = self.aec_env.observation_spaces
self.action_spaces = self.aec_env.action_spaces
self.observation_spaces = self.env.observation_spaces
self.action_spaces = self.env.action_spaces
# Get first observation space, assuming all agents have equal space
self.observation_space = self.observation_spaces[self.agents[0]]
@@ -83,135 +83,64 @@ class PettingZooEnv(MultiAgentEnv):
assert all(obs_space == self.observation_space
for obs_space
in self.aec_env.observation_spaces.values()), \
in self.env.observation_spaces.values()), \
"Observation spaces for all agents must be identical. Perhaps " \
"SuperSuit's pad_observations wrapper can help (usage: " \
"`supersuit.aec_wrappers.pad_observations(env)`)"
assert all(act_space == self.action_space
for act_space in self.aec_env.action_spaces.values()), \
for act_space in self.env.action_spaces.values()), \
"Action spaces for all agents must be identical. Perhaps " \
"SuperSuit's pad_action_space wrapper can help (usage: " \
"`supersuit.aec_wrappers.pad_action_space(env)`)"
self.rewards = {}
self.dones = {}
self.obs = {}
self.infos = {}
_ = self.reset()
def _init_dicts(self):
# initialize with zero
self.rewards = dict(zip(self.agents, [0 for _ in self.agents]))
# initialize with False
self.dones = dict(zip(self.agents, [False for _ in self.agents]))
self.dones["__all__"] = False
# initialize with None info object
self.infos = dict(zip(self.agents, [{} for _ in self.agents]))
# initialize empty observations
self.obs = dict(zip(self.agents, [None for _ in self.agents]))
self.reset()
def reset(self):
"""
Resets the env and returns observations from ready agents.
self.env.reset()
return {
self.env.agent_selection: self.env.observe(
self.env.agent_selection)
}
Returns:
obs (dict): New observations for each ready agent.
"""
# 1. Reset environment; agent pointer points to first agent.
self.aec_env.reset()
def step(self, action):
self.env.step(action[self.env.agent_selection])
obs_d = {}
rew_d = {}
done_d = {}
info_d = {}
while self.env.agents:
obs, rew, done, info = self.env.last()
a = self.env.agent_selection
obs_d[a] = obs
rew_d[a] = rew
done_d[a] = done
info_d[a] = info
if self.env.dones[self.env.agent_selection]:
self.env.step(None)
else:
break
# 2. Copy agents from environment
self.agents = self.aec_env.agents
all_done = not self.env.agents
done_d["__all__"] = all_done
# 3. Reset dictionaries
self._init_dicts()
# 4. Get initial observations
for agent in self.agents:
# For each agent get initial observations
self.obs[agent] = self.aec_env.observe(agent)
return self.obs
def step(self, action_dict):
"""
Executes input actions from RL agents and returns observations from
environment agents.
The returns are dicts mapping from agent_id strings to values. The
number of agents in the env can vary over time.
Returns
-------
obs (dict): New observations for each ready agent.
rewards (dict): Reward values for each ready agent. If the
episode is just started, the value will be None.
dones (dict): Done values for each ready agent. The special key
"__all__" (required) is used to indicate env termination.
infos (dict): Optional info values for each agent id.
"""
stepped_agents = set()
while (self.aec_env.agent_selection not in stepped_agents
and self.aec_env.dones[self.aec_env.agent_selection]):
agent = self.aec_env.agent_selection
self.aec_env.step(None)
stepped_agents.add(agent)
stepped_agents = set()
# print(action_dict)
while (self.aec_env.agent_selection not in stepped_agents):
agent = self.aec_env.agent_selection
assert agent in action_dict or self.aec_env.dones[agent], \
"Live environment agent is not in actions dictionary"
self.aec_env.step(action_dict[agent])
stepped_agents.add(agent)
# print(self.aec_env.dones)
# print(stepped_agents)
assert all(agent in stepped_agents or self.aec_env.dones[agent]
for agent in action_dict), \
"environment has a nontrivial ordering, and cannot be used with"\
" the POMGameEnv wrapper"
self.obs = {}
self.rewards = {}
self.dones = {}
self.infos = {}
# update self.agents
self.agents = list(action_dict.keys())
for agent in self.agents:
self.obs[agent] = self.aec_env.observe(agent)
self.dones[agent] = self.aec_env.dones[agent]
self.rewards[agent] = self.aec_env.rewards[agent]
self.infos[agent] = self.aec_env.infos[agent]
self.dones["__all__"] = all(self.aec_env.dones.values())
return self.obs, self.rewards, self.dones, self.infos
def render(self, mode="human"):
return self.aec_env.render(mode=mode)
return obs_d, rew_d, done_d, info_d
def close(self):
self.aec_env.close()
self.env.close()
def seed(self, seed=None):
self.aec_env.seed(seed)
self.env.seed(seed)
def with_agent_groups(self, groups, obs_space=None, act_space=None):
raise NotImplementedError
def render(self, mode="human"):
return self.env.render(mode)
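
To make the new reset/step contract concrete: reset() now returns only the observation of the agent whose turn it is, and step() expects an action for that agent and hands back dicts for whichever agent acts next. A small driver loop as a sketch (random actions stand in for a policy; prison_v2 is borrowed from the docs change above):

from pettingzoo.butterfly import prison_v2
from ray.rllib.env import PettingZooEnv

env = PettingZooEnv(prison_v2.env())
obs = env.reset()  # {agent_id: observation} for the first agent to act

done = False
while not done:
    # supply actions only for the agent(s) returned by the previous call
    actions = {agent_id: env.action_space.sample() for agent_id in obs}
    obs, rewards, dones, infos = env.step(actions)
    done = dones["__all__"]
env.close()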
class ParallelPettingZooEnv(MultiAgentEnv):
def __init__(self, env):
self.par_env = env
# agent idx list
self.agents = self.par_env.agents
self.agents = self.par_env.possible_agents
# Get dictionaries of obs_spaces and act_spaces
self.observation_spaces = self.par_env.observation_spaces
@@ -242,17 +171,8 @@ class ParallelPettingZooEnv(MultiAgentEnv):
return self.par_env.reset()
def step(self, action_dict):
aobs, arew, adones, ainfo = self.par_env.step(action_dict)
obss = {}
rews = {}
dones = {}
infos = {}
for agent in action_dict:
obss[agent] = aobs[agent]
rews[agent] = arew[agent]
dones[agent] = adones[agent]
infos[agent] = ainfo[agent]
dones["__all__"] = all(adones.values())
obss, rews, dones, infos = self.par_env.step(action_dict)
dones["__all__"] = all(dones.values())
return obss, rews, dones, infos
def close(self):

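The assertions in PettingZooEnv.__init__ above point to SuperSuit's padding wrappers when agents have heterogeneous spaces. A hedged sketch of that pattern; simple_world_comm_v2 is chosen here only because its agents have differing spaces, and the wrapper names below follow SuperSuit 2.x (the older supersuit.aec_wrappers path quoted in the error messages may only apply to earlier releases):

from pettingzoo.mpe import simple_world_comm_v2
from supersuit import pad_action_space_v0, pad_observations_v0
from ray.rllib.env import PettingZooEnv

env = simple_world_comm_v2.env()
env = pad_observations_v0(env)   # pad observations to one common shape
env = pad_action_space_v0(env)   # pad action spaces to one common space
env = PettingZooEnv(env)         # the homogeneity assertions now pass
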

@@ -1,12 +1,13 @@
from copy import deepcopy
from numpy import float32
import os
from pettingzoo.butterfly import pistonball_v0
from supersuit import normalize_obs_v0, dtype_v0, color_reduction_v0
import ray
from ray.rllib.agents.registry import get_agent_class
from ray.rllib.env import PettingZooEnv
from pettingzoo.butterfly import pistonball_v1
from ray.tune.registry import register_env
if __name__ == "__main__":
@@ -22,7 +23,7 @@ if __name__ == "__main__":
# function that outputs the environment you wish to register.
def env_creator(config):
env = pistonball_v0.env(local_ratio=config.get("local_ratio", 0.2))
env = pistonball_v1.env(local_ratio=config.get("local_ratio", 0.2))
env = dtype_v0(env, dtype=float32)
env = color_reduction_v0(env, mode="R")
env = normalize_obs_v0(env)
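
A sketch of how such an env_creator is then typically wired up; the registration name and trainer setup below are illustrative rather than a claim about the rest of this example file:

ray.init()
# register the preprocessed env under a name RLlib can look up
register_env("pistonball",
             lambda config: PettingZooEnv(env_creator(config)))
trainer = get_agent_class("PPO")(
    env="pistonball", config={"env_config": {"local_ratio": 0.2}})
trainer.train()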


@@ -6,7 +6,7 @@ from ray.tune.registry import register_env
from ray.rllib.env import PettingZooEnv
from ray.rllib.agents.registry import get_agent_class
from pettingzoo.mpe import simple_spread_v1
from pettingzoo.mpe import simple_spread_v2
class TestPettingZooEnv(unittest.TestCase):
@@ -17,13 +17,14 @@ class TestPettingZooEnv(unittest.TestCase):
ray.shutdown()
def test_pettingzoo_env(self):
register_env("prison", lambda _: PettingZooEnv(simple_spread_v1.env()))
register_env("simple_spread",
lambda _: PettingZooEnv(simple_spread_v2.env()))
agent_class = get_agent_class("PPO")
config = deepcopy(agent_class._default_config)
test_env = PettingZooEnv(simple_spread_v1.env())
test_env = PettingZooEnv(simple_spread_v2.env())
obs_space = test_env.observation_space
act_space = test_env.action_space
test_env.close()
@@ -43,7 +44,7 @@ class TestPettingZooEnv(unittest.TestCase):
config["horizon"] = 200 # After n steps, force reset simulation
config["no_done_at_end"] = False
agent = agent_class(env="prison", config=config)
agent = agent_class(env="simple_spread", config=config)
agent.train()
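
For reference, a test like this also has to map the PettingZoo agent ids onto an RLlib policy before the agent is built. A sketch of the usual shared-policy setup (the policy name is illustrative), added to config ahead of the agent_class(...) call:

# route every PettingZoo agent through one shared policy
config["multiagent"] = {
    "policies": {
        # (policy_cls=None -> default policy, obs_space, act_space, extra config)
        "shared_policy": (None, obs_space, act_space, {}),
    },
    "policy_mapping_fn": lambda agent_id: "shared_policy",
}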