Pettingzoo environment support (#9271)

* added pettingzoo wrapper env and example

* added docs, examples for pettingzoo env support

* fixed pettingzoo env flake8, added test

* fixed pettingzoo env import

* fixed pettingzoo env import

* fixed pettingzoo import issue

* fixed pettingzoo test

* fixed linting problem

* fixed bad quotes

* future proofed pettingzoo dependency

* fixed ray init in pettingzoo env

* lint

* manual lint

Co-authored-by: Eric Liang <ekhliang@gmail.com>
Benjamin Black 2020-07-06 22:32:26 -06:00 committed by GitHub
parent b42d6a1ddc
commit 1425cdf834
7 changed files with 375 additions and 1 deletions


@@ -232,7 +232,7 @@ install_dependencies() {
opencv-python-headless pyyaml pandas==1.0.5 requests feather-format lxml openpyxl xlrd \
py-spy pytest pytest-timeout networkx tabulate aiohttp uvicorn dataclasses pygments werkzeug \
kubernetes flask grpcio pytest-sugar pytest-rerunfailures pytest-asyncio scikit-learn==0.22.2 numba \
- Pillow prometheus_client boto3)
+ Pillow prometheus_client boto3 pettingzoo)
if [ "${OSTYPE}" != msys ]; then
# These packages aren't Windows-compatible
pip_packages+=(blist) # https://github.com/DanielStutzbach/blist/issues/81#issue-391460716


@@ -203,6 +203,29 @@ Here is a simple `example training script <https://github.com/ray-project/ray/bl
To scale to hundreds of agents, MultiAgentEnv batches policy evaluations across multiple agents internally. It can also be auto-vectorized by setting ``num_envs_per_worker > 1``.
PettingZoo Multi-Agent Environments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
`PettingZoo <https://github.com/PettingZoo-Team/PettingZoo>`__ is a repository of over 50 diverse multi-agent environments. However, the API is not directly compatible with RLlib, but it can be converted into an RLlib ``MultiAgentEnv`` as in this example:
.. code-block:: python
from ray.tune.registry import register_env
# import the pettingzoo environment
from pettingzoo.gamma import prison_v0
# import rllib pettingzoo interface
from ray.rllib.env import PettingZooEnv
# define how to make the environment. This way takes an optional environment config, num_floors
env_creator = lambda config: prison_v0.env(num_floors=config.get("num_floors", 4))
# register the environment under an RLlib name
register_env('prison', lambda config: PettingZooEnv(env_creator(config)))
# now you can use `prison` as an environment
# you can pass arguments to the environment creator with the env_config option in the config
config['env_config'] = {"num_floors": 5}
A more complete example is here: `pettingzoo_env.py <https://github.com/ray-project/ray/blob/master/rllib/examples/pettingzoo_env.py>`__
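Once the environment is registered as above, it can be trained on like any built-in environment. A minimal sketch with ``tune.run`` (the choice of PPO and the stopping criterion here are illustrative assumptions, not part of the original example):

.. code-block:: python

    from ray import tune

    # "PPO" and the 5-iteration stop criterion are illustrative choices
    tune.run(
        "PPO",
        stop={"training_iteration": 5},
        config={
            "env": "prison",
            "env_config": {"num_floors": 5},
            "num_workers": 1,
        },
    )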
Rock Paper Scissors Example
~~~~~~~~~~~~~~~~~~~~~~~~~~~


@@ -1310,6 +1310,13 @@ py_test(
args = ["TestSupportedMultiAgentOffPolicy"]
)
py_test(
name = "tests/test_pettingzoo_env",
tags = ["tests_dir", "tests_dir_S"],
size = "medium",
srcs = ["tests/test_pettingzoo_env.py"]
)
py_test(
name = "tests/test_supported_spaces",
tags = ["tests_dir", "tests_dir_S"],


@@ -2,6 +2,7 @@ from ray.rllib.env.base_env import BaseEnv
from ray.rllib.env.dm_env_wrapper import DMEnv
from ray.rllib.env.dm_control_wrapper import DMCEnv
from ray.rllib.env.unity3d_env import Unity3DEnv
from ray.rllib.env.pettingzoo_env import PettingZooEnv
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.env.external_env import ExternalEnv
from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv
@@ -20,6 +21,7 @@ __all__ = [
"DMEnv",
"DMCEnv",
"Unity3DEnv",
"PettingZooEnv",
"PolicyClient",
"PolicyServerInput",
]

rllib/env/pettingzoo_env.py (new file)

@@ -0,0 +1,207 @@
from .multi_agent_env import MultiAgentEnv
class PettingZooEnv(MultiAgentEnv):
"""An interface to the PettingZoo MARL environment library.
See: https://github.com/PettingZoo-Team/PettingZoo
Inherits from MultiAgentEnv and exposes a given AEC
(actor-environment-cycle) game from the PettingZoo project via the
MultiAgentEnv public API.
It reduces the class of AEC games to Partially Observable Markov (POM)
games by imposing the following important restrictions onto an AEC
environment:
1. Each agent steps in the order specified in the agents list (unless they are
done, in which case, they should be skipped).
2. Agents act simultaneously (-> No hard-turn games like chess).
3. All agents have the same action_spaces and observation_spaces.
Note: If, within your AEC game, agents do not have homogeneous action /
observation spaces, apply SuperSuit wrappers
for padding functionality: https://github.com/PettingZoo-Team/
SuperSuit#built-in-multi-agent-only-functions
4. Environments are positive sum games (-> Agents are expected to cooperate
to maximize reward). This isn't a hard restriction; it is just that
standard algorithms aren't expected to work well in highly competitive
games.
Examples:
>>> from pettingzoo.gamma import prison_v0
>>> env = PettingZooEnv(prison_v0.env())
>>> obs = env.reset()
>>> print(obs)
{
"0": [110, 119],
"1": [105, 102],
"2": [99, 95],
}
>>> obs, rewards, dones, infos = env.step(
action_dict={
"0": 1, "1": 0, "2": 2,
})
>>> print(rewards)
{
"0": 0,
"1": 1,
"2": 0,
}
>>> print(dones)
{
"0": False, # agent 0 is still running
"1": True, # agent 1 is done
"__all__": False, # the env is not done
}
>>> print(infos)
{
"0": {}, # info for agent 0
"1": {}, # info for agent 1
}
"""
def __init__(self, env):
"""
Parameters:
-----------
env: AECEnv object.
"""
self.aec_env = env
# agent idx list
self.agents = self.aec_env.agents
# Get dictionaries of obs_spaces and act_spaces
self.observation_spaces = self.aec_env.observation_spaces
self.action_spaces = self.aec_env.action_spaces
# Get first observation space, assuming all agents have equal space
self.observation_space = self.observation_spaces[self.agents[0]]
# Get first action space, assuming all agents have equal space
self.action_space = self.action_spaces[self.agents[0]]
assert all(obs_space == self.observation_space
for obs_space
in self.aec_env.observation_spaces.values()), \
"Observation spaces for all agents must be identical. Perhaps " \
"SuperSuit's pad_observations wrapper can help (useage: " \
"`supersuit.aec_wrappers.pad_observations(env)`"
assert all(act_space == self.action_space
for act_space in self.aec_env.action_spaces.values()), \
"Action spaces for all agents must be identical. Perhaps " \
"SuperSuit's pad_action_space wrapper can help (useage: " \
"`supersuit.aec_wrappers.pad_action_space(env)`"
self.rewards = {}
self.dones = {}
self.obs = {}
self.infos = {}
_ = self.reset()
def _init_dicts(self):
# initialize with zero
self.rewards = dict(zip(self.agents, [0 for _ in self.agents]))
# initialize with False
self.dones = dict(zip(self.agents, [False for _ in self.agents]))
self.dones["__all__"] = False
# initialize with empty info dicts
self.infos = dict(zip(self.agents, [{} for _ in self.agents]))
# initialize observations with None
self.obs = dict(zip(self.agents, [None for _ in self.agents]))
def reset(self):
"""
Resets the env and returns observations from ready agents.
Returns:
obs (dict): New observations for each ready agent.
"""
# 1. Reset environment; agent pointer points to first agent.
self.aec_env.reset(observe=False)
# 2. Copy agents from environment
self.agents = self.aec_env.agents
# 3. Reset dictionaries
self._init_dicts()
# 4. Get initial observations
for agent in self.agents:
# For each agent get initial observations
self.obs[agent] = self.aec_env.observe(agent)
return self.obs
def step(self, action_dict):
"""
Executes input actions from RL agents and returns observations from
environment agents.
The returns are dicts mapping from agent_id strings to values. The
number of agents in the env can vary over time.
Returns
-------
obs (dict): New observations for each ready agent.
rewards (dict): Reward values for each ready agent. If the
episode is just started, the value will be None.
dones (dict): Done values for each ready agent. The special key
"__all__" (required) is used to indicate env termination.
infos (dict): Optional info values for each agent id.
"""
env_done = False
# iterate over self.agents
for agent in self.agents:
# Execute only for agents that were not done in previous steps
if agent in action_dict.keys():
if not env_done:
assert agent == self.aec_env.agent_selection, \
f"environment has a nontrivial ordering, and " \
"cannot be used with the POMGameEnv wrapper\"" \
"nCurrent agent: {self.aec_env.agent_selection}" \
"\nExpected agent: {agent}"
# Execute agent action in environment
self.obs[agent] = self.aec_env.step(
action_dict[agent], observe=True)
if all(self.aec_env.dones.values()):
env_done = True
self.dones["__all__"] = True
else:
self.obs[agent] = self.aec_env.observe(agent)
# Get reward
self.rewards[agent] = self.aec_env.rewards[agent]
# Update done status
self.dones[agent] = self.aec_env.dones[agent]
# For agents with done = True, remove from dones, rewards and
# observations.
else:
del self.dones[agent]
del self.rewards[agent]
del self.obs[agent]
del self.infos[agent]
# update self.agents
self.agents = list(action_dict.keys())
# Update infos stepwise
for agent in self.agents:
self.infos[agent] = self.aec_env.infos[agent]
return self.obs, self.rewards, self.dones, self.infos
def render(self, mode="human"):
self.aec_env.render(mode=mode)
def close(self):
self.aec_env.close()
def with_agent_groups(self, groups, obs_space=None, act_space=None):
raise NotImplementedError
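As the assertions above note, agents with differing observation or action spaces must be padded before wrapping. A minimal sketch of that preprocessing, using the SuperSuit wrappers named in the assertion messages (prison_v0 is only an illustration here; its spaces are already homogeneous):

from pettingzoo.gamma import prison_v0
from supersuit.aec_wrappers import pad_observations, pad_action_space
from ray.rllib.env import PettingZooEnv

raw_env = prison_v0.env()
# pad observation and action spaces so the identical-space assertions pass
padded_env = pad_action_space(pad_observations(raw_env))
env = PettingZooEnv(padded_env)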


@@ -0,0 +1,82 @@
from copy import deepcopy
import ray
try:
from ray.rllib.agents.agent import get_agent_class
except ImportError:
from ray.rllib.agents.registry import get_agent_class
from ray.tune.registry import register_env
from ray.rllib.env import PettingZooEnv
from pettingzoo.gamma import prison_v0
from supersuit.aec_wrappers import normalize_obs, dtype, color_reduction
from numpy import float32
if __name__ == "__main__":
"""For this script, you need:
1. Algorithm name and corresponding module, e.g.: "PPO" + agents.ppo as agent
2. Name of the aec game you want to train on, e.g.: "prison".
3. num_cpus
4. num_rollouts
Requires SuperSuit
"""
alg_name = "PPO"
# function that outputs the environment you wish to register.
def env_creator(config):
env = prison_v0.env(num_floors=config.get("num_floors", 4))
env = dtype(env, dtype=float32)
env = color_reduction(env, mode="R")
env = normalize_obs(env)
return env
num_cpus = 1
num_rollouts = 2
# 1. Get the default training configuration and specify the POM game to load.
config = deepcopy(get_agent_class(alg_name)._default_config)
# 2. Set environment config. This will be passed to
# the env_creator function via the register env lambda below
config["env_config"] = {"num_floors": 5}
# 3. Register env
register_env("prison", lambda config: PettingZooEnv(env_creator(config)))
# 4. Extract space dimensions
test_env = PettingZooEnv(env_creator({}))
obs_space = test_env.observation_space
act_space = test_env.action_space
# 5. Configuration for multiagent setup with policy sharing:
config["multiagent"] = {
"policies": {
# the first tuple value is None -> uses default policy
"av": (None, obs_space, act_space, {}),
},
"policy_mapping_fn": lambda agent_id: "av"
}
config["log_level"] = "DEBUG"
config["num_workers"] = 1
# Fragment length, collected at once from each worker and for each agent!
config["sample_batch_size"] = 30
# Training batch size -> Fragments are concatenated up to this point.
config["train_batch_size"] = 200
# After n steps, force reset simulation
config["horizon"] = 200
# Default: False
config["no_done_at_end"] = False
# Info: If False, each agent's trajectory is expected to have
# at most one done=True, in the last step of the trajectory.
# If no_done_at_end = True, the environment is not reset
# when dones["__all__"] = True.
# 6. Initialize ray and trainer object
ray.init(num_cpus=num_cpus + 1)
trainer = get_agent_class(alg_name)(env="prison", config=config)
# 7. Train once
trainer.train()
test_env.reset()
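If more than a single call to trainer.train() is wanted, step 7 could be replaced by a short loop; a minimal sketch (the iteration count is arbitrary):

# train for a few iterations instead of once, reporting the mean episode reward
for i in range(5):
    result = trainer.train()
    print(f"iteration {i}: episode_reward_mean = {result['episode_reward_mean']}")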


@@ -0,0 +1,53 @@
import unittest
from copy import deepcopy
import ray
from ray.tune.registry import register_env
from ray.rllib.env import PettingZooEnv
from ray.rllib.agents.registry import get_agent_class
from pettingzoo.mpe import simple_spread_v0
class TestPettingZooEnv(unittest.TestCase):
def setUp(self) -> None:
ray.init()
def tearDown(self) -> None:
ray.shutdown()
def test_pettingzoo_env(self):
register_env("prison", lambda _: PettingZooEnv(simple_spread_v0.env()))
agent_class = get_agent_class("PPO")
config = deepcopy(agent_class._default_config)
test_env = PettingZooEnv(simple_spread_v0.env())
obs_space = test_env.observation_space
act_space = test_env.action_space
test_env.close()
config["multiagent"] = {
"policies": {
# the first tuple value is None -> uses default policy
"av": (None, obs_space, act_space, {}),
},
"policy_mapping_fn": lambda agent_id: "av"
}
config["log_level"] = "DEBUG"
config["num_workers"] = 0
config["rollout_fragment_length"] = 30
config["train_batch_size"] = 200
config["horizon"] = 200 # After n steps, force reset simulation
config["no_done_at_end"] = False
agent = agent_class(env="prison", config=config)
agent.train()
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))