[RLlib] MADDPG: Move into main algorithms folder and add proper unit and learning tests. (#24579)

Sven Mika 2022-05-24 12:53:53 +02:00 committed by GitHub
parent e7e75b46e1
commit e73c37cc17
11 changed files with 111 additions and 28 deletions


@@ -791,13 +791,13 @@ Tuned examples: `Two-step game <https://github.com/ray-project/ray/blob/master/r
Multi-Agent Deep Deterministic Policy Gradient (MADDPG)
-------------------------------------------------------
|tensorflow|
`[paper] <https://arxiv.org/abs/1706.02275>`__ `[implementation] <https://github.com/ray-project/ray/blob/master/rllib/agents/maddpg/maddpg.py>`__ MADDPG is a DDPG algorithm with a centralized/shared critic. The code here is adapted from https://github.com/openai/maddpg to integrate with RLlib's multi-agent APIs. See `justinkterry/maddpg-rllib <https://github.com/jkterry1/maddpg-rllib>`__ for examples and more information. Note that the implementation here is based on OpenAI's and is intended for use with the discrete MPE environments. Also note that this method is typically difficult to get working, even with all applicable optimizations for the environment applied; it should be viewed as research code, mainly useful for reproducing the results of the paper that introduced it.
`[paper] <https://arxiv.org/abs/1706.02275>`__ `[implementation] <https://github.com/ray-project/ray/blob/master/rllib/algorithms/maddpg/maddpg.py>`__ MADDPG is a DDPG algorithm with a centralized/shared critic. The code here is adapted from https://github.com/openai/maddpg to integrate with RLlib's multi-agent APIs. See `justinkterry/maddpg-rllib <https://github.com/jkterry1/maddpg-rllib>`__ for examples and more information. Note that the implementation here is based on OpenAI's and is intended for use with the discrete MPE environments. Also note that this method is typically difficult to get working, even with all applicable optimizations for the environment applied; it should be viewed as research code, mainly useful for reproducing the results of the paper that introduced it.
**MADDPG-specific configs** (see also `common configs <rllib-training.html#common-parameters>`__):
Tuned examples: `Multi-Agent Particle Environment <https://github.com/wsjeon/maddpg-rllib/tree/master/plots>`__, `Two-step game <https://github.com/ray-project/ray/blob/master/rllib/examples/two_step_game.py>`__
.. literalinclude:: ../../../rllib/agents/maddpg/maddpg.py
.. literalinclude:: ../../../rllib/algorithms/maddpg/maddpg.py
:language: python
:start-after: __sphinx_doc_begin__
:end-before: __sphinx_doc_end__
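For orientation, a minimal way to build the relocated trainer on the TwoStepGame environment, mirroring the compilation test added further down in this commit, might look roughly like this (a sketch, not a tuned setup; the explicit "framework" setting is an assumption here, the test itself uses framework_iterator):

import ray
import ray.rllib.algorithms.maddpg as maddpg
from ray.rllib.examples.env.two_step_game import TwoStepGame
from ray.rllib.policy.policy import PolicySpec

ray.init()

# Start from the algorithm's default config and point it at the two-step game.
config = maddpg.DEFAULT_CONFIG.copy()
config["env"] = TwoStepGame
config["env_config"] = {"actions_are_logits": True}
# MADDPG needs one policy per agent, each told which agent ID it controls.
config["multiagent"] = {
    "policies": {
        "pol1": PolicySpec(config={"agent_id": 0}),
        "pol2": PolicySpec(config={"agent_id": 1}),
    },
    "policy_mapping_fn": lambda aid, **kwargs: "pol2" if aid else "pol1",
}
# MADDPG only supports tf for now.
config["framework"] = "tf"

trainer = maddpg.MADDPGTrainer(config)
print(trainer.train())
trainer.stop()
ray.shutdown()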


@@ -379,6 +379,17 @@ py_test(
args = ["--yaml-dir=tuned_examples/impala"]
)
# MADDPG
py_test(
name = "learning_tests_two_step_game_maddpg",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "tf_only", "no_tf_eager_tracing", "learning_tests", "learning_tests_discrete"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/maddpg/two-step-game-maddpg.yaml"],
args = ["--yaml-dir=tuned_examples/maddpg", "--framework=tf"]
)
# Working, but takes a long time to learn (>15min).
# Removed due to Higher API conflicts with Pytorch-Import tests
## MB-MPO
@@ -729,7 +740,7 @@ py_test(
py_test(
name = "test_dreamer",
tags = ["team:ml", "trainers_dir"],
size = "small",
size = "medium",
srcs = ["algorithms/dreamer/tests/test_dreamer.py"]
)
@@ -775,6 +786,14 @@ py_test(
srcs = ["algorithms/marwil/tests/test_bc.py"]
)
# MADDPGTrainer
py_test(
name = "test_maddpg",
tags = ["team:ml", "trainers_dir"],
size = "medium",
srcs = ["algorithms/maddpg/tests/test_maddpg.py"]
)
# MAMLTrainer
py_test(
name = "test_maml",
@@ -2951,15 +2970,6 @@ py_test(
args = ["--as-test", "--mixed-torch-tf", "--stop-reward=450.0"]
)
py_test(
name = "examples/two_step_game_maddpg",
main = "examples/two_step_game.py",
tags = ["team:ml", "examples", "examples_T"],
size = "medium",
srcs = ["examples/two_step_game.py"],
args = ["--as-test", "--stop-reward=7.1", "--run=MADDPG"]
)
py_test(
name = "examples/two_step_game_pg_tf",
main = "examples/two_step_game.py",


@@ -105,7 +105,7 @@ Multi-agent:
- `Single-Player Alpha Zero (contrib/AlphaZero) <https://docs.ray.io/en/master/rllib/rllib-algorithms.html#alphazero>`__
- `Parameter Sharing <https://docs.ray.io/en/master/rllib/rllib-algorithms.html#parameter>`__
- `QMIX Monotonic Value Factorisation (QMIX, VDN, IQN) <https://docs.ray.io/en/master/rllib/rllib-algorithms.html#qmix>`__
- `Multi-Agent Deep Deterministic Policy Gradient (contrib/MADDPG) <https://docs.ray.io/en/master/rllib/rllib-algorithms.html#maddpg>`__
- `Multi-Agent Deep Deterministic Policy Gradient (MADDPG) <https://docs.ray.io/en/master/rllib/rllib-algorithms.html#maddpg>`__
- `Shared Critic Methods <https://docs.ray.io/en/master/rllib/rllib-algorithms.html#sc>`__
Others:


@@ -1,3 +1,19 @@
from ray.rllib.agents.maddpg.maddpg import MADDPGTrainer, DEFAULT_CONFIG
from ray.rllib.algorithms.maddpg.maddpg import (
MADDPGTrainer,
MADDPGTFPolicy,
DEFAULT_CONFIG,
)
__all__ = ["MADDPGTrainer", "DEFAULT_CONFIG"]
__all__ = [
"MADDPGTrainer",
"MADDPGTFPolicy",
"DEFAULT_CONFIG",
]
from ray.rllib.utils.deprecation import deprecation_warning
deprecation_warning(
"ray.rllib.agents.maddpg",
"ray.rllib.algorithms.maddpg",
error=False,
)


@@ -0,0 +1,3 @@
from ray.rllib.algorithms.maddpg.maddpg import MADDPGTrainer, DEFAULT_CONFIG
__all__ = ["MADDPGTrainer", "DEFAULT_CONFIG"]
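Both import paths resolve to the same classes for now; the old one additionally logs a deprecation warning via the shim shown earlier in this diff. Roughly:

# New, preferred location:
from ray.rllib.algorithms.maddpg import MADDPGTrainer, DEFAULT_CONFIG

# Old location still works but emits a deprecation warning pointing at
# ray.rllib.algorithms.maddpg:
from ray.rllib.agents.maddpg import MADDPGTrainer as OldMADDPGTrainer

assert OldMADDPGTrainer is MADDPGTrainer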


@@ -12,12 +12,11 @@ and the README for how to run with the multi-agent particle envs.
import logging
from typing import Type
from ray.rllib.agents.maddpg.maddpg_tf_policy import MADDPGTFPolicy
from ray.rllib.algorithms.dqn.dqn import DQNTrainer
from ray.rllib.agents.trainer import COMMON_CONFIG, with_common_config
from ray.rllib.algorithms.maddpg.maddpg_tf_policy import MADDPGTFPolicy
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch
from ray.rllib.utils import merge_dicts
from ray.rllib.utils.annotations import override
from ray.rllib.utils.typing import TrainerConfigDict
from ray.rllib.utils.deprecation import DEPRECATED_VALUE
@@ -77,6 +76,8 @@ DEFAULT_CONFIG = with_common_config({
"capacity": int(1e6),
# How many steps of the model to sample before learning starts.
"learning_starts": 1024 * 25,
# Force lockstep replay mode for MADDPG.
"replay_mode": "lockstep",
},
# Observation compression. Note that compression makes simulation slow in
# MPE.
@@ -86,10 +87,6 @@ DEFAULT_CONFIG = with_common_config({
# timesteps. Otherwise, the replay will proceed at the native ratio
# determined by (train_batch_size / rollout_fragment_length).
"training_intensity": None,
# Force lockstep replay mode for MADDPG.
"multiagent": merge_dicts(COMMON_CONFIG["multiagent"], {
"replay_mode": "lockstep",
}),
# === Optimization ===
# Learning rate for the critic (Q-function) optimizer.
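As a quick illustration of the "native ratio" mentioned in the training_intensity comment above (the numbers below are made up for the example, not defaults taken from this file):

# Purely illustrative values:
train_batch_size = 1024
rollout_fragment_length = 100

# With training_intensity=None, replay/training proceeds at this native ratio
# of trained timesteps to newly sampled timesteps:
native_ratio = train_batch_size / rollout_fragment_length
print(native_ratio)  # 10.24

# Setting training_intensity to a number instead fixes that ratio at the
# given value (see the truncated comment above).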


@@ -43,7 +43,7 @@ class MADDPGPostprocessing:
class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
def __init__(self, obs_space, act_space, config):
# _____ Initial Configuration
config = dict(ray.rllib.agents.maddpg.DEFAULT_CONFIG, **config)
config = dict(ray.rllib.algorithms.maddpg.maddpg.DEFAULT_CONFIG, **config)
self.config = config
self.global_step = tf1.train.get_or_create_global_step()
@@ -69,11 +69,11 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
)
obs_space_n = [
_make_continuous_space(space)
_make_continuous_space(space or obs_space)
for _, (_, space, _, _) in config["multiagent"]["policies"].items()
]
act_space_n = [
_make_continuous_space(space)
_make_continuous_space(space or act_space)
for _, (_, _, space, _) in config["multiagent"]["policies"].items()
]
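The space-or-default change above lets a policy listed in the multiagent config without explicit observation/action spaces (e.g. a bare PolicySpec, as in the new test below) fall back to this policy's own spaces when building the per-agent space lists. A hypothetical helper spelling out the fallback:

def _space_or_default(space, default):
    # Illustration (not part of the actual code) of the `space or obs_space`
    # pattern above: use the space given in the multiagent policy spec if any,
    # otherwise fall back to the space this policy itself was built with.
    return space or default

# E.g. for PolicySpec(config={"agent_id": 0}) with no spaces specified,
# `space` is None and the policy's own obs_space/act_space is used.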


@@ -0,0 +1,57 @@
import unittest
import ray
import ray.rllib.algorithms.maddpg as maddpg
from ray.rllib.examples.env.two_step_game import TwoStepGame
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.utils.test_utils import (
check_train_results,
framework_iterator,
)
class TestMADDPG(unittest.TestCase):
@classmethod
def setUpClass(cls) -> None:
ray.init()
@classmethod
def tearDownClass(cls) -> None:
ray.shutdown()
def test_maddpg_compilation(self):
"""Test whether an MADDPGTrainer can be built with all frameworks."""
config = maddpg.DEFAULT_CONFIG.copy()
config["env"] = TwoStepGame
config["env_config"] = {
"actions_are_logits": True,
}
config["multiagent"] = {
"policies": {
"pol1": PolicySpec(
config={"agent_id": 0},
),
"pol2": PolicySpec(
config={"agent_id": 1},
),
},
"policy_mapping_fn": (lambda aid, **kwargs: "pol2" if aid else "pol1"),
}
num_iterations = 1
# Only working for tf right now.
for _ in framework_iterator(config, frameworks="tf"):
trainer = maddpg.MADDPGTrainer(config)
for i in range(num_iterations):
results = trainer.train()
check_train_results(results)
print(results)
trainer.stop()
if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))


@@ -17,7 +17,7 @@ def _import_alphazero():
def _import_maddpg():
from ray.rllib.agents.maddpg import maddpg
from ray.rllib.algorithms.maddpg import maddpg
return maddpg.MADDPGTrainer, maddpg.DEFAULT_CONFIG
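This registry entry is what resolves the algorithm name string "MADDPG" (used, for example, in the tuned example's run: field below) to the relocated trainer class. Assuming get_trainer_class is still exposed from ray.rllib.agents.registry in this version, a quick sanity check would be:

from ray.rllib.agents.registry import get_trainer_class

# Resolves "MADDPG" through _import_maddpg() above and returns the trainer
# class from its new ray.rllib.algorithms.maddpg location.
trainer_cls = get_trainer_class("MADDPG")
print(trainer_cls.__name__)  # -> "MADDPGTrainer"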


@@ -1,8 +1,8 @@
two-step-game-qmix-with-qmix-mixer:
two-step-game-maddpg:
env: ray.rllib.examples.env.two_step_game.TwoStepGame
run: MADDPG
stop:
episode_reward_mean: 8.0
episode_reward_mean: 7.2
timesteps_total: 20000
config:
# MADDPG only supports tf for now.