From 73f5c4039b2bc70b8590f6408ff6523b8d652459 Mon Sep 17 00:00:00 2001
From: Sven Mika
Date: Mon, 4 Oct 2021 13:23:51 +0200
Subject: [PATCH] [RLlib] Fix flakey test_a3c, test_maml, test_apex_dqn.
 (#19035)

---
 rllib/agents/dqn/r2d2.py   |  2 ++
 rllib/utils/multi_agent.py | 21 ++++++++++++++++-----
 rllib/utils/test_utils.py  | 37 ++++++++++++++++++++-----------------
 3 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/rllib/agents/dqn/r2d2.py b/rllib/agents/dqn/r2d2.py
index 19710f60e..d568272e9 100644
--- a/rllib/agents/dqn/r2d2.py
+++ b/rllib/agents/dqn/r2d2.py
@@ -68,6 +68,8 @@ DEFAULT_CONFIG = dqn.DQNTrainer.merge_trainer_configs(
         # === Hyperparameters from the paper [1] ===
         # Size of the replay buffer (in sequences, not timesteps).
         "buffer_size": 100000,
+        # If True, a prioritized replay buffer will be used.
+        "prioritized_replay": False,
         # Set automatically: The number of contiguous environment steps to
         # replay at once. Will be calculated via
         # model->max_seq_len + burn_in.
diff --git a/rllib/utils/multi_agent.py b/rllib/utils/multi_agent.py
index b23726cb3..50d5227c5 100644
--- a/rllib/utils/multi_agent.py
+++ b/rllib/utils/multi_agent.py
@@ -1,9 +1,13 @@
+from typing import Tuple
+
 from ray.rllib.policy.policy import PolicySpec
 from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
-from ray.rllib.utils.typing import PartialTrainerConfigDict
+from ray.rllib.utils.typing import MultiAgentPolicyConfigDict, \
+    PartialTrainerConfigDict
 
 
-def check_multi_agent(config: PartialTrainerConfigDict):
+def check_multi_agent(config: PartialTrainerConfigDict) -> \
+        Tuple[MultiAgentPolicyConfigDict, bool]:
     """Checks, whether a (partial) config defines a multi-agent setup.
 
     Args:
@@ -11,18 +15,25 @@ def check_multi_agent(config: PartialTrainerConfigDict):
             to check for multi-agent.
 
     Returns:
-        Tuple[MultiAgentPolicyConfigDict, bool]: The resulting (all
-            fixed) multi-agent policy dict and whether we have a
-            multi-agent setup or not.
+        The resulting (all fixed) multi-agent policy dict and whether we
+        have a multi-agent setup or not.
     """
     multiagent_config = config["multiagent"]
     policies = multiagent_config.get("policies")
+
+    # Nothing specified in config dict -> Assume simple single agent setup
+    # with DEFAULT_POLICY_ID as only policy.
     if not policies:
         policies = {DEFAULT_POLICY_ID}
+    # Policies given as set (of PolicyIDs) -> Setup each policy automatically
+    # via empty PolicySpec (will make RLlib infer obs- and action spaces
+    # as well as the Policy's class).
     if isinstance(policies, set):
         policies = multiagent_config["policies"] = {
             pid: PolicySpec() for pid in policies
         }
+    # Is this a multi-agent setup? True, iff more than one policy is defined
+    # or the only defined policy is not DEFAULT_POLICY_ID.
     is_multiagent = len(policies) > 1 or DEFAULT_POLICY_ID not in policies
     return policies, is_multiagent
diff --git a/rllib/utils/test_utils.py b/rllib/utils/test_utils.py
index 297d8e222..aab52b533 100644
--- a/rllib/utils/test_utils.py
+++ b/rllib/utils/test_utils.py
@@ -1,6 +1,6 @@
 from collections import Counter
 import copy
-import gym
+from gym.spaces import Box
 import logging
 import numpy as np
 import random
@@ -297,7 +297,7 @@ def check_compute_single_action(trainer,
         call_kwargs["full_fetch"] = full_fetch
 
         obs = obs_space.sample()
-        if isinstance(obs_space, gym.spaces.Box):
+        if isinstance(obs_space, Box):
             obs = np.clip(obs, -1.0, 1.0)
         state_in = None
         if include_state:
@@ -368,23 +368,24 @@ def check_compute_single_action(trainer,
             for si, so in zip(state_in, state_out):
                 check(list(si.shape), so.shape)
 
-        # Test whether unsquash/clipping works: Both should push the action
-        # to certainly be within the space's bounds.
-        if not action_space.contains(action):
-            if clip or unsquash or not isinstance(action_space,
-                                                  gym.spaces.Box):
+        # Test whether unsquash/clipping works on the Trainer's
+        # compute_single_action method: Both flags should force the action
+        # to be within the space's bounds.
+        if method_to_test == "single" and what == trainer:
+            if not action_space.contains(action) and \
+                    (clip or unsquash or not isinstance(action_space, Box)):
                 raise ValueError(
                     f"Returned action ({action}) of trainer/policy {what} "
                     f"not in Env's action_space {action_space}")
-        # We are operating in normalized space: Expect only smaller action
-        # values.
-        if isinstance(action_space, gym.spaces.Box) and not unsquash and \
-                what.config.get("normalize_actions") and \
-                np.any(np.abs(action) > 10.0):
-            raise ValueError(
-                f"Returned action ({action}) of trainer/policy {what} "
-                "should be in normalized space, but seems too large/small for "
-                "that!")
+            # We are operating in normalized space: Expect only smaller action
+            # values.
+            if isinstance(action_space, Box) and not unsquash and \
+                    what.config.get("normalize_actions") and \
+                    np.any(np.abs(action) > 3.0):
+                raise ValueError(
+                    f"Returned action ({action}) of trainer/policy {what} "
+                    "should be in normalized space, but seems too large/small "
+                    "for that!")
 
     # Loop through: Policy vs Trainer; Different API methods to calculate
     # actions; unsquash option; clip option; full fetch or not.
@@ -501,7 +502,9 @@ def check_train_results(train_results):
         # Make sure we have a default_policy key if we are not in a
        # multi-agent setup.
         if not is_multi_agent:
-            assert DEFAULT_POLICY_ID in learner_info, \
+            # APEX algos sometimes have an empty learner info dict (no metrics
+            # collected yet).
+            assert len(learner_info) == 0 or DEFAULT_POLICY_ID in learner_info, \
                 f"'{DEFAULT_POLICY_ID}' not found in " \
                 f"train_results['infos']['learner'] ({learner_info})!"
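Note (not part of the patch): a minimal usage sketch of the updated
check_multi_agent(), using made-up policy IDs "pol_a" and "pol_b", to
illustrate how a set of PolicyIDs is expanded into empty PolicySpecs and how
the multi-agent flag is derived.

# Hypothetical example, not from the patch above.
from ray.rllib.utils.multi_agent import check_multi_agent

# "policies" given as a plain set of PolicyIDs (IDs are made up here).
config = {"multiagent": {"policies": {"pol_a", "pol_b"}}}
policies, is_multiagent = check_multi_agent(config)
# The set was replaced in-place by a {pid: PolicySpec()} dict and,
# with two policies, the setup counts as multi-agent.
assert is_multiagent and set(policies) == {"pol_a", "pol_b"}

# Nothing specified: fall back to the single DEFAULT_POLICY_ID setup.
single = {"multiagent": {"policies": None}}
policies, is_multiagent = check_multi_agent(single)
assert not is_multiagent and len(policies) == 1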