[RLlib] Better error messages and hints; + failure-mode tests; (#18466)

2025-03-06 02:21:39 -05:00 · 2021-09-10 16:52:47 +02:00 · 2021-09-10 16:52:47 +02:00 · 3f89f35e52
commit 3f89f35e52
parent ead02b21b9
7 changed files with 175 additions and 41 deletions
--- a/rllib/agents/trainer.py
+++ b/rllib/agents/trainer.py
@ -29,6 +29,7 @@ from ray.rllib.utils.annotations import Deprecated, DeveloperAPI, override, \
    PublicAPI
 from ray.rllib.utils.debug import update_global_seed_if_necessary
 from ray.rllib.utils.deprecation import deprecation_warning, DEPRECATED_VALUE
+from ray.rllib.utils.error import EnvError, ERR_MSG_INVALID_ENV_DESCRIPTOR
 from ray.rllib.utils.framework import try_import_tf, TensorStructType
 from ray.rllib.utils.from_config import from_config
 from ray.rllib.utils.multi_agent import check_multi_agent
@ -698,8 +699,16 @@ class Trainer(Trainable):
                self.env_creator = _global_registry.get(ENV_CREATOR, env)
            # A class specifier.
            elif "." in env:
-                self.env_creator = \
-                    lambda env_context: from_config(env, env_context)
+
+                def env_creator_from_classpath(env_context):
+                    try:
+                        env_obj = from_config(env, env_context)
+                    except ValueError:
+                        raise EnvError(
+                            ERR_MSG_INVALID_ENV_DESCRIPTOR.format(env))
+                    return env_obj
+
+                self.env_creator = env_creator_from_classpath
            # Try gym/PyBullet/Vizdoom.
            else:
                self.env_creator = functools.partial(
--- a/rllib/env/utils.py
+++ b/rllib/env/utils.py
@ -4,6 +4,7 @@ import os
 from ray.rllib.env.env_context import EnvContext
 from ray.rllib.env.multi_agent_env import MultiAgentEnv
 from ray.rllib.utils import add_mixins
+from ray.rllib.utils.error import ERR_MSG_INVALID_ENV_DESCRIPTOR, EnvError


 def gym_env_creator(env_context: EnvContext, env_descriptor: str):
@ -50,25 +51,7 @@ def gym_env_creator(env_context: EnvContext, env_descriptor: str):
    try:
        return gym.make(env_descriptor, **env_context)
    except gym.error.Error:
-        error_msg = f"The env string you provided ('{env_descriptor}') is:" + \
-            """
-a) Not a supported/installed environment.
-b) Not a tune-registered environment creator.
-c) Not a valid env class string.
-
-Try one of the following:
-a) For Atari support: `pip install gym[atari] atari_py`.
-   For VizDoom support: Install VizDoom
-   (https://github.com/mwydmuch/ViZDoom/blob/master/doc/Building.md) and
-   `pip install vizdoomgym`.
-   For PyBullet support: `pip install pybullet`.
-b) To register your custom env, do `from ray import tune;
-   tune.register('[name]', lambda cfg: [return env obj from here using cfg])`.
-   Then in your config, do `config['env'] = [name]`.
-c) Make sure you provide a fully qualified classpath, e.g.:
-   `ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv`
-"""
-        raise gym.error.Error(error_msg)
+        raise EnvError(ERR_MSG_INVALID_ENV_DESCRIPTOR.format(env_descriptor))


 class VideoMonitor(wrappers.Monitor):
--- a/rllib/evaluation/rollout_worker.py
+++ b/rllib/evaluation/rollout_worker.py
@ -33,7 +33,8 @@ from ray.rllib.utils import force_list, merge_dicts
 from ray.rllib.utils.annotations import DeveloperAPI
 from ray.rllib.utils.debug import summarize, update_global_seed_if_necessary
 from ray.rllib.utils.deprecation import deprecation_warning
-from ray.rllib.utils.error import EnvError
+from ray.rllib.utils.error import EnvError, ERR_MSG_NO_GPUS, \
+    HOWTO_CHANGE_CONFIG
 from ray.rllib.utils.filter import get_filter, Filter
 from ray.rllib.utils.framework import try_import_tf, try_import_torch
 from ray.rllib.utils.sgd import do_minibatch_sgd
@ -556,16 +557,16 @@ class RolloutWorker(ParallelIteratorWorker):
                ray.worker._mode() != ray.worker.LOCAL_MODE and \
                not policy_config.get("_fake_gpus"):

+            devices = []
            if policy_config.get("framework") in ["tf2", "tf", "tfe"]:
-                if len(get_tf_gpu_devices()) < num_gpus:
-                    raise RuntimeError(
-                        f"Not enough GPUs found for num_gpus={num_gpus}! "
-                        f"Found only these IDs: {get_tf_gpu_devices()}.")
+                devices = get_tf_gpu_devices()
            elif policy_config.get("framework") == "torch":
-                if torch.cuda.device_count() < num_gpus:
-                    raise RuntimeError(
-                        f"Not enough GPUs found ({torch.cuda.device_count()}) "
-                        f"for num_gpus={num_gpus}!")
+                devices = list(range(torch.cuda.device_count()))
+
+            if len(devices) < num_gpus:
+                raise RuntimeError(
+                    ERR_MSG_NO_GPUS.format(len(devices), devices) +
+                    HOWTO_CHANGE_CONFIG)
        # Warn, if running in local-mode and actual GPUs (not faked) are
        # requested.
        elif ray.is_initialized() and \
--- a/rllib/tests/test_gpus.py
+++ b/rllib/tests/test_gpus.py
@ -51,8 +51,7 @@ class TestGPUs(unittest.TestCase):
                            print("direct RLlib")
                            self.assertRaisesRegex(
                                RuntimeError,
-                                "Not enough GPUs found.+for "
-                                f"num_gpus={num_gpus}",
+                                "Found 0 GPUs on your machine",
                                lambda: PGTrainer(config, env="CartPole-v0"),
                            )
                        # If actual_gpus >= num_gpus or faked,
--- a/rllib/utils/error.py
+++ b/rllib/utils/error.py
@ -11,3 +11,50 @@ class UnsupportedSpaceException(Exception):
 class EnvError(Exception):
    """Error if we encounter an error during RL environment validation."""
    pass
+
+
+# -------
+# Error messages
+# -------
+
+# Message explaining there are no GPUs available for the
+# num_gpus=n or num_gpus_per_worker=m settings.
+ERR_MSG_NO_GPUS = \
+    """Found {} GPUs on your machine (GPU devices found: {})! If your machine
+    does not have any GPUs, you should set the config keys `num_gpus` and
+    `num_gpus_per_worker` to 0 (they may be set to 1 by default for your
+    particular RL algorithm)."""
+
+ERR_MSG_INVALID_ENV_DESCRIPTOR = \
+    """The env string you provided ('{}') is:
+a) Not a supported/installed environment.
+b) Not a tune-registered environment creator.
+c) Not a valid env class string.
+
+Try one of the following:
+a) For Atari support: `pip install gym[atari] atari_py`.
+   For VizDoom support: Install VizDoom
+   (https://github.com/mwydmuch/ViZDoom/blob/master/doc/Building.md) and
+   `pip install vizdoomgym`.
+   For PyBullet support: `pip install pybullet`.
+b) To register your custom env, do `from ray import tune;
+   tune.register('[name]', lambda cfg: [return env obj from here using cfg])`.
+   Then in your config, do `config['env'] = [name]`.
+c) Make sure you provide a fully qualified classpath, e.g.:
+   `ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv`
+"""
+
+# -------
+# HOWTO_ strings can be added to any error/warning/info message
+# to explain to the user how to actually fix the encountered problem.
+# -------
+
+# HOWTO change the RLlib config, depending on how user runs the job.
+HOWTO_CHANGE_CONFIG = """
+To change the config for the `rllib train|rollout` command, use
+  `--config={'[key]': '[value]'}` on the command line.
+To change the config for `tune.run()` in a script: Modify the python dict
+  passed to `tune.run(config=[...])`.
+To change the config for an RLlib Trainer instance: Modify the python dict
+  passed to the Trainer's constructor, e.g. `PPOTrainer(config=[...])`.
+"""
--- a/rllib/utils/from_config.py
+++ b/rllib/utils/from_config.py
@ -10,8 +10,8 @@ from ray.rllib.utils import force_list, merge_dicts


 def from_config(cls, config=None, **kwargs):
-    """
-    Uses the given config to create an object.
+    """Uses the given config to create an object.
+
    If `config` is a dict, an optional "type" key can be used as a
    "constructor hint" to specify a certain class of the object.
    If `config` is not a dict, `config`'s value is used directly as this
@ -37,7 +37,7 @@ def from_config(cls, config=None, **kwargs):

    Args:
        cls (class): The class to build an instance for (from `config`).
-        config (Optional[dict,str]): The config dict or type-string or
+        config (Optional[dict, str]): The config dict or type-string or
            filename.

    Keyword Args:
@ -143,17 +143,27 @@ def from_config(cls, config=None, **kwargs):
            else:
                return obj

-            # Test for absolute module.class specifier.
+            # Test for absolute module.class path specifier.
            if type_.find(".") != -1:
                module_name, function_name = type_.rsplit(".", 1)
                try:
                    module = importlib.import_module(module_name)
                    constructor = getattr(module, function_name)
-                except (ModuleNotFoundError, ImportError):
+                # Module not found, or module lacks the requested attribute.
+                except (ModuleNotFoundError, ImportError, AttributeError):
                    pass
+
            # If constructor still not found, try attaching cls' module,
            # then look for type_ in there.
            if constructor is None:
+                if isinstance(cls, str):
+                    # Classpath could not be resolved: module missing, or it
+                    # lacks the specified c'tor/function.
+                    raise ValueError(
+                        f"Full classpath specifier ({type_}) must be a valid "
+                        "full [module].[class] string! E.g.: "
+                        "`my.cool.module.MyCoolClass`.")
+
                try:
                    module = importlib.import_module(cls.__module__)
                    constructor = getattr(module, type_)
@ -166,12 +176,12 @@ def from_config(cls, config=None, **kwargs):
                        constructor = getattr(module, type_)
                    except (ModuleNotFoundError, ImportError, AttributeError):
                        pass
+
            if constructor is None:
                raise ValueError(
-                    "String specifier ({}) in `from_config` must be a "
-                    "filename, a module+class, a class within '{}', or a key "
-                    "into {}.__type_registry__!".format(
-                        type_, cls.__module__, cls.__name__))
+                    f"String specifier ({type_}) must be a valid filename, "
+                    f"a [module].[class], a class within '{cls.__module__}', "
+                    f"or a key into {cls.__name__}.__type_registry__!")

    if not constructor:
        raise TypeError(
--- a/rllib/utils/tests/test_errors.py
+++ b/rllib/utils/tests/test_errors.py
@ -0,0 +1,85 @@
+import unittest
+
+import ray
+import ray.rllib.agents.impala as impala
+import ray.rllib.agents.pg as pg
+from ray.rllib.utils.error import EnvError
+from ray.rllib.utils.test_utils import framework_iterator
+
+
+class TestErrors(unittest.TestCase):
+    """Tests various failure-modes, making sure we produce meaningful errmsgs.
+    """
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        ray.init()
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        ray.shutdown()
+
+    def test_no_gpus_error(self):
+        """Tests errors related to no-GPU/too-few GPUs/etc.
+
+        This test only passes on a CPU-only machine (it expects 0 GPUs).
+        """
+
+        config = impala.DEFAULT_CONFIG.copy()
+        env = "CartPole-v0"
+
+        for _ in framework_iterator(config):
+            self.assertRaisesRegex(
+                RuntimeError,
+                # (?s): "dot matches all" (also newlines).
+                "(?s)Found 0 GPUs on your machine.+To change the config",
+                lambda: impala.ImpalaTrainer(config=config, env=env),
+            )
+
+    def test_bad_envs(self):
+        """Tests different "bad env" errors.
+        """
+        config = pg.DEFAULT_CONFIG.copy()
+        config["num_workers"] = 0
+
+        # Non existing/non-registered gym env string.
+        env = "Alien-Attack-v42"
+        for _ in framework_iterator(config):
+            self.assertRaisesRegex(
+                EnvError,
+                f"The env string you provided \\('{env}'\\) is",
+                lambda: pg.PGTrainer(config=config, env=env),
+            )
+
+        # Malformed gym env string (must have v\d at end).
+        env = "Alien-Attack-part-42"
+        for _ in framework_iterator(config):
+            self.assertRaisesRegex(
+                EnvError,
+                f"The env string you provided \\('{env}'\\) is",
+                lambda: pg.PGTrainer(config=config, env=env),
+            )
+
+        # Non-existing class in a full-class-path.
+        env = "ray.rllib.examples.env.random_env.RandomEnvThatDoesntExist"
+        for _ in framework_iterator(config):
+            self.assertRaisesRegex(
+                EnvError,
+                f"The env string you provided \\('{env}'\\) is",
+                lambda: pg.PGTrainer(config=config, env=env),
+            )
+
+        # Non-existing module inside a full-class-path.
+        env = "ray.rllib.examples.env.module_that_doesnt_exist.SomeEnv"
+        for _ in framework_iterator(config):
+            self.assertRaisesRegex(
+                EnvError,
+                f"The env string you provided \\('{env}'\\) is",
+                lambda: pg.PGTrainer(config=config, env=env),
+            )
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))