[RLlib] De-flake 3 test cases; Fix config.simple_optimizer and SampleBatch.is_training warnings. (#17321)

Sven Mika, 2021-07-27 14:39:06 -04:00 (committed by GitHub)
parent e70d84953e
commit 90b21ce27e
8 changed files with 44 additions and 57 deletions

rllib/BUILD

@@ -157,20 +157,19 @@ py_test(
)
# CQL
-# Skipping due to high flakiness.
-#py_test(
-# name = "run_regression_tests_pendulum_cql_tf",
-# main = "tests/run_regression_tests.py",
-# tags = ["learning_tests_tf", "learning_tests_pendulum", "flaky"],
-# size = "large",
-# srcs = ["tests/run_regression_tests.py"],
-# # Include the zipped json data file as well.
-# data = [
-# "tuned_examples/cql/pendulum-cql.yaml",
-# "tests/data/pendulum/enormous.zip",
-# ],
-# args = ["--yaml-dir=tuned_examples/cql"]
-#)
+py_test(
+ name = "run_regression_tests_pendulum_cql_tf",
+ main = "tests/run_regression_tests.py",
+ tags = ["learning_tests_tf", "learning_tests_pendulum", "flaky"],
+ size = "large",
+ srcs = ["tests/run_regression_tests.py"],
+ # Include the zipped json data file as well.
+ data = [
+ "tuned_examples/cql/pendulum-cql.yaml",
+ "tests/data/pendulum/enormous.zip",
+ ],
+ args = ["--yaml-dir=tuned_examples/cql"]
+)
py_test(
name = "run_regression_tests_pendulum_cql_torch",
@@ -505,16 +504,15 @@ py_test(
args = ["--yaml-dir=tuned_examples/sac"]
)
-# Skipping due to high flakiness.
-#py_test(
-# name = "run_regression_tests_pendulum_sac_torch",
-# main = "tests/run_regression_tests.py",
-# tags = ["learning_tests_torch", "learning_tests_pendulum", "flaky"],
-# size = "large",
-# srcs = ["tests/run_regression_tests.py"],
-# data = ["tuned_examples/sac/pendulum-sac.yaml"],
-# args = ["--yaml-dir=tuned_examples/sac", "--framework=torch"]
-#)
+py_test(
+ name = "run_regression_tests_pendulum_sac_torch",
+ main = "tests/run_regression_tests.py",
+ tags = ["learning_tests_torch", "learning_tests_pendulum", "flaky"],
+ size = "large",
+ srcs = ["tests/run_regression_tests.py"],
+ data = ["tuned_examples/sac/pendulum-sac.yaml"],
+ args = ["--yaml-dir=tuned_examples/sac", "--framework=torch"]
+)
py_test(
name = "run_regression_tests_transformed_actions_pendulum_sac_tf",
@@ -1378,13 +1376,12 @@ py_test(
# Tag: utils
# --------------------------------------------------------------------
-# Skipping due to high flakiness.
-#py_test(
-# name = "test_curiosity",
-# tags = ["utils", "flaky"],
-# size = "large",
-# srcs = ["utils/exploration/tests/test_curiosity.py"]
-#)
+py_test(
+ name = "test_curiosity",
+ tags = ["utils", "flaky"],
+ size = "large",
+ srcs = ["utils/exploration/tests/test_curiosity.py"]
+)
py_test(
name = "test_explorations",

rllib/agents/ddpg/ddpg.py

@@ -5,7 +5,6 @@ from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
from ray.rllib.agents.ddpg.ddpg_tf_policy import DDPGTFPolicy
from ray.rllib.policy.policy import Policy
-from ray.rllib.utils.deprecation import DEPRECATED_VALUE
from ray.rllib.utils.typing import TrainerConfigDict
logger = logging.getLogger(__name__)
@@ -188,11 +187,6 @@ def validate_config(config: TrainerConfigDict) -> None:
"'complete_episodes'. Setting batch_mode=complete_episodes.")
config["batch_mode"] = "complete_episodes"
-if config["simple_optimizer"] != DEPRECATED_VALUE or \
-config["simple_optimizer"] is False:
-logger.warning("`simple_optimizer` must be True (or unset) for DDPG!")
-config["simple_optimizer"] = True
def get_policy_class(config: TrainerConfigDict) -> Optional[Type[Policy]]:
"""Policy class picker function. Class is chosen based on DL-framework.

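Note on the deleted check above: because the two clauses are joined by `or`, it fired for any explicitly set `simple_optimizer` value, even the required value True, which is why the warning was spurious. Below is a minimal plain-Python sketch of the removed condition's behavior; `DEPRECATED_VALUE` is assumed to be an integer sentinel (as in `ray.rllib.utils.deprecation` of this era), and `old_check_warns` is a hypothetical helper for illustration only. The identical block is removed from SAC in the next file.

    # Stand-in for ray.rllib.utils.deprecation.DEPRECATED_VALUE (assumed to be
    # an integer sentinel in this Ray version).
    DEPRECATED_VALUE = -1

    def old_check_warns(simple_optimizer) -> bool:
        # The condition removed by this commit from DDPG's and SAC's
        # validate_config().
        return simple_optimizer != DEPRECATED_VALUE or simple_optimizer is False

    print(old_check_warns(True))              # True  -> warned even for the "correct" value
    print(old_check_warns(False))             # True  -> warned
    print(old_check_warns(DEPRECATED_VALUE))  # False -> silent only when left unset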
rllib/agents/sac/sac.py

@@ -182,11 +182,6 @@ def validate_config(config: TrainerConfigDict) -> None:
if config["grad_clip"] is not None and config["grad_clip"] <= 0.0:
raise ValueError("`grad_clip` value must be > 0.0!")
-if config["simple_optimizer"] != DEPRECATED_VALUE or \
-config["simple_optimizer"] is False:
-logger.warning("`simple_optimizer` must be True (or unset) for SAC!")
-config["simple_optimizer"] = True
def get_policy_class(config: TrainerConfigDict) -> Optional[Type[Policy]]:
"""Policy class picker function. Class is chosen based on DL-framework.

rllib/agents/sac/tests/test_sac.py

@@ -428,9 +428,9 @@ class TestSAC(unittest.TestCase):
check(
tf_var,
np.transpose(torch_var.detach().cpu()),
-rtol=0.1)
+atol=0.002)
else:
-check(tf_var, torch_var, rtol=0.1)
+check(tf_var, torch_var, atol=0.002)
# And alpha.
check(policy.model.log_alpha,
tf_weights["default_policy/log_alpha"])
@@ -445,9 +445,10 @@ class TestSAC(unittest.TestCase):
check(
tf_var,
np.transpose(torch_var.detach().cpu()),
-rtol=0.1)
+atol=0.002)
else:
-check(tf_var, torch_var, rtol=0.1)
+check(tf_var, torch_var, atol=0.002)
+trainer.stop()
def _get_batch_helper(self, obs_size, actions, batch_size):
return SampleBatch({

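Note on the tolerance switch above: the compared weights can sit near zero, where a relative tolerance is disproportionately strict, so an absolute tolerance de-flakes the comparison. A minimal NumPy sketch of the difference (not RLlib code; RLlib's `check()` helper is assumed to forward these keyword arguments to NumPy-style closeness tests, and the values `a`, `b` are made up for illustration):

    import numpy as np

    a, b = 1e-5, 3e-5  # tiny absolute gap, large relative gap
    print(np.isclose(a, b, rtol=0.1, atol=0.0))    # False: fails a 10% relative check
    print(np.isclose(a, b, rtol=0.0, atol=0.002))  # True: passes a 0.002 absolute check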
rllib/policy/dynamic_tf_policy.py

@@ -241,7 +241,7 @@ class DynamicTFPolicy(TFPolicy):
True, (), name="is_exploring")
# Placeholder for `is_training` flag.
-self._input_dict["is_training"] = self._get_is_training_placeholder()
+self._input_dict.is_training = self._get_is_training_placeholder()
# Multi-GPU towers do not need any action computing/exploration
# graphs.
@@ -266,7 +266,7 @@ class DynamicTFPolicy(TFPolicy):
prev_reward_batch=self._input_dict.get(
SampleBatch.PREV_REWARDS),
explore=explore,
-is_training=self._input_dict["is_training"])
+is_training=self._input_dict.is_training)
# Distribution generation is customized, e.g., DQN, DDPG.
else:
if action_distribution_fn:
@@ -284,7 +284,7 @@ class DynamicTFPolicy(TFPolicy):
seq_lens=self._seq_lens,
explore=explore,
timestep=timestep,
-is_training=in_dict["is_training"])
+is_training=in_dict.is_training)
# Trying the old way (to stay backward compatible).
# TODO: Remove in future.
except TypeError as e:
@@ -301,7 +301,7 @@ class DynamicTFPolicy(TFPolicy):
prev_reward_batch=in_dict.get(
SampleBatch.PREV_REWARDS),
explore=explore,
-is_training=in_dict["is_training"])
+is_training=in_dict.is_training)
else:
raise e
@@ -379,6 +379,9 @@ class DynamicTFPolicy(TFPolicy):
self.config.get("num_multi_gpu_tower_stacks", 1))
]
+# Initialize again after loss and tower init.
+self.get_session().run(tf1.global_variables_initializer())
@override(TFPolicy)
@DeveloperAPI
def copy(self,
@@ -693,9 +696,6 @@ class DynamicTFPolicy(TFPolicy):
if (v not in self._state_inputs and v != self._seq_lens)
}
-# Initialize again after loss init.
-self.get_session().run(tf1.global_variables_initializer())
def _do_loss_init(self, train_batch: SampleBatch):
loss = self._loss_fn(self, self.model, self.dist_class, train_batch)
if self._stats_fn:

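Note on the `is_training` edits above: dict-key access of the flag on a `SampleBatch` emits the deprecation warning mentioned in the commit title, while attribute access does not. A minimal usage sketch, assuming this era's `SampleBatch` API (the attribute assignment mirrors what the changed lines above do):

    from ray.rllib.policy.sample_batch import SampleBatch

    batch = SampleBatch({"obs": [[0.0, 0.0]]})
    # Old pattern, produces the warning this commit fixes:
    #   flag = batch["is_training"]
    # New pattern, mirroring the change above:
    batch.is_training = True
    print(batch.is_training)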
rllib/tuned_examples/cql/pendulum-cql.yaml

@@ -6,7 +6,7 @@ pendulum-cql:
env: Pendulum-v0
run: CQL
stop:
-evaluation/episode_reward_mean: -600
+evaluation/episode_reward_mean: -700
timesteps_total: 100000
config:
# Works for both torch and tf.

rllib/tuned_examples/sac/pendulum-sac.yaml

@@ -4,7 +4,7 @@ pendulum-sac:
env: Pendulum-v0
run: SAC
stop:
-episode_reward_mean: -500
+episode_reward_mean: -600
timesteps_total: 10000
config:
# Works for both torch and tf.
@@ -33,6 +33,6 @@ pendulum-sac:
entropy_learning_rate: 0.0003
num_workers: 0
num_gpus: 0
-clip_actions: False
+clip_actions: false
normalize_actions: true
metrics_smoothing_episodes: 5

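Note on the two tuned-example changes above: the relaxed reward thresholds (-700 for CQL evaluation, -600 for SAC) are the targets the regression runs are expected to reach within their time-step budgets, so lowering them makes the runs pass more reliably. A rough, hand-written Python equivalent of the SAC example's stop block, assuming Ray 1.x's `ray.tune` API; `run_regression_tests.py` is assumed to feed the YAML into Tune in a comparable way:

    from ray import tune

    tune.run(
        "SAC",
        stop={
            "episode_reward_mean": -600,  # relaxed target from the YAML above
            "timesteps_total": 10000,
        },
        config={
            "env": "Pendulum-v0",
            "num_workers": 0,
            "num_gpus": 0,
        },
    )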
rllib/utils/exploration/tests/test_curiosity.py

@@ -153,7 +153,7 @@ class TestCuriosity(unittest.TestCase):
config["lr"] = 0.001
num_iterations = 10
-for fw in framework_iterator(config):
+for _ in framework_iterator(config, frameworks=("tf", "torch")):
# W/ Curiosity. Expect to learn something.
config["exploration_config"] = {
"type": "Curiosity",