[serve] re-enable serve-controller-crash test (#11579)

2025-03-06 18:41:40 -05:00 · 2020-11-02 11:22:09 -08:00 · 2020-11-02 11:22:09 -08:00 · 171e02c684
commit 171e02c684
parent 4a7d0e059d
4 changed files with 22 additions and 15 deletions
--- a/python/ray/serve/BUILD
+++ b/python/ray/serve/BUILD
@ -6,8 +6,7 @@ py_library(
 )

 serve_tests_srcs = glob(["tests/*.py"],
-     exclude=["tests/test_controller_crashes.py",
-              "tests/test_serve.py",
+     exclude=["tests/test_serve.py",
             ])

 py_test(
@ -107,15 +106,14 @@ py_test(


 # Runs test_api and test_failure with injected failures in the controller.
-# TODO(edoakes): reenable this once we're using GCS actor fault tolerance.
-# py_test(
-    # name = "test_controller_crashes",
-    # size = "medium",
-    # srcs = glob(["tests/test_controller_crashes.py",
-                 # "tests/test_api.py",
-                 # "tests/test_failure.py"],
-                # exclude=["tests/test_serve.py"]),
-# )
+py_test(
+    name = "test_controller_crashes",
+    size = "large",
+    srcs = glob(["tests/test_controller_crashes.py",
+                "tests/test_api.py",
+                "tests/test_failure.py"],
+                exclude=["tests/test_serve.py"]),
+)

 py_test(
    name = "echo_full",
--- a/python/ray/serve/controller.py
+++ b/python/ray/serve/controller.py
@ -254,7 +254,8 @@ class ServeController:
        self.kv_store.put(CHECKPOINT_KEY, checkpoint)
        logger.debug("Wrote checkpoint in {:.2f}".format(time.time() - start))

-        if random.random() < _CRASH_AFTER_CHECKPOINT_PROBABILITY:
+        if random.random(
+        ) < _CRASH_AFTER_CHECKPOINT_PROBABILITY and self.detached:
            logger.warning("Intentionally crashing after checkpoint")
            os._exit(0)

--- a/python/ray/serve/tests/conftest.py
+++ b/python/ray/serve/tests/conftest.py
@ -5,7 +5,7 @@ import pytest
 import ray
 from ray import serve

-if os.environ.get("RAY_SERVE_INTENTIONALLY_CRASH", False):
+if os.environ.get("RAY_SERVE_INTENTIONALLY_CRASH") == "1":  # env values are str
    serve.controller._CRASH_AFTER_CHECKPOINT_PROBABILITY = 0.5


@ -13,10 +13,14 @@ if os.environ.get("RAY_SERVE_INTENTIONALLY_CRASH", False):
 def _shared_serve_instance():
    # Uncomment the line below to turn on debug log for tests.
    # os.environ["SERVE_LOG_DEBUG"] = "1"
+    # Overriding task_retry_delay_ms to relaunch actors more quickly
    ray.init(
        num_cpus=36,
        _metrics_export_port=9999,
-        _system_config={"metrics_report_interval_ms": 1000})
+        _system_config={
+            "metrics_report_interval_ms": 1000,
+            "task_retry_delay_ms": 50
+        })
    yield serve.start(detached=True)


--- a/python/ray/serve/tests/test_api.py
+++ b/python/ray/serve/tests/test_api.py
@ -774,7 +774,11 @@ def test_serve_metrics(serve_instance):
    ray.get([block_until_http_ready.remote(url) for _ in range(10)])

    def verify_metrics(do_assert=False):
-        resp = requests.get("http://127.0.0.1:9999").text
+        try:
+            resp = requests.get("http://127.0.0.1:9999").text
+        # Requests will fail if we are crashing the controller
+        except requests.ConnectionError:
+            return False

        expected_metrics = [
            # counter