From 171e02c6848d12a88c1af7a9ea81a19aebc1f3d9 Mon Sep 17 00:00:00 2001 From: Ian Rodney Date: Mon, 2 Nov 2020 11:22:09 -0800 Subject: [PATCH] [serve] re-enable serve-controller-crash test (#11579) --- python/ray/serve/BUILD | 20 +++++++++----------- python/ray/serve/controller.py | 3 ++- python/ray/serve/tests/conftest.py | 8 ++++++-- python/ray/serve/tests/test_api.py | 6 +++++- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/python/ray/serve/BUILD b/python/ray/serve/BUILD index 7c4d5b637..f7ff00823 100644 --- a/python/ray/serve/BUILD +++ b/python/ray/serve/BUILD @@ -6,8 +6,7 @@ py_library( ) serve_tests_srcs = glob(["tests/*.py"], - exclude=["tests/test_controller_crashes.py", - "tests/test_serve.py", + exclude=["tests/test_serve.py", ]) py_test( @@ -107,15 +106,14 @@ py_test( # Runs test_api and test_failure with injected failures in the controller. -# TODO(edoakes): reenable this once we're using GCS actor fault tolerance. -# py_test( - # name = "test_controller_crashes", - # size = "medium", - # srcs = glob(["tests/test_controller_crashes.py", - # "tests/test_api.py", - # "tests/test_failure.py"], - # exclude=["tests/test_serve.py"]), -# ) +py_test( + name = "test_controller_crashes", + size = "large", + srcs = glob(["tests/test_controller_crashes.py", + "tests/test_api.py", + "tests/test_failure.py"], + exclude=["tests/test_serve.py"]), +) py_test( name = "echo_full", diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index f38116396..66a3acf9c 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -254,7 +254,8 @@ class ServeController: self.kv_store.put(CHECKPOINT_KEY, checkpoint) logger.debug("Wrote checkpoint in {:.2f}".format(time.time() - start)) - if random.random() < _CRASH_AFTER_CHECKPOINT_PROBABILITY: + if random.random( + ) < _CRASH_AFTER_CHECKPOINT_PROBABILITY and self.detached: logger.warning("Intentionally crashing after checkpoint") os._exit(0) diff --git a/python/ray/serve/tests/conftest.py b/python/ray/serve/tests/conftest.py index be3e39019..0e46c3bbb 100644 --- a/python/ray/serve/tests/conftest.py +++ b/python/ray/serve/tests/conftest.py @@ -5,7 +5,7 @@ import pytest import ray from ray import serve -if os.environ.get("RAY_SERVE_INTENTIONALLY_CRASH", False): +if os.environ.get("RAY_SERVE_INTENTIONALLY_CRASH", False) == 1: serve.controller._CRASH_AFTER_CHECKPOINT_PROBABILITY = 0.5 @@ -13,10 +13,14 @@ if os.environ.get("RAY_SERVE_INTENTIONALLY_CRASH", False): def _shared_serve_instance(): # Uncomment the line below to turn on debug log for tests. # os.environ["SERVE_LOG_DEBUG"] = "1" + # Overriding task_retry_delay_ms to relaunch actors more quickly ray.init( num_cpus=36, _metrics_export_port=9999, - _system_config={"metrics_report_interval_ms": 1000}) + _system_config={ + "metrics_report_interval_ms": 1000, + "task_retry_delay_ms": 50 + }) yield serve.start(detached=True) diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py index da29f1bc3..bad546f53 100644 --- a/python/ray/serve/tests/test_api.py +++ b/python/ray/serve/tests/test_api.py @@ -774,7 +774,11 @@ def test_serve_metrics(serve_instance): ray.get([block_until_http_ready.remote(url) for _ in range(10)]) def verify_metrics(do_assert=False): - resp = requests.get("http://127.0.0.1:9999").text + try: + resp = requests.get("http://127.0.0.1:9999").text + # Requests will fail if we are crashing the controller + except requests.ConnectionError: + return False expected_metrics = [ # counter