[serve] re-enable serve-controller-crash test (#11579)

This commit is contained in:
Ian Rodney 2020-11-02 11:22:09 -08:00 committed by GitHub
parent 4a7d0e059d
commit 171e02c684
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 22 additions and 15 deletions

View file

@ -6,8 +6,7 @@ py_library(
)
serve_tests_srcs = glob(["tests/*.py"],
exclude=["tests/test_controller_crashes.py",
"tests/test_serve.py",
exclude=["tests/test_serve.py",
])
py_test(
@ -107,15 +106,14 @@ py_test(
# Runs test_api and test_failure with injected failures in the controller.
# TODO(edoakes): reenable this once we're using GCS actor fault tolerance.
# py_test(
# name = "test_controller_crashes",
# size = "medium",
# srcs = glob(["tests/test_controller_crashes.py",
# "tests/test_api.py",
# "tests/test_failure.py"],
# exclude=["tests/test_serve.py"]),
# )
py_test(
name = "test_controller_crashes",
size = "large",
srcs = glob(["tests/test_controller_crashes.py",
"tests/test_api.py",
"tests/test_failure.py"],
exclude=["tests/test_serve.py"]),
)
py_test(
name = "echo_full",

View file

@ -254,7 +254,8 @@ class ServeController:
self.kv_store.put(CHECKPOINT_KEY, checkpoint)
logger.debug("Wrote checkpoint in {:.2f}".format(time.time() - start))
if random.random() < _CRASH_AFTER_CHECKPOINT_PROBABILITY:
if random.random(
) < _CRASH_AFTER_CHECKPOINT_PROBABILITY and self.detached:
logger.warning("Intentionally crashing after checkpoint")
os._exit(0)

View file

@ -5,7 +5,7 @@ import pytest
import ray
from ray import serve
# Opt-in crash injection for the serve tests: when the guard below holds, the
# controller's post-checkpoint crash probability is raised to 0.5.
# In this diff view the first `if` appears to be the pre-change line and the
# second `if` its replacement; both guard the single assignment that follows.
if os.environ.get("RAY_SERVE_INTENTIONALLY_CRASH", False):
# NOTE(review): os.environ values are always strings and the fallback here is
# False, so the `== 1` comparison below can never be true ("1" == 1 and
# False == 1 are both False). As written, the probability override is never
# applied; comparing against the string "1" would make
# RAY_SERVE_INTENTIONALLY_CRASH=1 take effect.
if os.environ.get("RAY_SERVE_INTENTIONALLY_CRASH", False) == 1:
serve.controller._CRASH_AFTER_CHECKPOINT_PROBABILITY = 0.5
@ -13,10 +13,14 @@ if os.environ.get("RAY_SERVE_INTENTIONALLY_CRASH", False):
def _shared_serve_instance():
# Uncomment the line below to turn on debug log for tests.
# os.environ["SERVE_LOG_DEBUG"] = "1"
# Overriding task_retry_delay_ms to relaunch actors more quickly
ray.init(
num_cpus=36,
_metrics_export_port=9999,
_system_config={"metrics_report_interval_ms": 1000})
_system_config={
"metrics_report_interval_ms": 1000,
"task_retry_delay_ms": 50
})
yield serve.start(detached=True)

View file

@ -774,7 +774,11 @@ def test_serve_metrics(serve_instance):
ray.get([block_until_http_ready.remote(url) for _ in range(10)])
def verify_metrics(do_assert=False):
resp = requests.get("http://127.0.0.1:9999").text
try:
resp = requests.get("http://127.0.0.1:9999").text
# Requests will fail if we are crashing the controller
except requests.ConnectionError:
return False
expected_metrics = [
# counter