mirror of
https://github.com/vale981/ray
synced 2025-03-06 18:41:40 -05:00
[serve] re-enable serve-controller-crash test (#11579)
This commit is contained in:
parent
4a7d0e059d
commit
171e02c684
4 changed files with 22 additions and 15 deletions
|
@ -6,8 +6,7 @@ py_library(
|
|||
)
|
||||
|
||||
serve_tests_srcs = glob(["tests/*.py"],
|
||||
exclude=["tests/test_controller_crashes.py",
|
||||
"tests/test_serve.py",
|
||||
exclude=["tests/test_serve.py",
|
||||
])
|
||||
|
||||
py_test(
|
||||
|
@ -107,15 +106,14 @@ py_test(
|
|||
|
||||
|
||||
# Runs test_api and test_failure with injected failures in the controller.
|
||||
# TODO(edoakes): reenable this once we're using GCS actor fault tolerance.
|
||||
# py_test(
|
||||
# name = "test_controller_crashes",
|
||||
# size = "medium",
|
||||
# srcs = glob(["tests/test_controller_crashes.py",
|
||||
# "tests/test_api.py",
|
||||
# "tests/test_failure.py"],
|
||||
# exclude=["tests/test_serve.py"]),
|
||||
# )
|
||||
py_test(
|
||||
name = "test_controller_crashes",
|
||||
size = "large",
|
||||
srcs = glob(["tests/test_controller_crashes.py",
|
||||
"tests/test_api.py",
|
||||
"tests/test_failure.py"],
|
||||
exclude=["tests/test_serve.py"]),
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "echo_full",
|
||||
|
|
|
@ -254,7 +254,8 @@ class ServeController:
|
|||
self.kv_store.put(CHECKPOINT_KEY, checkpoint)
|
||||
logger.debug("Wrote checkpoint in {:.2f}".format(time.time() - start))
|
||||
|
||||
if random.random() < _CRASH_AFTER_CHECKPOINT_PROBABILITY:
|
||||
if random.random(
|
||||
) < _CRASH_AFTER_CHECKPOINT_PROBABILITY and self.detached:
|
||||
logger.warning("Intentionally crashing after checkpoint")
|
||||
os._exit(0)
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ import pytest
|
|||
import ray
|
||||
from ray import serve
|
||||
|
||||
if os.environ.get("RAY_SERVE_INTENTIONALLY_CRASH", False):
|
||||
if os.environ.get("RAY_SERVE_INTENTIONALLY_CRASH", False) == 1:
|
||||
serve.controller._CRASH_AFTER_CHECKPOINT_PROBABILITY = 0.5
|
||||
|
||||
|
||||
|
@ -13,10 +13,14 @@ if os.environ.get("RAY_SERVE_INTENTIONALLY_CRASH", False):
|
|||
def _shared_serve_instance():
|
||||
# Uncomment the line below to turn on debug log for tests.
|
||||
# os.environ["SERVE_LOG_DEBUG"] = "1"
|
||||
# Overriding task_retry_delay_ms to relaunch actors more quickly
|
||||
ray.init(
|
||||
num_cpus=36,
|
||||
_metrics_export_port=9999,
|
||||
_system_config={"metrics_report_interval_ms": 1000})
|
||||
_system_config={
|
||||
"metrics_report_interval_ms": 1000,
|
||||
"task_retry_delay_ms": 50
|
||||
})
|
||||
yield serve.start(detached=True)
|
||||
|
||||
|
||||
|
|
|
@ -774,7 +774,11 @@ def test_serve_metrics(serve_instance):
|
|||
ray.get([block_until_http_ready.remote(url) for _ in range(10)])
|
||||
|
||||
def verify_metrics(do_assert=False):
|
||||
resp = requests.get("http://127.0.0.1:9999").text
|
||||
try:
|
||||
resp = requests.get("http://127.0.0.1:9999").text
|
||||
# Requests will fail if we are crashing the controller
|
||||
except requests.ConnectionError:
|
||||
return False
|
||||
|
||||
expected_metrics = [
|
||||
# counter
|
||||
|
|
Loading…
Add table
Reference in a new issue