From 22cab930cdc6749fbc17c2f993b3c42123d8689c Mon Sep 17 00:00:00 2001 From: Edward Oakes Date: Sat, 2 May 2020 10:19:44 -0500 Subject: [PATCH] Retry actor failures in serve failure test (#8282) --- ci/long_running_tests/workloads/serve_failure.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/ci/long_running_tests/workloads/serve_failure.py b/ci/long_running_tests/workloads/serve_failure.py index c0580c882..5c7047e4f 100644 --- a/ci/long_running_tests/workloads/serve_failure.py +++ b/ci/long_running_tests/workloads/serve_failure.py @@ -6,6 +6,7 @@ import requests import ray from ray import serve +from ray.serve.utils import retry_actor_failures from ray.cluster_utils import Cluster num_redis_shards = 1 @@ -25,7 +26,11 @@ for i in range(num_nodes): redis_max_memory=redis_max_memory, webui_host="0.0.0.0") -ray.init(address=cluster.address, include_webui=True, webui_host="0.0.0.0") +ray.init( + address=cluster.address, + include_webui=True, + webui_host="0.0.0.0", + log_to_driver=False) serve.init(blocking=True) @@ -37,10 +42,11 @@ class RandomKiller: def _get_all_serve_actors(self): master = serve.api._get_master_actor() - [router] = ray.get(master.get_router.remote()) - [http_proxy] = ray.get(master.get_http_proxy.remote()) + [router] = retry_actor_failures(master.get_router) + [http_proxy] = retry_actor_failures(master.get_http_proxy) all_handles = [master, router, http_proxy] - worker_handle_dict = ray.get(master.get_all_worker_handles.remote()) + worker_handle_dict = retry_actor_failures( + master.get_all_worker_handles) for _, replica_dict in worker_handle_dict.items(): all_handles.extend(list(replica_dict.values()))