[serve] [release tests] Add health check grace period for 1k deployment (#22651)

This commit is contained in:
Archit Kulkarni 2022-02-25 10:13:44 -08:00 committed by GitHub
parent 8548affdc2
commit 31332f8930
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -28,9 +28,11 @@ import click
import json import json
import math import math
import os import os
import time
from ray import serve from ray import serve
from ray.serve.utils import logger from ray.serve.utils import logger
from ray.serve.api import _get_global_client
from serve_test_utils import ( from serve_test_utils import (
aggregate_all_metrics, aggregate_all_metrics,
run_wrk_on_all_nodes, run_wrk_on_all_nodes,
@ -58,7 +60,9 @@ DEFAULT_FULL_TEST_TRIAL_LENGTH = "10m"
def deploy_replicas(num_replicas, max_batch_size): def deploy_replicas(num_replicas, max_batch_size):
@serve.deployment(name="echo", num_replicas=num_replicas) name = "echo"
@serve.deployment(name=name, num_replicas=num_replicas)
class Echo: class Echo:
@serve.batch(max_batch_size=max_batch_size) @serve.batch(max_batch_size=max_batch_size)
async def handle_batch(self, requests): async def handle_batch(self, requests):
@ -67,7 +71,25 @@ def deploy_replicas(num_replicas, max_batch_size):
async def __call__(self, request): async def __call__(self, request):
return await self.handle_batch(request) return await self.handle_batch(request)
Echo.deploy() # Set _blocking=False to allow for a custom extended grace period for the
# health check, which is necessary to prevent this test from being flaky.
Echo.deploy(_blocking=False)
# Wait for the non-blocking deploy issued above to become healthy, retrying
# for a bounded grace period instead of failing on the first unhealthy report.
start = time.time()
client = _get_global_client()
# Wait for up to 10 minutes for the deployment to be healthy, allowing
# time for any actors that crashed to restart.
while time.time() - start < 10 * 60:
    try:
        # Raises RuntimeError if deployment enters the "UNHEALTHY" state.
        client._wait_for_deployment_healthy(name)
    except RuntimeError:
        # Still unhealthy: back off briefly, then poll again.
        time.sleep(1)
    else:
        # The wait returned without raising, so stop retrying. Without this
        # exit the loop would call the wait again immediately, with no sleep,
        # until the whole 10-minute window had elapsed.
        break
# If the deployment is still unhealthy at this point, allow RuntimeError
# to be raised and let this test fail.
# NOTE(review): presumably this returns promptly when the deployment is
# already healthy -- confirm against the Serve client implementation.
client._wait_for_deployment_healthy(name)
def save_results(final_result, default_name): def save_results(final_result, default_name):