[Nightly tests] Improve k8s testing (#23108)

This PR fixes several broken k8s nightly tests:

- Use exponential backoff on the unstable HTTP path: getting the job status sometimes fails with a broken connection from the server. Unfortunately, I couldn't find the relevant logs to figure out why this happens.
- Fix the resource leak check in the benchmark tests. The existing check was broken because job submission consumes 0.001 of the node IP resource, so `ray.available_resources()` can never equal `ray.cluster_resources()`. The fix excludes node IP resources from the comparison (see the sketch after this list).
- The k8s infra doesn't support instances with fewer than 8 CPUs, so I used m5.2xlarge instead of xlarge. This increases the cost a bit, but not by much.
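
To illustrate the resource leak point, here is a minimal sketch (not part of this PR; the resource names and values are made up) of why the naive check can never pass once a job has been submitted, and how excluding node resources fixes it:

```python
# Hypothetical snapshot of a cluster where a submitted job holds 0.001 of the
# per-node IP resource ("node:<ip>") but no test resources actually leak.
cluster_resources = {"CPU": 8.0, "node:10.0.0.1": 1.0}
available_resources = {"CPU": 8.0, "node:10.0.0.1": 0.999}

# Naive check: always False while the job holds the node IP resource.
print(cluster_resources == available_resources)  # False


def exclude_node_resources(resources):
    """Drop per-node IP resources ("node:<ip>") before comparing."""
    return {k: v for k, v in resources.items() if "node" not in k}


# Check with node IP resources excluded, as done in this PR: True.
print(exclude_node_resources(cluster_resources)
      == exclude_node_resources(available_resources))
```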
SangBin Cho 2022-03-14 19:49:15 +09:00 committed by GitHub
parent 8823ca48b4
commit 2c2d96eeb1
10 changed files with 31 additions and 12 deletions


@@ -33,7 +33,7 @@ def test_max_actors():
 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()
 ray.init(address="auto")


@@ -59,7 +59,7 @@ def test_many_placement_groups():
 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()
 ray.init(address="auto")


@@ -42,7 +42,7 @@ def test_max_running_tasks(num_tasks):
 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()
 @click.command()


@@ -1268,3 +1268,14 @@ def check_spilled_mb(address, spilled=None, restored=None, fallback=None):
         return True
 
     wait_for_condition(ok, timeout=3, retry_interval_ms=1000)
+
+
+def no_resource_leaks_excluding_node_resources():
+    cluster_resources = ray.cluster_resources()
+    available_resources = ray.available_resources()
+    for r in ray.cluster_resources():
+        if "node" in r:
+            del cluster_resources[r]
+            del available_resources[r]
+
+    return cluster_resources == available_resources
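
As a usage sketch (assuming the new helper and `wait_for_condition` both live in Ray's `test_utils` module, e.g. `ray._private.test_utils`; the file path is not shown above), a benchmark test could poll the check until resources are released:

```python
import ray
from ray._private import test_utils  # assumed module path

ray.init(address="auto")

# ... run the workload under test, then poll until all non-node resources
# return to their cluster totals, ignoring the node IP resource held by
# the job submission.
test_utils.wait_for_condition(
    test_utils.no_resource_leaks_excluding_node_resources,
    timeout=60,
    retry_interval_ms=1000,
)
```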


@@ -35,7 +35,7 @@ def test_max_actors():
 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()
 ray.init(address="auto")


@@ -61,7 +61,7 @@ def test_many_placement_groups():
 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()
 ray.init(address="auto")


@@ -42,7 +42,7 @@ def test_max_running_tasks(num_tasks):
 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()
 @click.command()


@@ -14,7 +14,7 @@ head_node_type:
 worker_node_types:
   - name: small_worker
-    instance_type: m5.xlarge
+    instance_type: m5.2xlarge
     min_workers: 249
     max_workers: 249
     use_spot: false


@@ -12,7 +12,7 @@ head_node_type:
 worker_node_types:
   - name: worker_node
-    instance_type: m4.xlarge
+    instance_type: m4.2xlarge
     min_workers: 49
     max_workers: 49
     use_spot: false


@@ -10,6 +10,7 @@ from ray_release.logger import logger
 from ray_release.util import ANYSCALE_HOST
 from ray_release.cluster_manager.cluster_manager import ClusterManager
 from ray_release.exception import CommandTimeout
+from ray_release.util import exponential_backoff_retry
 
 
 class JobManager:
@@ -51,11 +52,18 @@ class JobManager:
         self.start_time[command_id] = time.time()
         return command_id
 
+    def _get_job_status_with_retry(self, command_id):
+        job_client = self._get_job_client()
+
+        return exponential_backoff_retry(
+            lambda: job_client.get_job_status(self.job_id_pool[command_id]),
+            retry_exceptions=Exception,
+            initial_retry_delay_s=1,
+            max_retries=3,
+        )
+
     def _wait_job(self, command_id: int, timeout: int):
         from ray.job_submission import JobStatus  # noqa: F811
 
-        job_client = self._get_job_client()
         start_time = time.monotonic()
         timeout_at = start_time + timeout
         next_status = start_time + 30
@@ -73,11 +81,11 @@ class JobManager:
                     f"({int(now - start_time)} seconds) ..."
                 )
                 next_status += 30
-            status = job_client.get_job_status(self.job_id_pool[command_id])
+            status = self._get_job_status_with_retry(command_id)
             if status in {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}:
                 break
             time.sleep(1)
 
-        status = job_client.get_job_status(self.job_id_pool[command_id])
+        status = self._get_job_status_with_retry(command_id)
         # TODO(sang): Propagate JobInfo.error_type
         if status == JobStatus.SUCCEEDED:
             retcode = 0
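
The diff imports `exponential_backoff_retry` from `ray_release.util` but does not show its body. A minimal sketch consistent with how it is called above (the actual implementation in `ray_release/util.py` may differ) could look like:

```python
import logging
import time

logger = logging.getLogger(__name__)


def exponential_backoff_retry(f, retry_exceptions, initial_retry_delay_s, max_retries):
    """Call f(); on retry_exceptions, retry with an exponentially growing delay."""
    retry_cnt = 0
    retry_delay_s = initial_retry_delay_s
    while True:
        try:
            return f()
        except retry_exceptions as e:
            retry_cnt += 1
            if retry_cnt > max_retries:
                raise
            logger.info(
                f"Retry function call failed due to {e}, "
                f"retrying in {retry_delay_s} seconds..."
            )
            time.sleep(retry_delay_s)
            retry_delay_s *= 2
```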