mirror of https://github.com/vale981/ray, synced 2025-03-05 10:01:43 -05:00
[Nightly tests] Improve k8s testing (#23108)
This PR improves the broken k8s tests:
- Use exponential backoff on the unstable HTTP path (getting the job status sometimes hits a broken connection from the server; I couldn't find the relevant logs to figure out why this happens, unfortunately).
- Fix the resource leak check in the benchmark tests. The existing check was broken because job submission holds 0.001 of the node IP resource, so cluster_resources can never equal available_resources while the job is running. The fix excludes node IP resources from the comparison.
- K8s infra doesn't support instances with fewer than 8 CPUs, so use m5.2xlarge instead of xlarge. It will increase the cost a bit, but not by much.
This commit is contained in:
parent 8823ca48b4
commit 2c2d96eeb1

10 changed files with 31 additions and 12 deletions
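To make the resource-leak fix concrete, here is a minimal, self-contained sketch of the failure mode and the workaround. The resource keys and numbers are invented for illustration, and exclude_node_resources is a hypothetical stand-in for the test_utils helper added in this diff:

# Illustrative snapshots of what ray.cluster_resources() and
# ray.available_resources() might look like while a submitted job is running.
# The "node:10.0.0.1" key and the 0.001 usage are made-up example values.
cluster_resources = {"CPU": 8.0, "node:10.0.0.1": 1.0}
available_resources = {"CPU": 8.0, "node:10.0.0.1": 0.999}

# Old leak check: can never pass while the job holds its node IP resource.
assert cluster_resources != available_resources

# Fixed leak check: drop node IP resources before comparing.
def exclude_node_resources(resources):
    return {k: v for k, v in resources.items() if "node" not in k}

assert exclude_node_resources(cluster_resources) == exclude_node_resources(available_resources)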
@@ -33,7 +33,7 @@ def test_max_actors():


 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()


 ray.init(address="auto")
@@ -59,7 +59,7 @@ def test_many_placement_groups():


 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()


 ray.init(address="auto")
@@ -42,7 +42,7 @@ def test_max_running_tasks(num_tasks):


 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()


 @click.command()
@@ -1268,3 +1268,14 @@ def check_spilled_mb(address, spilled=None, restored=None, fallback=None):
         return True

     wait_for_condition(ok, timeout=3, retry_interval_ms=1000)
+
+
+def no_resource_leaks_excluding_node_resources():
+    cluster_resources = ray.cluster_resources()
+    available_resources = ray.available_resources()
+    for r in ray.cluster_resources():
+        if "node" in r:
+            del cluster_resources[r]
+            del available_resources[r]
+
+    return cluster_resources == available_resources
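A rough usage sketch of the new helper, mirroring how the benchmark scripts above wrap it; the import path and the wait_for_condition timeout values here are assumptions, not taken from this diff:

import ray
from ray._private import test_utils  # assumed location of the helper

ray.init(address="auto")

def no_resource_leaks():
    return test_utils.no_resource_leaks_excluding_node_resources()

# Poll until all non-node resources are back to full capacity, or raise
# after the (illustrative) timeout.
test_utils.wait_for_condition(no_resource_leaks, timeout=60, retry_interval_ms=1000)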
@@ -35,7 +35,7 @@ def test_max_actors():


 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()


 ray.init(address="auto")
@@ -61,7 +61,7 @@ def test_many_placement_groups():


 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()


 ray.init(address="auto")
@@ -42,7 +42,7 @@ def test_max_running_tasks(num_tasks):


 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()


 @click.command()
@@ -14,7 +14,7 @@ head_node_type:

 worker_node_types:
 - name: small_worker
-  instance_type: m5.xlarge
+  instance_type: m5.2xlarge
   min_workers: 249
   max_workers: 249
   use_spot: false
@@ -12,7 +12,7 @@ head_node_type:

 worker_node_types:
 - name: worker_node
-  instance_type: m4.xlarge
+  instance_type: m4.2xlarge
   min_workers: 49
   max_workers: 49
   use_spot: false
@@ -10,6 +10,7 @@ from ray_release.logger import logger
 from ray_release.util import ANYSCALE_HOST
 from ray_release.cluster_manager.cluster_manager import ClusterManager
 from ray_release.exception import CommandTimeout
+from ray_release.util import exponential_backoff_retry


 class JobManager:
@@ -51,11 +52,18 @@ class JobManager:
         self.start_time[command_id] = time.time()
         return command_id

+    def _get_job_status_with_retry(self, command_id):
+        job_client = self._get_job_client()
+        return exponential_backoff_retry(
+            lambda: job_client.get_job_status(self.job_id_pool[command_id]),
+            retry_exceptions=Exception,
+            initial_retry_delay_s=1,
+            max_retries=3,
+        )
+
     def _wait_job(self, command_id: int, timeout: int):
         from ray.job_submission import JobStatus  # noqa: F811

-        job_client = self._get_job_client()
-
         start_time = time.monotonic()
         timeout_at = start_time + timeout
         next_status = start_time + 30
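exponential_backoff_retry is imported from ray_release.util, but its implementation is not part of this diff; the sketch below only illustrates the behavior the new _get_job_status_with_retry appears to rely on, with a function body that is an assumption based on the call site above:

import logging
import time

logger = logging.getLogger(__name__)

def exponential_backoff_retry(f, retry_exceptions, initial_retry_delay_s, max_retries):
    # Call f(); on a matching exception, wait and retry with a doubling delay
    # until max_retries is exceeded, then re-raise the exception.
    retry_cnt = 0
    retry_delay_s = initial_retry_delay_s
    while True:
        try:
            return f()
        except retry_exceptions as e:
            retry_cnt += 1
            if retry_cnt > max_retries:
                raise
            logger.info(f"Retrying in {retry_delay_s}s after error: {e}")
            time.sleep(retry_delay_s)
            retry_delay_s *= 2

With initial_retry_delay_s=1 and max_retries=3, as in the change above, a flaky get_job_status call would be retried after roughly 1, 2, and 4 seconds before the error propagates.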
@@ -73,11 +81,11 @@ class JobManager:
                     f"({int(now - start_time)} seconds) ..."
                 )
                 next_status += 30
-            status = job_client.get_job_status(self.job_id_pool[command_id])
+            status = self._get_job_status_with_retry(command_id)
             if status in {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}:
                 break
             time.sleep(1)
-        status = job_client.get_job_status(self.job_id_pool[command_id])
+        status = self._get_job_status_with_retry(command_id)
         # TODO(sang): Propagate JobInfo.error_type
         if status == JobStatus.SUCCEEDED:
             retcode = 0