[Nightly tests] Improve k8s testing (#23108)

This PR fixes several broken k8s nightly tests:

- Use exponential backoff on the unstable HTTP path: getting the job status sometimes fails with a broken connection from the server. Unfortunately, I couldn't find the relevant logs to figure out why this happens.
- Fix the resource leak check in the benchmark tests. The existing check was broken because job submission consumes 0.001 of the node IP resource, so `ray.available_resources()` can never equal `ray.cluster_resources()`. The fix excludes node IP resources from the comparison (see the sketch after this list).
- The k8s infra doesn't support instances with fewer than 8 CPUs, so I used m5.2xlarge instead of xlarge. This increases the cost a bit, but not by much.
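
To illustrate the resource leak point, here is a minimal sketch (not part of this PR; the resource names and values are made up) of why the naive check can never pass once a job has been submitted, and how excluding node resources fixes it:

```python
# Hypothetical snapshot of a cluster where a submitted job holds 0.001 of the
# per-node IP resource ("node:<ip>") but no test resources actually leak.
cluster_resources = {"CPU": 8.0, "node:10.0.0.1": 1.0}
available_resources = {"CPU": 8.0, "node:10.0.0.1": 0.999}

# Naive check: always False while the job holds the node IP resource.
print(cluster_resources == available_resources)  # False


def exclude_node_resources(resources):
    """Drop per-node IP resources ("node:<ip>") before comparing."""
    return {k: v for k, v in resources.items() if "node" not in k}


# Check with node IP resources excluded, as done in this PR: True.
print(exclude_node_resources(cluster_resources)
      == exclude_node_resources(available_resources))
```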
SangBin Cho 2022-03-14 19:49:15 +09:00 committed by GitHub
parent 8823ca48b4
commit 2c2d96eeb1
10 changed files with 31 additions and 12 deletions


@@ -33,7 +33,7 @@ def test_max_actors():
 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()
 ray.init(address="auto")


@@ -59,7 +59,7 @@ def test_many_placement_groups():
 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()
 ray.init(address="auto")


@@ -42,7 +42,7 @@ def test_max_running_tasks(num_tasks):
 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()
 @click.command()


@@ -1268,3 +1268,14 @@ def check_spilled_mb(address, spilled=None, restored=None, fallback=None):
         return True
 
     wait_for_condition(ok, timeout=3, retry_interval_ms=1000)
+
+
+def no_resource_leaks_excluding_node_resources():
+    cluster_resources = ray.cluster_resources()
+    available_resources = ray.available_resources()
+    for r in ray.cluster_resources():
+        if "node" in r:
+            del cluster_resources[r]
+            del available_resources[r]
+
+    return cluster_resources == available_resources
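
As a usage sketch (assuming the new helper and `wait_for_condition` both live in Ray's `test_utils` module, e.g. `ray._private.test_utils`; the file path is not shown above), a benchmark test could poll the check until resources are released:

```python
import ray
from ray._private import test_utils  # assumed module path

ray.init(address="auto")

# ... run the workload under test, then poll until all non-node resources
# return to their cluster totals, ignoring the node IP resource held by
# the job submission.
test_utils.wait_for_condition(
    test_utils.no_resource_leaks_excluding_node_resources,
    timeout=60,
    retry_interval_ms=1000,
)
```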


@@ -35,7 +35,7 @@ def test_max_actors():
 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()
 ray.init(address="auto")


@@ -61,7 +61,7 @@ def test_many_placement_groups():
 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()
 ray.init(address="auto")


@@ -42,7 +42,7 @@ def test_max_running_tasks(num_tasks):
 def no_resource_leaks():
-    return ray.available_resources() == ray.cluster_resources()
+    return test_utils.no_resource_leaks_excluding_node_resources()
 @click.command()


@@ -14,7 +14,7 @@ head_node_type:
 worker_node_types:
   - name: small_worker
-    instance_type: m5.xlarge
+    instance_type: m5.2xlarge
     min_workers: 249
     max_workers: 249
     use_spot: false


@@ -12,7 +12,7 @@ head_node_type:
 worker_node_types:
   - name: worker_node
-    instance_type: m4.xlarge
+    instance_type: m4.2xlarge
     min_workers: 49
     max_workers: 49
     use_spot: false


@@ -10,6 +10,7 @@ from ray_release.logger import logger
 from ray_release.util import ANYSCALE_HOST
 from ray_release.cluster_manager.cluster_manager import ClusterManager
 from ray_release.exception import CommandTimeout
+from ray_release.util import exponential_backoff_retry
 
 
 class JobManager:
@@ -51,11 +52,18 @@ class JobManager:
         self.start_time[command_id] = time.time()
         return command_id
 
+    def _get_job_status_with_retry(self, command_id):
+        job_client = self._get_job_client()
+
+        return exponential_backoff_retry(
+            lambda: job_client.get_job_status(self.job_id_pool[command_id]),
+            retry_exceptions=Exception,
+            initial_retry_delay_s=1,
+            max_retries=3,
+        )
+
     def _wait_job(self, command_id: int, timeout: int):
         from ray.job_submission import JobStatus  # noqa: F811
 
-        job_client = self._get_job_client()
         start_time = time.monotonic()
         timeout_at = start_time + timeout
         next_status = start_time + 30
@@ -73,11 +81,11 @@ class JobManager:
                     f"({int(now - start_time)} seconds) ..."
                 )
                 next_status += 30
-            status = job_client.get_job_status(self.job_id_pool[command_id])
+            status = self._get_job_status_with_retry(command_id)
             if status in {JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED}:
                 break
             time.sleep(1)
 
-        status = job_client.get_job_status(self.job_id_pool[command_id])
+        status = self._get_job_status_with_retry(command_id)
         # TODO(sang): Propagate JobInfo.error_type
         if status == JobStatus.SUCCEEDED:
             retcode = 0
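
The diff imports `exponential_backoff_retry` from `ray_release.util` but does not show its body. A minimal sketch consistent with how it is called above (the actual implementation in `ray_release/util.py` may differ) could look like:

```python
import logging
import time

logger = logging.getLogger(__name__)


def exponential_backoff_retry(f, retry_exceptions, initial_retry_delay_s, max_retries):
    """Call f(); on retry_exceptions, retry with an exponentially growing delay."""
    retry_cnt = 0
    retry_delay_s = initial_retry_delay_s
    while True:
        try:
            return f()
        except retry_exceptions as e:
            retry_cnt += 1
            if retry_cnt > max_retries:
                raise
            logger.info(
                f"Retry function call failed due to {e}, "
                f"retrying in {retry_delay_s} seconds..."
            )
            time.sleep(retry_delay_s)
            retry_delay_s *= 2
```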