[ci/release] Fix wait_cluster (#26236)

Fixes a bug in wait_cluster where we counted the total number of nodes that have ever been in the cluster rather than only the nodes that are currently alive. This has caused infra/autoscaler failures (e.g. #26138) to be mislabeled as test failures (and has probably messed with timing too). Co-authored-by: Alex Wu <alex@anyscale.com>
2025-03-05 10:01:43 -05:00 · 2022-06-30 16:37:32 -07:00 · 2022-06-30 16:37:32 -07:00 · 76c5122357
commit 76c5122357
parent 010a3566e6
2 changed files with 5 additions and 5 deletions
--- a/release/ray_release/command_runner/_wait_cluster.py
+++ b/release/ray_release/command_runner/_wait_cluster.py
@ -45,7 +45,7 @@ while not curr_nodes >= args.num_nodes:
        next_feedback = now + args.feedback_interval_s

    time.sleep(5)
-    curr_nodes = len(ray.nodes())
+    curr_nodes = sum(1 for node in ray.nodes() if node["Alive"])

 passed = time.time() - start
 print(
--- a/release/ray_release/command_runner/client_runner.py
+++ b/release/ray_release/command_runner/client_runner.py
@ -77,26 +77,26 @@ class ClientRunner(CommandRunner):
            start_time = time.monotonic()
            timeout_at = start_time + timeout
            next_status = start_time + 30
-            nodes_up = len(ray.nodes())
+            nodes_up = sum(1 for node in ray.nodes() if node["Alive"])
            while nodes_up < num_nodes:
                now = time.monotonic()
                if now >= timeout_at:
                    raise ClusterNodesWaitTimeout(
-                        f"Only {len(ray.nodes())}/{num_nodes} are up after "
+                        f"Only {nodes_up}/{num_nodes} are up after "
                        f"{timeout} seconds."
                    )

                if now >= next_status:
                    logger.info(
                        f"Waiting for nodes to come up: "
-                        f"{len(ray.nodes())}/{num_nodes} "
+                        f"{nodes_up}/{num_nodes} "
                        f"({now - start_time:.2f} seconds, "
                        f"timeout: {timeout} seconds)."
                    )
                    next_status += 30

                time.sleep(1)
-                nodes_up = len(ray.nodes())
+                nodes_up = sum(1 for node in ray.nodes() if node["Alive"])

            ray.shutdown()
        except Exception as e: