[ci/release] Fix wait_cluster (#26236)

Fixes a bug in wait_cluster where we counted the total number of nodes that have ever been in the cluster rather than only the nodes that are currently alive. This has caused infra/autoscaler failures (e.g. #26138) to be mislabeled as test failures (and probably messes with timing too).

Co-authored-by: Alex Wu <alex@anyscale.com>
This commit is contained in:
Alex Wu 2022-06-30 16:37:32 -07:00 committed by GitHub
parent 010a3566e6
commit 76c5122357
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 5 additions and 5 deletions

View file

@ -45,7 +45,7 @@ while not curr_nodes >= args.num_nodes:
next_feedback = now + args.feedback_interval_s
time.sleep(5)
curr_nodes = len(ray.nodes())
curr_nodes = sum(1 for node in ray.nodes() if node["Alive"])
passed = time.time() - start
print(

View file

@ -77,26 +77,26 @@ class ClientRunner(CommandRunner):
start_time = time.monotonic()
timeout_at = start_time + timeout
next_status = start_time + 30
nodes_up = len(ray.nodes())
nodes_up = sum(1 for node in ray.nodes() if node["Alive"])
while nodes_up < num_nodes:
now = time.monotonic()
if now >= timeout_at:
raise ClusterNodesWaitTimeout(
f"Only {len(ray.nodes())}/{num_nodes} are up after "
f"Only {nodes_up}/{num_nodes} are up after "
f"{timeout} seconds."
)
if now >= next_status:
logger.info(
f"Waiting for nodes to come up: "
f"{len(ray.nodes())}/{num_nodes} "
f"{nodes_up}/{num_nodes} "
f"({now - start_time:.2f} seconds, "
f"timeout: {timeout} seconds)."
)
next_status += 30
time.sleep(1)
nodes_up = len(ray.nodes())
nodes_up = sum(1 for node in ray.nodes() if node["Alive"])
ray.shutdown()
except Exception as e: