mirror of
https://github.com/vale981/ray
synced 2025-03-05 10:01:43 -05:00
[ci/release] Fix wait_cluster (#26236)
Fixes a bug in wait_cluster where we counted the total number of nodes ever in the cluster rather than only the alive nodes. This has caused infra/autoscaler failures (e.g. #26138) to be mislabeled as test failures (and has probably messed with timing too). Co-authored-by: Alex Wu <alex@anyscale.com>
This commit is contained in:
parent
010a3566e6
commit
76c5122357
2 changed files with 5 additions and 5 deletions
|
@ -45,7 +45,7 @@ while not curr_nodes >= args.num_nodes:
|
|||
next_feedback = now + args.feedback_interval_s
|
||||
|
||||
time.sleep(5)
|
||||
curr_nodes = len(ray.nodes())
|
||||
curr_nodes = sum(1 for node in ray.nodes() if node["Alive"])
|
||||
|
||||
passed = time.time() - start
|
||||
print(
|
||||
|
|
|
@ -77,26 +77,26 @@ class ClientRunner(CommandRunner):
|
|||
start_time = time.monotonic()
|
||||
timeout_at = start_time + timeout
|
||||
next_status = start_time + 30
|
||||
nodes_up = len(ray.nodes())
|
||||
nodes_up = sum(1 for node in ray.nodes() if node["Alive"])
|
||||
while nodes_up < num_nodes:
|
||||
now = time.monotonic()
|
||||
if now >= timeout_at:
|
||||
raise ClusterNodesWaitTimeout(
|
||||
f"Only {len(ray.nodes())}/{num_nodes} are up after "
|
||||
f"Only {nodes_up}/{num_nodes} are up after "
|
||||
f"{timeout} seconds."
|
||||
)
|
||||
|
||||
if now >= next_status:
|
||||
logger.info(
|
||||
f"Waiting for nodes to come up: "
|
||||
f"{len(ray.nodes())}/{num_nodes} "
|
||||
f"{nodes_up}/{num_nodes} "
|
||||
f"({now - start_time:.2f} seconds, "
|
||||
f"timeout: {timeout} seconds)."
|
||||
)
|
||||
next_status += 30
|
||||
|
||||
time.sleep(1)
|
||||
nodes_up = len(ray.nodes())
|
||||
nodes_up = sum(1 for node in ray.nodes() if node["Alive"])
|
||||
|
||||
ray.shutdown()
|
||||
except Exception as e:
|
||||
|
|
Loading…
Add table
Reference in a new issue