mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00
[flaky] Deflake test_multi_node_3
(#21069)
`test_multi_node_3` failed because we killed the raylet before the cluster was up, which caused the raylet to become a zombie process. This fix waits until the cluster is up before killing the raylet.
This commit is contained in:
parent
3c426ed7b5
commit
613a7cc61d
1 changed files with 13 additions and 3 deletions
|
@ -7,7 +7,7 @@ import ray
|
|||
from ray._private.test_utils import (
|
||||
check_call_ray, run_string_as_driver, run_string_as_driver_nonblocking,
|
||||
wait_for_children_of_pid, wait_for_children_of_pid_to_exit,
|
||||
wait_for_children_names_of_pid, kill_process_by_name, Semaphore)
|
||||
kill_process_by_name, Semaphore)
|
||||
|
||||
|
||||
def test_calling_start_ray_head(call_ray_stop_only):
|
||||
|
@ -94,11 +94,21 @@ def test_calling_start_ray_head(call_ray_stop_only):
|
|||
blocked = subprocess.Popen(
|
||||
["ray", "start", "--head", "--block", "--port", "0"])
|
||||
|
||||
wait_for_children_names_of_pid(blocked.pid, ["raylet"], timeout=30)
|
||||
|
||||
blocked.poll()
|
||||
assert blocked.returncode is None
|
||||
|
||||
# Make sure ray cluster is up
|
||||
run_string_as_driver("""
|
||||
import ray
|
||||
from time import sleep
|
||||
for i in range(0, 5):
|
||||
try:
|
||||
ray.init(address='auto')
|
||||
break
|
||||
except:
|
||||
sleep(1)
|
||||
""")
|
||||
|
||||
kill_process_by_name("raylet", SIGKILL=True)
|
||||
wait_for_children_of_pid_to_exit(blocked.pid, timeout=30)
|
||||
blocked.wait()
|
||||
|
|
Loading…
Add table
Reference in a new issue