[flaky] Deflake test_multi_node_3 (#21069)

`test_multi_node_3` failed because we killed the raylet before the cluster was up, which left the raylet as a zombie process. This fix waits until the cluster is up before killing it.
This commit is contained in:
Yi Cheng 2021-12-14 00:17:01 -08:00 committed by GitHub
parent 3c426ed7b5
commit 613a7cc61d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -7,7 +7,7 @@ import ray
from ray._private.test_utils import (
check_call_ray, run_string_as_driver, run_string_as_driver_nonblocking,
wait_for_children_of_pid, wait_for_children_of_pid_to_exit,
wait_for_children_names_of_pid, kill_process_by_name, Semaphore)
kill_process_by_name, Semaphore)
def test_calling_start_ray_head(call_ray_stop_only):
@ -94,11 +94,21 @@ def test_calling_start_ray_head(call_ray_stop_only):
blocked = subprocess.Popen(
["ray", "start", "--head", "--block", "--port", "0"])
wait_for_children_names_of_pid(blocked.pid, ["raylet"], timeout=30)
blocked.poll()
assert blocked.returncode is None
# Make sure ray cluster is up
run_string_as_driver("""
import ray
from time import sleep
for i in range(0, 5):
try:
ray.init(address='auto')
break
except:
sleep(1)
""")
kill_process_by_name("raylet", SIGKILL=True)
wait_for_children_of_pid_to_exit(blocked.pid, timeout=30)
blocked.wait()