mirror of
https://github.com/vale981/ray
synced 2025-03-12 06:06:39 -04:00
122 lines
4.2 KiB
Python
122 lines
4.2 KiB
Python
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
import ray
|
|
import ray.ray_constants as ray_constants
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"ray_start_cluster", [{
|
|
"num_cpus": 4,
|
|
"num_nodes": 3,
|
|
"do_init": True
|
|
}],
|
|
indirect=True)
|
|
def test_actor_creation_node_failure(ray_start_cluster):
|
|
# TODO(swang): Refactor test_raylet_failed, etc to reuse the below code.
|
|
cluster = ray_start_cluster
|
|
|
|
@ray.remote
|
|
class Child(object):
|
|
def __init__(self, death_probability):
|
|
self.death_probability = death_probability
|
|
|
|
def get_probability(self):
|
|
return self.death_probability
|
|
|
|
def ping(self):
|
|
# Exit process with some probability.
|
|
exit_chance = np.random.rand()
|
|
if exit_chance < self.death_probability:
|
|
sys.exit(-1)
|
|
|
|
num_children = 25
|
|
# Children actors will die about half the time.
|
|
death_probability = 0.5
|
|
|
|
children = [Child.remote(death_probability) for _ in range(num_children)]
|
|
while len(cluster.list_all_nodes()) > 1:
|
|
for j in range(2):
|
|
# Submit some tasks on the actors. About half of the actors will
|
|
# fail.
|
|
children_out = [child.ping.remote() for child in children]
|
|
# Wait a while for all the tasks to complete. This should trigger
|
|
# reconstruction for any actor creation tasks that were forwarded
|
|
# to nodes that then failed.
|
|
ready, _ = ray.wait(
|
|
children_out, num_returns=len(children_out), timeout=5 * 60.0)
|
|
assert len(ready) == len(children_out)
|
|
|
|
# Replace any actors that died.
|
|
for i, out in enumerate(children_out):
|
|
try:
|
|
ray.get(out)
|
|
except ray.exceptions.RayActorError:
|
|
children[i] = Child.remote(death_probability)
|
|
|
|
children_out = [
|
|
child.get_probability.remote() for child in children
|
|
]
|
|
# Wait for new created actors to finish creation before
|
|
# removing a node. This is needed because right now we don't
|
|
# support reconstructing actors that died in the process of
|
|
# being created.
|
|
ready, _ = ray.wait(
|
|
children_out, num_returns=len(children_out), timeout=5 * 60.0)
|
|
assert len(ready) == len(children_out)
|
|
|
|
# Remove a node. Any actor creation tasks that were forwarded to this
|
|
# node must be reconstructed.
|
|
cluster.remove_node(cluster.list_all_nodes()[-1])
|
|
|
|
|
|
@pytest.mark.skipif(
|
|
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
|
reason="Hanging with new GCS API.")
|
|
def test_driver_lives_sequential(ray_start_regular):
|
|
ray.worker._global_node.kill_raylet()
|
|
ray.worker._global_node.kill_plasma_store()
|
|
ray.worker._global_node.kill_log_monitor()
|
|
ray.worker._global_node.kill_monitor()
|
|
ray.worker._global_node.kill_raylet_monitor()
|
|
|
|
# If the driver can reach the tearDown method, then it is still alive.
|
|
|
|
|
|
@pytest.mark.skipif(
|
|
os.environ.get("RAY_USE_NEW_GCS") == "on",
|
|
reason="Hanging with new GCS API.")
|
|
def test_driver_lives_parallel(ray_start_regular):
|
|
all_processes = ray.worker._global_node.all_processes
|
|
process_infos = (all_processes[ray_constants.PROCESS_TYPE_PLASMA_STORE] +
|
|
all_processes[ray_constants.PROCESS_TYPE_RAYLET] +
|
|
all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] +
|
|
all_processes[ray_constants.PROCESS_TYPE_MONITOR] +
|
|
all_processes[ray_constants.PROCESS_TYPE_RAYLET_MONITOR])
|
|
assert len(process_infos) == 5
|
|
|
|
# Kill all the components in parallel.
|
|
for process_info in process_infos:
|
|
process_info.process.terminate()
|
|
|
|
time.sleep(0.1)
|
|
for process_info in process_infos:
|
|
process_info.process.kill()
|
|
|
|
for process_info in process_infos:
|
|
process_info.process.wait()
|
|
|
|
# If the driver can reach the tearDown method, then it is still alive.
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import pytest
|
|
sys.exit(pytest.main(["-v", __file__]))
|