# ray/test/actor_test.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import json
import random
import numpy as np
import os
import pytest
import signal
import sys
import time
import ray
import ray.ray_constants as ray_constants
import ray.test.test_utils
import ray.test.cluster_utils
@pytest.fixture
def ray_start_regular():
# Start the Ray processes.
ray.init(
num_cpus=1,
_internal_config=json.dumps({
"initial_reconstruction_timeout_milliseconds": 200,
"num_heartbeats_timeout": 10,
}))
yield None
# The code after the yield will run as teardown code.
ray.shutdown()
@pytest.fixture
def shutdown_only():
yield None
# The code after the yield will run as teardown code.
ray.shutdown()
@pytest.fixture
def head_node_cluster():
cluster = ray.test.cluster_utils.Cluster(
initialize_head=True,
connect=True,
head_node_args={
"_internal_config": json.dumps({
"initial_reconstruction_timeout_milliseconds": 200,
"num_heartbeats_timeout": 10,
})
})
yield cluster
# The code after the yield will run as teardown code.
ray.shutdown()
cluster.shutdown()
def test_actor_init_error_propagated(ray_start_regular):
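    # An exception raised in an actor's constructor should be stored and
    # re-raised when a method on that actor is called.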
@ray.remote
class Actor(object):
def __init__(self, error=False):
if error:
raise Exception("oops")
def foo(self):
return "OK"
actor = Actor.remote(error=False)
ray.get(actor.foo.remote())
actor = Actor.remote(error=True)
with pytest.raises(Exception, match=".*oops.*"):
ray.get(actor.foo.remote())
def test_keyword_args(ray_start_regular):
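    # Actor constructors and methods should support positional arguments,
    # default values, and keyword arguments, and should raise an error when
    # called with an invalid signature.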
@ray.remote
class Actor(object):
def __init__(self, arg0, arg1=1, arg2="a"):
self.arg0 = arg0
self.arg1 = arg1
self.arg2 = arg2
def get_values(self, arg0, arg1=2, arg2="b"):
return self.arg0 + arg0, self.arg1 + arg1, self.arg2 + arg2
actor = Actor.remote(0)
assert ray.get(actor.get_values.remote(1)) == (1, 3, "ab")
actor = Actor.remote(1, 2)
assert ray.get(actor.get_values.remote(2, 3)) == (3, 5, "ab")
actor = Actor.remote(1, 2, "c")
assert ray.get(actor.get_values.remote(2, 3, "d")) == (3, 5, "cd")
actor = Actor.remote(1, arg2="c")
assert ray.get(actor.get_values.remote(0, arg2="d")) == (1, 3, "cd")
assert ray.get(actor.get_values.remote(0, arg2="d", arg1=0)) == (1, 1,
"cd")
actor = Actor.remote(1, arg2="c", arg1=2)
assert ray.get(actor.get_values.remote(0, arg2="d")) == (1, 4, "cd")
assert ray.get(actor.get_values.remote(0, arg2="d", arg1=0)) == (1, 2,
"cd")
assert ray.get(actor.get_values.remote(arg2="d", arg1=0, arg0=2)) == (3, 2,
"cd")
# Make sure we get an exception if the constructor is called
# incorrectly.
with pytest.raises(Exception):
actor = Actor.remote()
with pytest.raises(Exception):
actor = Actor.remote(0, 1, 2, arg3=3)
with pytest.raises(Exception):
actor = Actor.remote(0, arg0=1)
# Make sure we get an exception if the method is called incorrectly.
actor = Actor.remote(1)
with pytest.raises(Exception):
ray.get(actor.get_values.remote())
def test_variable_number_of_args(ray_start_regular):
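    # Actor constructors and methods should support *args, both alone and
    # combined with positional and default arguments.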
@ray.remote
class Actor(object):
def __init__(self, arg0, arg1=1, *args):
self.arg0 = arg0
self.arg1 = arg1
self.args = args
def get_values(self, arg0, arg1=2, *args):
return self.arg0 + arg0, self.arg1 + arg1, self.args, args
actor = Actor.remote(0)
assert ray.get(actor.get_values.remote(1)) == (1, 3, (), ())
actor = Actor.remote(1, 2)
assert ray.get(actor.get_values.remote(2, 3)) == (3, 5, (), ())
actor = Actor.remote(1, 2, "c")
assert ray.get(actor.get_values.remote(2, 3, "d")) == (3, 5, ("c", ),
("d", ))
actor = Actor.remote(1, 2, "a", "b", "c", "d")
assert ray.get(actor.get_values.remote(
2, 3, 1, 2, 3, 4)) == (3, 5, ("a", "b", "c", "d"), (1, 2, 3, 4))
@ray.remote
class Actor(object):
def __init__(self, *args):
self.args = args
def get_values(self, *args):
return self.args, args
a = Actor.remote()
assert ray.get(a.get_values.remote()) == ((), ())
a = Actor.remote(1)
assert ray.get(a.get_values.remote(2)) == ((1, ), (2, ))
a = Actor.remote(1, 2)
assert ray.get(a.get_values.remote(3, 4)) == ((1, 2), (3, 4))
def test_no_args(ray_start_regular):
@ray.remote
class Actor(object):
def __init__(self):
pass
def get_values(self):
pass
actor = Actor.remote()
assert ray.get(actor.get_values.remote()) is None
def test_no_constructor(ray_start_regular):
# If no __init__ method is provided, that should not be a problem.
@ray.remote
class Actor(object):
def get_values(self):
pass
actor = Actor.remote()
assert ray.get(actor.get_values.remote()) is None
def test_custom_classes(ray_start_regular):
class Foo(object):
def __init__(self, x):
self.x = x
@ray.remote
class Actor(object):
def __init__(self, f2):
self.f1 = Foo(1)
self.f2 = f2
def get_values1(self):
return self.f1, self.f2
def get_values2(self, f3):
return self.f1, self.f2, f3
actor = Actor.remote(Foo(2))
results1 = ray.get(actor.get_values1.remote())
assert results1[0].x == 1
assert results1[1].x == 2
results2 = ray.get(actor.get_values2.remote(Foo(3)))
assert results2[0].x == 1
assert results2[1].x == 2
assert results2[2].x == 3
def test_caching_actors(shutdown_only):
# Test defining actors before ray.init() has been called.
@ray.remote
class Foo(object):
def __init__(self):
pass
def get_val(self):
return 3
# Check that we can't actually create actors before ray.init() has been
# called.
with pytest.raises(Exception):
f = Foo.remote()
ray.init(num_cpus=1)
f = Foo.remote()
assert ray.get(f.get_val.remote()) == 3
def test_decorator_args(ray_start_regular):
# This is an invalid way of using the actor decorator.
with pytest.raises(Exception):
@ray.remote()
class Actor(object):
def __init__(self):
pass
# This is an invalid way of using the actor decorator.
with pytest.raises(Exception):
@ray.remote(invalid_kwarg=0) # noqa: F811
class Actor(object):
def __init__(self):
pass
# This is an invalid way of using the actor decorator.
with pytest.raises(Exception):
@ray.remote(num_cpus=0, invalid_kwarg=0) # noqa: F811
class Actor(object):
def __init__(self):
pass
# This is a valid way of using the decorator.
@ray.remote(num_cpus=1) # noqa: F811
class Actor(object):
def __init__(self):
pass
# This is a valid way of using the decorator.
@ray.remote(num_gpus=1) # noqa: F811
class Actor(object):
def __init__(self):
pass
# This is a valid way of using the decorator.
@ray.remote(num_cpus=1, num_gpus=1) # noqa: F811
class Actor(object):
def __init__(self):
pass
def test_random_id_generation(ray_start_regular):
@ray.remote
class Foo(object):
def __init__(self):
pass
# Make sure that seeding numpy does not interfere with the generation
# of actor IDs.
np.random.seed(1234)
random.seed(1234)
f1 = Foo.remote()
np.random.seed(1234)
random.seed(1234)
f2 = Foo.remote()
assert f1._ray_actor_id.id() != f2._ray_actor_id.id()
def test_actor_class_name(ray_start_regular):
@ray.remote
class Foo(object):
def __init__(self):
pass
Foo.remote()
r = ray.worker.global_worker.redis_client
actor_keys = r.keys("ActorClass*")
assert len(actor_keys) == 1
actor_class_info = r.hgetall(actor_keys[0])
assert actor_class_info[b"class_name"] == b"Foo"
assert actor_class_info[b"module"] == b"actor_test"
def test_multiple_return_values(ray_start_regular):
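    # The @ray.method(num_return_vals=...) decorator controls how many
    # object IDs a single actor method call returns.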
@ray.remote
class Foo(object):
def method0(self):
return 1
@ray.method(num_return_vals=1)
def method1(self):
return 1
@ray.method(num_return_vals=2)
def method2(self):
return 1, 2
@ray.method(num_return_vals=3)
def method3(self):
return 1, 2, 3
f = Foo.remote()
id0 = f.method0.remote()
assert ray.get(id0) == 1
id1 = f.method1.remote()
assert ray.get(id1) == 1
id2a, id2b = f.method2.remote()
assert ray.get([id2a, id2b]) == [1, 2]
id3a, id3b, id3c = f.method3.remote()
assert ray.get([id3a, id3b, id3c]) == [1, 2, 3]
def test_define_actor(ray_start_regular):
@ray.remote
class Test(object):
def __init__(self, x):
self.x = x
def f(self, y):
return self.x + y
t = Test.remote(2)
assert ray.get(t.f.remote(1)) == 3
# Make sure that calling an actor method directly raises an exception.
with pytest.raises(Exception):
t.f(1)
def test_actor_deletion(ray_start_regular):
    # Make sure that when an actor handle goes out of scope, the actor
# destructor is called.
@ray.remote
class Actor(object):
def getpid(self):
return os.getpid()
a = Actor.remote()
pid = ray.get(a.getpid.remote())
a = None
ray.test.test_utils.wait_for_pid_to_exit(pid)
actors = [Actor.remote() for _ in range(10)]
pids = ray.get([a.getpid.remote() for a in actors])
a = None
actors = None
[ray.test.test_utils.wait_for_pid_to_exit(pid) for pid in pids]
@ray.remote
class Actor(object):
def method(self):
return 1
# Make sure that if we create an actor and call a method on it
# immediately, the actor doesn't get killed before the method is
# called.
assert ray.get(Actor.remote().method.remote()) == 1
def test_actor_deletion_with_gpus(shutdown_only):
ray.init(num_cpus=1, num_gpus=1)
# When an actor that uses a GPU exits, make sure that the GPU resources
# are released.
@ray.remote(num_gpus=1)
class Actor(object):
def getpid(self):
return os.getpid()
for _ in range(5):
# If we can successfully create an actor, that means that enough
# GPU resources are available.
a = Actor.remote()
ray.get(a.getpid.remote())
def test_actor_state(ray_start_regular):
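    # Each actor instance should maintain its own independent state across
    # method calls.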
@ray.remote
class Counter(object):
def __init__(self):
self.value = 0
def increase(self):
self.value += 1
def value(self):
return self.value
c1 = Counter.remote()
c1.increase.remote()
assert ray.get(c1.value.remote()) == 1
c2 = Counter.remote()
c2.increase.remote()
c2.increase.remote()
assert ray.get(c2.value.remote()) == 2
def test_actor_class_methods(ray_start_regular):
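    # Classmethods defined on the wrapped class should be callable as actor
    # methods and should see the class-level attributes.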
class Foo(object):
x = 2
@classmethod
def as_remote(cls):
return ray.remote(cls)
@classmethod
def f(cls):
return cls.x
@classmethod
def g(cls, y):
return cls.x + y
def echo(self, value):
return value
a = Foo.as_remote().remote()
assert ray.get(a.echo.remote(2)) == 2
assert ray.get(a.f.remote()) == 2
assert ray.get(a.g.remote(2)) == 4
def test_multiple_actors(ray_start_regular):
@ray.remote
class Counter(object):
def __init__(self, value):
self.value = value
def increase(self):
self.value += 1
return self.value
def reset(self):
self.value = 0
num_actors = 20
num_increases = 50
# Create multiple actors.
actors = [Counter.remote(i) for i in range(num_actors)]
results = []
# Call each actor's method a bunch of times.
for i in range(num_actors):
results += [actors[i].increase.remote() for _ in range(num_increases)]
result_values = ray.get(results)
for i in range(num_actors):
v = result_values[(num_increases * i):(num_increases * (i + 1))]
assert v == list(range(i + 1, num_increases + i + 1))
# Reset the actor values.
[actor.reset.remote() for actor in actors]
# Interweave the method calls on the different actors.
results = []
for j in range(num_increases):
results += [actor.increase.remote() for actor in actors]
result_values = ray.get(results)
for j in range(num_increases):
v = result_values[(num_actors * j):(num_actors * (j + 1))]
assert v == num_actors * [j + 1]
@pytest.fixture
def ray_start_bigger():
# Start the Ray processes.
ray.init(num_cpus=10)
yield None
# The code after the yield will run as teardown code.
ray.shutdown()
def test_remote_function_within_actor(ray_start_bigger):
    # Make sure we can use remote functions within actors.
# Create some values to close over.
val1 = 1
val2 = 2
@ray.remote
def f(x):
return val1 + x
@ray.remote
def g(x):
return ray.get(f.remote(x))
@ray.remote
class Actor(object):
def __init__(self, x):
self.x = x
self.y = val2
self.object_ids = [f.remote(i) for i in range(5)]
self.values2 = ray.get([f.remote(i) for i in range(5)])
def get_values(self):
return self.x, self.y, self.object_ids, self.values2
def f(self):
return [f.remote(i) for i in range(5)]
def g(self):
return ray.get([g.remote(i) for i in range(5)])
def h(self, object_ids):
return ray.get(object_ids)
actor = Actor.remote(1)
values = ray.get(actor.get_values.remote())
assert values[0] == 1
assert values[1] == val2
assert ray.get(values[2]) == list(range(1, 6))
assert values[3] == list(range(1, 6))
assert ray.get(ray.get(actor.f.remote())) == list(range(1, 6))
assert ray.get(actor.g.remote()) == list(range(1, 6))
assert ray.get(actor.h.remote([f.remote(i) for i in range(5)])) == list(
range(1, 6))
def test_define_actor_within_actor(ray_start_bigger):
    # Make sure we can define actors within actors.
@ray.remote
class Actor1(object):
def __init__(self, x):
self.x = x
def new_actor(self, z):
@ray.remote
class Actor2(object):
def __init__(self, x):
self.x = x
def get_value(self):
return self.x
self.actor2 = Actor2.remote(z)
def get_values(self, z):
self.new_actor(z)
return self.x, ray.get(self.actor2.get_value.remote())
actor1 = Actor1.remote(3)
assert ray.get(actor1.get_values.remote(5)) == (3, 5)
def test_use_actor_within_actor(ray_start_bigger):
# Make sure we can use actors within actors.
@ray.remote
class Actor1(object):
def __init__(self, x):
self.x = x
def get_val(self):
return self.x
@ray.remote
class Actor2(object):
def __init__(self, x, y):
self.x = x
self.actor1 = Actor1.remote(y)
def get_values(self, z):
return self.x, ray.get(self.actor1.get_val.remote())
actor2 = Actor2.remote(3, 4)
assert ray.get(actor2.get_values.remote(5)) == (3, 4)
def test_define_actor_within_remote_function(ray_start_bigger):
    # Make sure we can define actors within remote functions.
@ray.remote
def f(x, n):
@ray.remote
class Actor1(object):
def __init__(self, x):
self.x = x
def get_value(self):
return self.x
actor = Actor1.remote(x)
return ray.get([actor.get_value.remote() for _ in range(n)])
assert ray.get(f.remote(3, 1)) == [3]
assert ray.get(
[f.remote(i, 20) for i in range(10)]) == [20 * [i] for i in range(10)]
def test_use_actor_within_remote_function(ray_start_bigger):
    # Make sure we can create and use actors within remote functions.
@ray.remote
class Actor1(object):
def __init__(self, x):
self.x = x
def get_values(self):
return self.x
@ray.remote
def f(x):
actor = Actor1.remote(x)
return ray.get(actor.get_values.remote())
assert ray.get(f.remote(3)) == 3
def test_actor_import_counter(ray_start_bigger):
# This is mostly a test of the export counters to make sure that when
# an actor is imported, all of the necessary remote functions have been
# imported.
# Export a bunch of remote functions.
num_remote_functions = 50
for i in range(num_remote_functions):
@ray.remote
def f():
return i
@ray.remote
def g():
@ray.remote
class Actor(object):
def __init__(self):
# This should use the last version of f.
self.x = ray.get(f.remote())
def get_val(self):
return self.x
actor = Actor.remote()
return ray.get(actor.get_val.remote())
assert ray.get(g.remote()) == num_remote_functions - 1
def test_inherit_actor_from_class(ray_start_regular):
# Make sure we can define an actor by inheriting from a regular class.
# Note that actors cannot inherit from other actors.
class Foo(object):
def __init__(self, x):
self.x = x
def f(self):
return self.x
def g(self, y):
return self.x + y
@ray.remote
class Actor(Foo):
def __init__(self, x):
Foo.__init__(self, x)
def get_value(self):
return self.f()
actor = Actor.remote(1)
assert ray.get(actor.get_value.remote()) == 1
assert ray.get(actor.g.remote(5)) == 6
def test_remote_functions_not_scheduled_on_actors(ray_start_regular):
# Make sure that regular remote functions are not scheduled on actors.
@ray.remote
class Actor(object):
def __init__(self):
pass
def get_id(self):
return ray.worker.global_worker.worker_id
a = Actor.remote()
actor_id = ray.get(a.get_id.remote())
@ray.remote
def f():
return ray.worker.global_worker.worker_id
resulting_ids = ray.get([f.remote() for _ in range(100)])
assert actor_id not in resulting_ids
def test_actors_on_nodes_with_no_cpus(ray_start_regular):
@ray.remote
class Foo(object):
def method(self):
pass
f = Foo.remote()
ready_ids, _ = ray.wait([f.method.remote()], timeout=100)
assert ready_ids == []
def test_actor_load_balancing(shutdown_only):
num_local_schedulers = 3
ray.worker._init(
start_ray_local=True,
num_cpus=1,
num_local_schedulers=num_local_schedulers)
@ray.remote
class Actor1(object):
def __init__(self):
pass
def get_location(self):
return ray.worker.global_worker.plasma_client.store_socket_name
# Create a bunch of actors.
num_actors = 30
num_attempts = 20
minimum_count = 5
# Make sure that actors are spread between the local schedulers.
attempts = 0
while attempts < num_attempts:
actors = [Actor1.remote() for _ in range(num_actors)]
locations = ray.get([actor.get_location.remote() for actor in actors])
names = set(locations)
counts = [locations.count(name) for name in names]
print("Counts are {}.".format(counts))
if (len(names) == num_local_schedulers
and all(count >= minimum_count for count in counts)):
break
attempts += 1
assert attempts < num_attempts
# Make sure we can get the results of a bunch of tasks.
results = []
for _ in range(1000):
index = np.random.randint(num_actors)
results.append(actors[index].get_location.remote())
ray.get(results)
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Failing with new GCS API on Linux.")
def test_actor_gpus(shutdown_only):
num_local_schedulers = 3
num_gpus_per_scheduler = 4
ray.worker._init(
start_ray_local=True,
num_local_schedulers=num_local_schedulers,
num_cpus=(num_local_schedulers * [10 * num_gpus_per_scheduler]),
num_gpus=(num_local_schedulers * [num_gpus_per_scheduler]))
@ray.remote(num_gpus=1)
class Actor1(object):
def __init__(self):
self.gpu_ids = ray.get_gpu_ids()
def get_location_and_ids(self):
assert ray.get_gpu_ids() == self.gpu_ids
return (ray.worker.global_worker.plasma_client.store_socket_name,
tuple(self.gpu_ids))
# Create one actor per GPU.
actors = [
Actor1.remote()
for _ in range(num_local_schedulers * num_gpus_per_scheduler)
]
# Make sure that no two actors are assigned to the same GPU.
locations_and_ids = ray.get(
[actor.get_location_and_ids.remote() for actor in actors])
node_names = {location for location, gpu_id in locations_and_ids}
assert len(node_names) == num_local_schedulers
location_actor_combinations = []
for node_name in node_names:
for gpu_id in range(num_gpus_per_scheduler):
location_actor_combinations.append((node_name, (gpu_id, )))
assert set(locations_and_ids) == set(location_actor_combinations)
# Creating a new actor should fail because all of the GPUs are being
# used.
a = Actor1.remote()
ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=10)
assert ready_ids == []
def test_actor_multiple_gpus(shutdown_only):
num_local_schedulers = 3
num_gpus_per_scheduler = 5
ray.worker._init(
start_ray_local=True,
num_local_schedulers=num_local_schedulers,
num_cpus=(num_local_schedulers * [10 * num_gpus_per_scheduler]),
num_gpus=(num_local_schedulers * [num_gpus_per_scheduler]))
@ray.remote(num_gpus=2)
class Actor1(object):
def __init__(self):
self.gpu_ids = ray.get_gpu_ids()
def get_location_and_ids(self):
assert ray.get_gpu_ids() == self.gpu_ids
return (ray.worker.global_worker.plasma_client.store_socket_name,
tuple(self.gpu_ids))
# Create some actors.
actors1 = [Actor1.remote() for _ in range(num_local_schedulers * 2)]
# Make sure that no two actors are assigned to the same GPU.
locations_and_ids = ray.get(
[actor.get_location_and_ids.remote() for actor in actors1])
node_names = {location for location, gpu_id in locations_and_ids}
assert len(node_names) == num_local_schedulers
# Keep track of which GPU IDs are being used for each location.
gpus_in_use = {node_name: [] for node_name in node_names}
for location, gpu_ids in locations_and_ids:
gpus_in_use[location].extend(gpu_ids)
for node_name in node_names:
assert len(set(gpus_in_use[node_name])) == 4
# Creating a new actor should fail because all of the GPUs are being
# used.
a = Actor1.remote()
ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=10)
assert ready_ids == []
# We should be able to create more actors that use only a single GPU.
@ray.remote(num_gpus=1)
class Actor2(object):
def __init__(self):
self.gpu_ids = ray.get_gpu_ids()
def get_location_and_ids(self):
return (ray.worker.global_worker.plasma_client.store_socket_name,
tuple(self.gpu_ids))
# Create some actors.
actors2 = [Actor2.remote() for _ in range(num_local_schedulers)]
# Make sure that no two actors are assigned to the same GPU.
locations_and_ids = ray.get(
[actor.get_location_and_ids.remote() for actor in actors2])
names = {location for location, gpu_id in locations_and_ids}
assert node_names == names
for location, gpu_ids in locations_and_ids:
gpus_in_use[location].extend(gpu_ids)
for node_name in node_names:
assert len(gpus_in_use[node_name]) == 5
assert set(gpus_in_use[node_name]) == set(range(5))
# Creating a new actor should fail because all of the GPUs are being
# used.
a = Actor2.remote()
ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=10)
assert ready_ids == []
def test_actor_different_numbers_of_gpus(shutdown_only):
# Test that we can create actors on two nodes that have different
# numbers of GPUs.
ray.worker._init(
start_ray_local=True,
num_local_schedulers=3,
num_cpus=[10, 10, 10],
num_gpus=[0, 5, 10])
@ray.remote(num_gpus=1)
class Actor1(object):
def __init__(self):
self.gpu_ids = ray.get_gpu_ids()
def get_location_and_ids(self):
return (ray.worker.global_worker.plasma_client.store_socket_name,
tuple(self.gpu_ids))
# Create some actors.
actors = [Actor1.remote() for _ in range(0 + 5 + 10)]
# Make sure that no two actors are assigned to the same GPU.
locations_and_ids = ray.get(
[actor.get_location_and_ids.remote() for actor in actors])
node_names = {location for location, gpu_id in locations_and_ids}
assert len(node_names) == 2
for node_name in node_names:
node_gpu_ids = [
gpu_id for location, gpu_id in locations_and_ids
if location == node_name
]
assert len(node_gpu_ids) in [5, 10]
assert set(node_gpu_ids) == {(i, ) for i in range(len(node_gpu_ids))}
# Creating a new actor should fail because all of the GPUs are being
# used.
a = Actor1.remote()
ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=10)
assert ready_ids == []
def test_actor_multiple_gpus_from_multiple_tasks(shutdown_only):
num_local_schedulers = 5
num_gpus_per_scheduler = 5
ray.worker._init(
start_ray_local=True,
num_local_schedulers=num_local_schedulers,
redirect_output=True,
num_cpus=(num_local_schedulers * [10 * num_gpus_per_scheduler]),
num_gpus=(num_local_schedulers * [num_gpus_per_scheduler]),
_internal_config=json.dumps({
"num_heartbeats_timeout": 1000
}))
@ray.remote
def create_actors(i, n):
@ray.remote(num_gpus=1)
class Actor(object):
def __init__(self, i, j):
self.gpu_ids = ray.get_gpu_ids()
def get_location_and_ids(self):
return ((
ray.worker.global_worker.plasma_client.store_socket_name),
tuple(self.gpu_ids))
def sleep(self):
time.sleep(100)
# Create n actors.
actors = []
for j in range(n):
actors.append(Actor.remote(i, j))
locations = ray.get(
[actor.get_location_and_ids.remote() for actor in actors])
# Put each actor to sleep for a long time to prevent them from getting
# terminated.
for actor in actors:
actor.sleep.remote()
return locations
all_locations = ray.get([
create_actors.remote(i, num_gpus_per_scheduler)
for i in range(num_local_schedulers)
])
# Make sure that no two actors are assigned to the same GPU.
node_names = {
location
for locations in all_locations for location, gpu_id in locations
}
assert len(node_names) == num_local_schedulers
# Keep track of which GPU IDs are being used for each location.
gpus_in_use = {node_name: [] for node_name in node_names}
for locations in all_locations:
for location, gpu_ids in locations:
gpus_in_use[location].extend(gpu_ids)
for node_name in node_names:
assert len(set(gpus_in_use[node_name])) == num_gpus_per_scheduler
@ray.remote(num_gpus=1)
class Actor(object):
def __init__(self):
self.gpu_ids = ray.get_gpu_ids()
def get_location_and_ids(self):
return (ray.worker.global_worker.plasma_client.store_socket_name,
tuple(self.gpu_ids))
# All the GPUs should be used up now.
a = Actor.remote()
ready_ids, _ = ray.wait([a.get_location_and_ids.remote()], timeout=10)
assert ready_ids == []
@pytest.mark.skipif(
sys.version_info < (3, 0), reason="This test requires Python 3.")
def test_actors_and_tasks_with_gpus(shutdown_only):
num_local_schedulers = 3
num_gpus_per_scheduler = 6
ray.worker._init(
start_ray_local=True,
num_local_schedulers=num_local_schedulers,
num_cpus=num_gpus_per_scheduler,
num_gpus=(num_local_schedulers * [num_gpus_per_scheduler]))
def check_intervals_non_overlapping(list_of_intervals):
for i in range(len(list_of_intervals)):
for j in range(i):
first_interval = list_of_intervals[i]
second_interval = list_of_intervals[j]
# Check that list_of_intervals[i] and list_of_intervals[j]
# don't overlap.
assert first_interval[0] < first_interval[1]
assert second_interval[0] < second_interval[1]
intervals_nonoverlapping = (
first_interval[1] <= second_interval[0]
or second_interval[1] <= first_interval[0])
assert intervals_nonoverlapping, (
"Intervals {} and {} are overlapping.".format(
first_interval, second_interval))
@ray.remote(num_gpus=1)
def f1():
t1 = time.monotonic()
time.sleep(0.2)
t2 = time.monotonic()
gpu_ids = ray.get_gpu_ids()
assert len(gpu_ids) == 1
assert gpu_ids[0] in range(num_gpus_per_scheduler)
return (ray.worker.global_worker.plasma_client.store_socket_name,
tuple(gpu_ids), [t1, t2])
@ray.remote(num_gpus=2)
def f2():
t1 = time.monotonic()
time.sleep(0.2)
t2 = time.monotonic()
gpu_ids = ray.get_gpu_ids()
assert len(gpu_ids) == 2
assert gpu_ids[0] in range(num_gpus_per_scheduler)
assert gpu_ids[1] in range(num_gpus_per_scheduler)
return (ray.worker.global_worker.plasma_client.store_socket_name,
tuple(gpu_ids), [t1, t2])
@ray.remote(num_gpus=1)
class Actor1(object):
def __init__(self):
self.gpu_ids = ray.get_gpu_ids()
assert len(self.gpu_ids) == 1
assert self.gpu_ids[0] in range(num_gpus_per_scheduler)
def get_location_and_ids(self):
assert ray.get_gpu_ids() == self.gpu_ids
return (ray.worker.global_worker.plasma_client.store_socket_name,
tuple(self.gpu_ids))
def locations_to_intervals_for_many_tasks():
# Launch a bunch of GPU tasks.
locations_ids_and_intervals = ray.get([
f1.remote()
for _ in range(5 * num_local_schedulers * num_gpus_per_scheduler)
] + [
f2.remote()
for _ in range(5 * num_local_schedulers * num_gpus_per_scheduler)
] + [
f1.remote()
for _ in range(5 * num_local_schedulers * num_gpus_per_scheduler)
])
locations_to_intervals = collections.defaultdict(lambda: [])
for location, gpu_ids, interval in locations_ids_and_intervals:
for gpu_id in gpu_ids:
locations_to_intervals[(location, gpu_id)].append(interval)
return locations_to_intervals
# Run a bunch of GPU tasks.
locations_to_intervals = locations_to_intervals_for_many_tasks()
# Make sure that all GPUs were used.
assert (len(locations_to_intervals) == num_local_schedulers *
num_gpus_per_scheduler)
# For each GPU, verify that the set of tasks that used this specific
# GPU did not overlap in time.
for locations in locations_to_intervals:
check_intervals_non_overlapping(locations_to_intervals[locations])
# Create an actor that uses a GPU.
a = Actor1.remote()
actor_location = ray.get(a.get_location_and_ids.remote())
actor_location = (actor_location[0], actor_location[1][0])
# This check makes sure that actor_location is formatted the same way
# that the keys of locations_to_intervals are formatted.
assert actor_location in locations_to_intervals
# Run a bunch of GPU tasks.
locations_to_intervals = locations_to_intervals_for_many_tasks()
# Make sure that all but one of the GPUs were used.
assert (len(locations_to_intervals) ==
num_local_schedulers * num_gpus_per_scheduler - 1)
# For each GPU, verify that the set of tasks that used this specific
# GPU did not overlap in time.
for locations in locations_to_intervals:
check_intervals_non_overlapping(locations_to_intervals[locations])
# Make sure that the actor's GPU was not used.
assert actor_location not in locations_to_intervals
# Create several more actors that use GPUs.
actors = [Actor1.remote() for _ in range(3)]
actor_locations = ray.get(
[actor.get_location_and_ids.remote() for actor in actors])
# Run a bunch of GPU tasks.
locations_to_intervals = locations_to_intervals_for_many_tasks()
    # Make sure that all but four of the GPUs were used (one for each of
    # the four actors created so far).
assert (len(locations_to_intervals) ==
num_local_schedulers * num_gpus_per_scheduler - 1 - 3)
# For each GPU, verify that the set of tasks that used this specific
# GPU did not overlap in time.
for locations in locations_to_intervals:
check_intervals_non_overlapping(locations_to_intervals[locations])
    # Make sure that the actors' GPUs were not used.
assert actor_location not in locations_to_intervals
for location in actor_locations:
assert location not in locations_to_intervals
# Create more actors to fill up all the GPUs.
more_actors = [
Actor1.remote()
for _ in range(num_local_schedulers * num_gpus_per_scheduler - 1 - 3)
]
# Wait for the actors to finish being created.
ray.get([actor.get_location_and_ids.remote() for actor in more_actors])
# Now if we run some GPU tasks, they should not be scheduled.
results = [f1.remote() for _ in range(30)]
ready_ids, remaining_ids = ray.wait(results, timeout=1000)
assert len(ready_ids) == 0
def test_actors_and_tasks_with_gpus_version_two(shutdown_only):
    # Create tasks and actors that both use GPUs and make sure that they
    # are given different GPUs.
ray.init(num_cpus=10, num_gpus=10)
@ray.remote(num_gpus=1)
def f():
time.sleep(4)
gpu_ids = ray.get_gpu_ids()
assert len(gpu_ids) == 1
return gpu_ids[0]
@ray.remote(num_gpus=1)
class Actor(object):
def __init__(self):
self.gpu_ids = ray.get_gpu_ids()
assert len(self.gpu_ids) == 1
def get_gpu_id(self):
assert ray.get_gpu_ids() == self.gpu_ids
return self.gpu_ids[0]
results = []
actors = []
for _ in range(5):
results.append(f.remote())
a = Actor.remote()
results.append(a.get_gpu_id.remote())
# Prevent the actor handle from going out of scope so that its GPU
# resources don't get released.
actors.append(a)
gpu_ids = ray.get(results)
assert set(gpu_ids) == set(range(10))
@pytest.mark.skipif(
sys.version_info < (3, 0), reason="This test requires Python 3.")
def test_actors_and_task_resource_bookkeeping(ray_start_regular):
@ray.remote
class Foo(object):
def __init__(self):
start = time.monotonic()
time.sleep(0.1)
end = time.monotonic()
self.interval = (start, end)
def get_interval(self):
return self.interval
def sleep(self):
start = time.monotonic()
time.sleep(0.01)
end = time.monotonic()
return start, end
# First make sure that we do not have more actor methods running at a
# time than we have CPUs.
actors = [Foo.remote() for _ in range(4)]
interval_ids = []
interval_ids += [actor.get_interval.remote() for actor in actors]
for _ in range(4):
interval_ids += [actor.sleep.remote() for actor in actors]
# Make sure that the intervals don't overlap.
intervals = ray.get(interval_ids)
intervals.sort(key=lambda x: x[0])
for interval1, interval2 in zip(intervals[:-1], intervals[1:]):
assert interval1[0] < interval1[1]
assert interval1[1] < interval2[0]
assert interval2[0] < interval2[1]
def test_blocking_actor_task(shutdown_only):
ray.init(num_cpus=1, num_gpus=1)
@ray.remote(num_gpus=1)
def f():
return 1
@ray.remote
class Foo(object):
def __init__(self):
pass
def blocking_method(self):
ray.get(f.remote())
# Make sure we can execute a blocking actor method even if there is
# only one CPU.
actor = Foo.remote()
ray.get(actor.blocking_method.remote())
@ray.remote(num_cpus=1)
class CPUFoo(object):
def __init__(self):
pass
def blocking_method(self):
ray.get(f.remote())
# Make sure that lifetime CPU resources are not released when actors
# block.
actor = CPUFoo.remote()
x_id = actor.blocking_method.remote()
ready_ids, remaining_ids = ray.wait([x_id], timeout=1000)
assert ready_ids == []
assert remaining_ids == [x_id]
@ray.remote(num_gpus=1)
class GPUFoo(object):
def __init__(self):
pass
def blocking_method(self):
ray.get(f.remote())
# Make sure that GPU resources are not released when actors block.
actor = GPUFoo.remote()
x_id = actor.blocking_method.remote()
ready_ids, remaining_ids = ray.wait([x_id], timeout=1000)
assert ready_ids == []
assert remaining_ids == [x_id]
def test_exception_raised_when_actor_node_dies(head_node_cluster):
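    # Tasks submitted to an actor whose node has died should fail with a
    # RayTaskError when their results are fetched.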
remote_node = head_node_cluster.add_node()
@ray.remote
class Counter(object):
def __init__(self):
self.x = 0
def local_plasma(self):
return ray.worker.global_worker.plasma_client.store_socket_name
def inc(self):
self.x += 1
return self.x
# Create an actor that is not on the local scheduler.
actor = Counter.remote()
while (ray.get(actor.local_plasma.remote()) !=
remote_node.get_plasma_store_name()):
actor = Counter.remote()
# Kill the second node.
head_node_cluster.remove_node(remote_node)
# Submit some new actor tasks both before and after the node failure is
# detected. Make sure that getting the result raises an exception.
for _ in range(10):
# Submit some new actor tasks.
x_ids = [actor.inc.remote() for _ in range(5)]
for x_id in x_ids:
with pytest.raises(ray.worker.RayTaskError):
# There is some small chance that ray.get will actually
# succeed (if the object is transferred before the raylet
# dies).
ray.get(x_id)
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Hanging with new GCS API.")
def test_actor_init_fails(head_node_cluster):
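    # Killing a node while many actors are still being created should not
    # prevent the actor methods from eventually succeeding; the actors can
    # be reconstructed on the remaining node.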
remote_node = head_node_cluster.add_node()
@ray.remote(max_reconstructions=1)
class Counter(object):
def __init__(self):
self.x = 0
def inc(self):
self.x += 1
return self.x
# Create many actors. It should take a while to finish initializing them.
actors = [Counter.remote() for _ in range(100)]
# Allow some time to forward the actor creation tasks to the other node.
time.sleep(0.1)
# Kill the second node.
head_node_cluster.remove_node(remote_node)
# Get all of the results
results = ray.get([actor.inc.remote() for actor in actors])
assert results == [1 for actor in actors]
def test_reconstruction_suppression(head_node_cluster):
num_local_schedulers = 10
worker_nodes = [
head_node_cluster.add_node() for _ in range(num_local_schedulers)
]
@ray.remote(max_reconstructions=1)
class Counter(object):
def __init__(self):
self.x = 0
def inc(self):
self.x += 1
return self.x
@ray.remote
def inc(actor_handle):
return ray.get(actor_handle.inc.remote())
# Make sure all of the actors have started.
actors = [Counter.remote() for _ in range(20)]
ray.get([actor.inc.remote() for actor in actors])
# Kill a node.
head_node_cluster.remove_node(worker_nodes[0])
# Submit several tasks per actor. These should be randomly scheduled to the
# nodes, so that multiple nodes will detect and try to reconstruct the
# actor that died, but only one should succeed.
results = []
for _ in range(10):
results += [inc.remote(actor) for actor in actors]
# Make sure that we can get the results from the reconstructed actor.
results = ray.get(results)
def setup_counter_actor(test_checkpoint=False,
save_exception=False,
resume_exception=False):
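    # Helper for the checkpointing tests below: starts a two-node cluster,
    # creates a Counter actor on the non-local node (optionally with
    # checkpointing enabled and with exceptions injected into checkpoint
    # save or resume), submits 100 inc() calls, and returns the actor
    # handle along with the IDs of those calls.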
ray.worker._init(
start_ray_local=True,
num_local_schedulers=2,
num_cpus=1,
redirect_output=True)
# Only set the checkpoint interval if we're testing with checkpointing.
checkpoint_interval = -1
if test_checkpoint:
checkpoint_interval = 5
@ray.remote(checkpoint_interval=checkpoint_interval)
class Counter(object):
_resume_exception = resume_exception
def __init__(self, save_exception):
self.x = 0
self.num_inc_calls = 0
self.save_exception = save_exception
self.restored = False
def local_plasma(self):
return ray.worker.global_worker.plasma_client.store_socket_name
def inc(self, *xs):
self.x += 1
self.num_inc_calls += 1
return self.x
def get_num_inc_calls(self):
return self.num_inc_calls
def test_restore(self):
# This method will only return True if __ray_restore__ has been
# called.
return self.restored
def __ray_save__(self):
if self.save_exception:
raise Exception("Exception raised in checkpoint save")
return self.x, self.save_exception
def __ray_restore__(self, checkpoint):
if self._resume_exception:
raise Exception("Exception raised in checkpoint resume")
self.x, self.save_exception = checkpoint
self.num_inc_calls = 0
self.restored = True
local_plasma = ray.worker.global_worker.plasma_client.store_socket_name
# Create an actor that is not on the local scheduler.
actor = Counter.remote(save_exception)
while ray.get(actor.local_plasma.remote()) == local_plasma:
actor = Counter.remote(save_exception)
args = [ray.put(0) for _ in range(100)]
ids = [actor.inc.remote(*args[i:]) for i in range(100)]
return actor, ids
@pytest.mark.skip("This test does not work yet.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Hanging with new GCS API.")
def test_checkpointing(shutdown_only):
actor, ids = setup_counter_actor(test_checkpoint=True)
# Wait for the last task to finish running.
ray.get(ids[-1])
# Kill the corresponding plasma store to get rid of the cached objects.
process = ray.services.all_processes[
ray.services.PROCESS_TYPE_PLASMA_STORE][1]
process.kill()
process.wait()
# Check that the actor restored from a checkpoint.
assert ray.get(actor.test_restore.remote())
# Check that we can submit another call on the actor and get the
# correct counter result.
x = ray.get(actor.inc.remote())
assert x == 101
# Check that the number of inc calls since actor initialization is less
# than the counter value, since the actor initialized from a
# checkpoint.
num_inc_calls = ray.get(actor.get_num_inc_calls.remote())
assert num_inc_calls < x
@pytest.mark.skip("This test does not work yet.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Hanging with new GCS API.")
def test_remote_checkpoint(shutdown_only):
actor, ids = setup_counter_actor(test_checkpoint=True)
# Do a remote checkpoint call and wait for it to finish.
ray.get(actor.__ray_checkpoint__.remote())
# Kill the corresponding plasma store to get rid of the cached objects.
process = ray.services.all_processes[
ray.services.PROCESS_TYPE_PLASMA_STORE][1]
process.kill()
process.wait()
# Check that the actor restored from a checkpoint.
assert ray.get(actor.test_restore.remote())
# Check that the number of inc calls since actor initialization is
# exactly zero, since there could not have been another inc call since
# the remote checkpoint.
num_inc_calls = ray.get(actor.get_num_inc_calls.remote())
assert num_inc_calls == 0
# Check that we can submit another call on the actor and get the
# correct counter result.
x = ray.get(actor.inc.remote())
assert x == 101
@pytest.mark.skip("This test does not work yet.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Hanging with new GCS API.")
def test_lost_checkpoint(shutdown_only):
actor, ids = setup_counter_actor(test_checkpoint=True)
# Wait for the first fraction of tasks to finish running.
ray.get(ids[len(ids) // 10])
# Kill the corresponding plasma store to get rid of the cached objects.
process = ray.services.all_processes[
ray.services.PROCESS_TYPE_PLASMA_STORE][1]
process.kill()
process.wait()
# Check that the actor restored from a checkpoint.
assert ray.get(actor.test_restore.remote())
# Check that we can submit another call on the actor and get the
# correct counter result.
x = ray.get(actor.inc.remote())
assert x == 101
# Check that the number of inc calls since actor initialization is less
# than the counter value, since the actor initialized from a
# checkpoint.
num_inc_calls = ray.get(actor.get_num_inc_calls.remote())
assert num_inc_calls < x
assert 5 < num_inc_calls
@pytest.mark.skip("This test does not work yet.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Hanging with new GCS API.")
def test_checkpoint_exception(shutdown_only):
actor, ids = setup_counter_actor(test_checkpoint=True, save_exception=True)
# Wait for the last task to finish running.
ray.get(ids[-1])
# Kill the corresponding plasma store to get rid of the cached objects.
process = ray.services.all_processes[
ray.services.PROCESS_TYPE_PLASMA_STORE][1]
process.kill()
process.wait()
# Check that we can submit another call on the actor and get the
# correct counter result.
x = ray.get(actor.inc.remote())
assert x == 101
# Check that the number of inc calls since actor initialization is
# equal to the counter value, since the actor did not initialize from a
# checkpoint.
num_inc_calls = ray.get(actor.get_num_inc_calls.remote())
assert num_inc_calls == x
# Check that errors were raised when trying to save the checkpoint.
errors = ray.error_info()
assert 0 < len(errors)
for error in errors:
assert error["type"] == ray_constants.CHECKPOINT_PUSH_ERROR
@pytest.mark.skip("This test does not work yet.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Hanging with new GCS API.")
def test_checkpoint_resume_exception(shutdown_only):
actor, ids = setup_counter_actor(
test_checkpoint=True, resume_exception=True)
# Wait for the last task to finish running.
ray.get(ids[-1])
# Kill the corresponding plasma store to get rid of the cached objects.
process = ray.services.all_processes[
ray.services.PROCESS_TYPE_PLASMA_STORE][1]
process.kill()
process.wait()
# Check that we can submit another call on the actor and get the
# correct counter result.
x = ray.get(actor.inc.remote())
assert x == 101
# Check that the number of inc calls since actor initialization is
# equal to the counter value, since the actor did not initialize from a
# checkpoint.
num_inc_calls = ray.get(actor.get_num_inc_calls.remote())
assert num_inc_calls == x
# Check that an error was raised when trying to resume from the
# checkpoint.
errors = ray.error_info()
assert len(errors) == 1
for error in errors:
assert error["type"] == ray_constants.CHECKPOINT_PUSH_ERROR
@pytest.mark.skip("Fork/join consistency not yet implemented.")
def test_distributed_handle(shutdown_only):
counter, ids = setup_counter_actor(test_checkpoint=False)
@ray.remote
def fork_many_incs(counter, num_incs):
x = None
for _ in range(num_incs):
x = counter.inc.remote()
# Only call ray.get() on the last task submitted.
return ray.get(x)
# Fork num_iters times.
count = ray.get(ids[-1])
num_incs = 100
num_iters = 10
forks = [
fork_many_incs.remote(counter, num_incs) for _ in range(num_iters)
]
ray.wait(forks, num_returns=len(forks))
count += num_incs * num_iters
# Kill the second plasma store to get rid of the cached objects and
# trigger the corresponding local scheduler to exit.
process = ray.services.all_processes[
ray.services.PROCESS_TYPE_PLASMA_STORE][1]
process.kill()
process.wait()
# Check that the actor did not restore from a checkpoint.
assert not ray.get(counter.test_restore.remote())
# Check that we can submit another call on the actor and get the
# correct counter result.
x = ray.get(counter.inc.remote())
assert x == count + 1
@pytest.mark.skip("This test does not work yet.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Hanging with new GCS API.")
def test_remote_checkpoint_distributed_handle(shutdown_only):
counter, ids = setup_counter_actor(test_checkpoint=True)
@ray.remote
def fork_many_incs(counter, num_incs):
x = None
for _ in range(num_incs):
x = counter.inc.remote()
# Only call ray.get() on the last task submitted.
return ray.get(x)
# Fork num_iters times.
count = ray.get(ids[-1])
num_incs = 100
num_iters = 10
forks = [
fork_many_incs.remote(counter, num_incs) for _ in range(num_iters)
]
ray.wait(forks, num_returns=len(forks))
ray.wait([counter.__ray_checkpoint__.remote()])
count += num_incs * num_iters
# Kill the second plasma store to get rid of the cached objects and
# trigger the corresponding local scheduler to exit.
process = ray.services.all_processes[
ray.services.PROCESS_TYPE_PLASMA_STORE][1]
process.kill()
process.wait()
# Check that the actor restored from a checkpoint.
assert ray.get(counter.test_restore.remote())
# Check that the number of inc calls since actor initialization is
# exactly zero, since there could not have been another inc call since
# the remote checkpoint.
num_inc_calls = ray.get(counter.get_num_inc_calls.remote())
assert num_inc_calls == 0
# Check that we can submit another call on the actor and get the
# correct counter result.
x = ray.get(counter.inc.remote())
assert x == count + 1
@pytest.mark.skip("Fork/join consistency not yet implemented.")
def test_checkpoint_distributed_handle(shutdown_only):
counter, ids = setup_counter_actor(test_checkpoint=True)
@ray.remote
def fork_many_incs(counter, num_incs):
x = None
for _ in range(num_incs):
x = counter.inc.remote()
# Only call ray.get() on the last task submitted.
return ray.get(x)
# Fork num_iters times.
count = ray.get(ids[-1])
num_incs = 100
num_iters = 10
forks = [
fork_many_incs.remote(counter, num_incs) for _ in range(num_iters)
]
ray.wait(forks, num_returns=len(forks))
count += num_incs * num_iters
# Kill the second plasma store to get rid of the cached objects and
# trigger the corresponding local scheduler to exit.
process = ray.services.all_processes[
ray.services.PROCESS_TYPE_PLASMA_STORE][1]
process.kill()
process.wait()
# Check that the actor restored from a checkpoint.
assert ray.get(counter.test_restore.remote())
# Check that we can submit another call on the actor and get the
# correct counter result.
x = ray.get(counter.inc.remote())
assert x == count + 1
def _test_nondeterministic_reconstruction(num_forks, num_items_per_fork,
num_forks_to_wait):
ray.worker._init(
start_ray_local=True,
num_local_schedulers=2,
num_cpus=1,
redirect_output=True)
# Make a shared queue.
@ray.remote
class Queue(object):
def __init__(self):
self.queue = []
def local_plasma(self):
return ray.worker.global_worker.plasma_client.store_socket_name
def push(self, item):
self.queue.append(item)
def read(self):
return self.queue
# Schedule the shared queue onto the remote local scheduler.
local_plasma = ray.worker.global_worker.plasma_client.store_socket_name
actor = Queue.remote()
while ray.get(actor.local_plasma.remote()) == local_plasma:
actor = Queue.remote()
# A task that takes in the shared queue and a list of items to enqueue,
# one by one.
@ray.remote
def enqueue(queue, items):
done = None
for item in items:
done = queue.push.remote(item)
# TODO(swang): Return the object ID returned by the last method
# called on the shared queue, so that the caller of enqueue can
# wait for all of the queue methods to complete. This can be
# removed once join consistency is implemented.
return [done]
# Call the enqueue task num_forks times, each with num_items_per_fork
# unique objects to push onto the shared queue.
enqueue_tasks = []
for fork in range(num_forks):
enqueue_tasks.append(
enqueue.remote(actor,
[(fork, i) for i in range(num_items_per_fork)]))
# Wait for the forks to complete their tasks.
enqueue_tasks = ray.get(enqueue_tasks)
enqueue_tasks = [fork_ids[0] for fork_ids in enqueue_tasks]
ray.wait(enqueue_tasks, num_returns=num_forks_to_wait)
# Read the queue to get the initial order of execution.
queue = ray.get(actor.read.remote())
# Kill the second plasma store to get rid of the cached objects and
# trigger the corresponding local scheduler to exit.
process = ray.services.all_processes[
ray.services.PROCESS_TYPE_PLASMA_STORE][1]
process.kill()
process.wait()
# Read the queue again and check for deterministic reconstruction.
ray.get(enqueue_tasks)
reconstructed_queue = ray.get(actor.read.remote())
# Make sure the final queue has all items from all forks.
assert len(reconstructed_queue) == num_forks * num_items_per_fork
# Make sure that the prefix of the final queue matches the queue from
# the initial execution.
assert queue == reconstructed_queue[:len(queue)]
@pytest.mark.skip("This test does not work yet.")
@pytest.mark.skipif(
os.environ.get("RAY_USE_NEW_GCS") == "on",
reason="Currently doesn't work with the new GCS.")
def test_nondeterministic_reconstruction(shutdown_only):
_test_nondeterministic_reconstruction(10, 100, 10)
@pytest.mark.skip("Nondeterministic reconstruction currently not supported "
"when there are concurrent forks that didn't finish "
"initial execution.")
def test_nondeterministic_reconstruction_concurrent_forks(shutdown_only):
_test_nondeterministic_reconstruction(10, 100, 1)
@pytest.fixture
def setup_queue_actor():
ray.init(num_cpus=1)
@ray.remote
class Queue(object):
def __init__(self):
self.queue = []
def enqueue(self, key, item):
self.queue.append((key, item))
def read(self):
return self.queue
yield Queue.remote()
# The code after the yield will run as teardown code.
ray.shutdown()
def test_fork(setup_queue_actor):
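    # Many concurrent tasks should be able to fork the same actor handle
    # and submit methods through it.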
queue = setup_queue_actor
@ray.remote
def fork(queue, key, item):
return ray.get(queue.enqueue.remote(key, item))
# Fork num_iters times.
num_iters = 100
ray.get([fork.remote(queue, i, 0) for i in range(num_iters)])
items = ray.get(queue.read.remote())
for i in range(num_iters):
filtered_items = [item[1] for item in items if item[0] == i]
assert filtered_items == list(range(1))
def test_fork_consistency(setup_queue_actor):
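    # Method calls submitted through a forked actor handle should be
    # applied in per-handle submission order.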
queue = setup_queue_actor
@ray.remote
def fork(queue, key, num_items):
x = None
for item in range(num_items):
x = queue.enqueue.remote(key, item)
return ray.get(x)
    # Fork num_forks times.
num_forks = 10
num_items_per_fork = 100
ray.get(
[fork.remote(queue, i, num_items_per_fork) for i in range(num_forks)])
items = ray.get(queue.read.remote())
for i in range(num_forks):
filtered_items = [item[1] for item in items if item[0] == i]
assert filtered_items == list(range(num_items_per_fork))
@pytest.mark.skip("Garbage collection for distributed actor handles not "
"implemented.")
def test_garbage_collection(setup_queue_actor):
queue = setup_queue_actor
@ray.remote
def fork(queue):
for i in range(10):
x = queue.enqueue.remote(0, i)
time.sleep(0.1)
return ray.get(x)
x = fork.remote(queue)
ray.get(queue.read.remote())
del queue
print(ray.get(x))
def test_calling_put_on_actor_handle(ray_start_regular):
@ray.remote
class Counter(object):
def __init__(self):
self.x = 0
def inc(self):
self.x += 1
return self.x
@ray.remote
def f():
return Counter.remote()
@ray.remote
def g():
return [Counter.remote()]
# Currently, calling ray.put on an actor handle is allowed, but is
# there a good use case?
counter = Counter.remote()
counter_id = ray.put(counter)
new_counter = ray.get(counter_id)
assert ray.get(new_counter.inc.remote()) == 1
assert ray.get(counter.inc.remote()) == 2
assert ray.get(new_counter.inc.remote()) == 3
with pytest.raises(Exception):
ray.get(f.remote())
# The below test works, but do we want to disallow this usage?
ray.get(g.remote())
def test_pickling_actor_handle(ray_start_regular):
@ray.remote
class Foo(object):
def method(self):
pass
f = Foo.remote()
new_f = ray.worker.pickle.loads(ray.worker.pickle.dumps(f))
# Verify that we can call a method on the unpickled handle. TODO(rkn):
# we should also test this from a different driver.
ray.get(new_f.method.remote())
def test_pickled_actor_handle_call_in_method_twice(ray_start_regular):
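    # An actor handle captured in a closure (and therefore pickled into
    # another actor) should remain usable across multiple method calls.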
@ray.remote
class Actor1(object):
def f(self):
return 1
@ray.remote
class Actor2(object):
def __init__(self, constructor):
self.actor = constructor()
def step(self):
ray.get(self.actor.f.remote())
a = Actor1.remote()
b = Actor2.remote(lambda: a)
ray.get(b.step.remote())
ray.get(b.step.remote())
def test_register_and_get_named_actors(ray_start_regular):
# TODO(heyucongtom): We should test this from another driver.
@ray.remote
class Foo(object):
def __init__(self):
self.x = 0
def method(self):
self.x += 1
return self.x
f1 = Foo.remote()
# Test saving f.
ray.experimental.register_actor("f1", f1)
# Test getting f.
f2 = ray.experimental.get_actor("f1")
assert f1._actor_id == f2._actor_id
    # Registering the same name again should raise an error.
with pytest.raises(ValueError):
ray.experimental.register_actor("f1", f2)
# Test register with wrong object type.
with pytest.raises(TypeError):
ray.experimental.register_actor("f3", 1)
# Test getting a nonexistent actor.
with pytest.raises(ValueError):
ray.experimental.get_actor("nonexistent")
    # Method calls through either handle should mutate the same actor.
assert ray.get(f1.method.remote()) == 1
assert ray.get(f2.method.remote()) == 2
assert ray.get(f1.method.remote()) == 3
assert ray.get(f2.method.remote()) == 4
@pytest.mark.skipif(
sys.version_info < (3, 0),
reason="This test is currently failing on Python 2.7.")
def test_lifetime_and_transient_resources(ray_start_regular):
# This actor acquires resources only when running methods.
@ray.remote
class Actor1(object):
def method(self):
pass
# This actor acquires resources for its lifetime.
@ray.remote(num_cpus=1)
class Actor2(object):
def method(self):
pass
actor1s = [Actor1.remote() for _ in range(10)]
ray.get([a.method.remote() for a in actor1s])
actor2s = [Actor2.remote() for _ in range(2)]
results = [a.method.remote() for a in actor2s]
ready_ids, remaining_ids = ray.wait(
results, num_returns=len(results), timeout=1000)
assert len(ready_ids) == 1
def test_custom_label_placement(shutdown_only):
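    # Actors with custom resource requirements should be placed on the node
    # that advertises the matching custom resource.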
ray.worker._init(
start_ray_local=True,
num_local_schedulers=2,
num_cpus=2,
resources=[{
"CustomResource1": 2
}, {
"CustomResource2": 2
}])
@ray.remote(resources={"CustomResource1": 1})
class ResourceActor1(object):
def get_location(self):
return ray.worker.global_worker.plasma_client.store_socket_name
@ray.remote(resources={"CustomResource2": 1})
class ResourceActor2(object):
def get_location(self):
return ray.worker.global_worker.plasma_client.store_socket_name
local_plasma = ray.worker.global_worker.plasma_client.store_socket_name
# Create some actors.
actors1 = [ResourceActor1.remote() for _ in range(2)]
actors2 = [ResourceActor2.remote() for _ in range(2)]
locations1 = ray.get([a.get_location.remote() for a in actors1])
locations2 = ray.get([a.get_location.remote() for a in actors2])
for location in locations1:
assert location == local_plasma
for location in locations2:
assert location != local_plasma
def test_creating_more_actors_than_resources(shutdown_only):
ray.init(num_cpus=10, num_gpus=2, resources={"CustomResource1": 1})
@ray.remote(num_gpus=1)
class ResourceActor1(object):
def method(self):
return ray.get_gpu_ids()[0]
@ray.remote(resources={"CustomResource1": 1})
class ResourceActor2(object):
def method(self):
pass
# Make sure the first two actors get created and the third one does
# not.
actor1 = ResourceActor1.remote()
result1 = actor1.method.remote()
ray.wait([result1])
actor2 = ResourceActor1.remote()
result2 = actor2.method.remote()
ray.wait([result2])
actor3 = ResourceActor1.remote()
result3 = actor3.method.remote()
ready_ids, _ = ray.wait([result3], timeout=200)
assert len(ready_ids) == 0
# By deleting actor1, we free up resources to create actor3.
del actor1
results = ray.get([result1, result2, result3])
assert results[0] == results[2]
assert set(results) == {0, 1}
# Make sure that when one actor goes out of scope a new actor is
# created because some resources have been freed up.
results = []
for _ in range(3):
actor = ResourceActor2.remote()
object_id = actor.method.remote()
results.append(object_id)
# Wait for the task to execute. We do this because otherwise it may
# be possible for the __ray_terminate__ task to execute before the
# method.
ray.wait([object_id])
ray.get(results)
def test_actor_eviction(shutdown_only):
@ray.remote
class Actor(object):
def __init__(self):
pass
def create_object(self, size):
return np.random.rand(size)
object_store_memory = 10**8
ray.init(
object_store_memory=object_store_memory,
_internal_config=json.dumps({
"initial_reconstruction_timeout_milliseconds": 200
}))
a = Actor.remote()
# Submit enough methods on the actor so that they exceed the size of the
# object store.
objects = []
num_objects = 20
for _ in range(num_objects):
obj = a.create_object.remote(object_store_memory // num_objects)
objects.append(obj)
# Get each object once to make sure each object gets created.
ray.get(obj)
# Get each object again. At this point, the earlier objects should have
# been evicted.
num_evicted, num_success = 0, 0
for obj in objects:
try:
ray.get(obj)
num_success += 1
except ray.worker.RayTaskError:
num_evicted += 1
# Some objects should have been evicted, and some should still be in the
# object store.
assert num_evicted > 0
assert num_success > 0
def test_actor_reconstruction(ray_start_regular):
"""Test actor reconstruction when actor process is killed."""
@ray.remote(max_reconstructions=1)
class ReconstructableActor(object):
"""An actor that will be reconstructed at most once."""
def __init__(self):
self.value = 0
def increase(self):
self.value += 1
return self.value
def get_pid(self):
return os.getpid()
def kill_actor(actor):
"""Kill actor process."""
pid = ray.get(actor.get_pid.remote())
os.kill(pid, signal.SIGKILL)
time.sleep(1)
actor = ReconstructableActor.remote()
# Call increase 3 times
for _ in range(3):
ray.get(actor.increase.remote())
    # Kill the actor process.
kill_actor(actor)
# Call increase again.
    # Check that the actor is reconstructed and that the value is 4.
assert ray.get(actor.increase.remote()) == 4
    # Kill the actor process one more time.
kill_actor(actor)
# The actor has exceeded max reconstructions, and this task should fail.
with pytest.raises(ray.worker.RayTaskError):
ray.get(actor.increase.remote())
# Create another actor.
actor = ReconstructableActor.remote()
    # Intentionally exit the actor.
actor.__ray_terminate__.remote()
# Check that the actor won't be reconstructed.
with pytest.raises(ray.worker.RayTaskError):
ray.get(actor.increase.remote())
def test_actor_reconstruction_on_node_failure(head_node_cluster):
"""Test actor reconstruction when node dies unexpectedly."""
cluster = head_node_cluster
max_reconstructions = 3
# Add a few nodes to the cluster.
# Use custom resource to make sure the actor is only created on worker
# nodes, not on the head node.
for _ in range(max_reconstructions + 2):
cluster.add_node(
resources={"a": 1},
_internal_config=json.dumps({
"initial_reconstruction_timeout_milliseconds": 200,
"num_heartbeats_timeout": 10,
}),
)
def kill_node(object_store_socket):
node_to_remove = None
for node in cluster.worker_nodes:
if object_store_socket == node.get_plasma_store_name():
node_to_remove = node
cluster.remove_node(node_to_remove)
@ray.remote(max_reconstructions=max_reconstructions, resources={"a": 1})
class MyActor(object):
def __init__(self):
self.value = 0
def increase(self):
self.value += 1
return self.value
def get_object_store_socket(self):
return ray.worker.global_worker.plasma_client.store_socket_name
actor = MyActor.remote()
# Call increase 3 times.
for _ in range(3):
ray.get(actor.increase.remote())
for i in range(max_reconstructions):
object_store_socket = ray.get(actor.get_object_store_socket.remote())
# Kill actor's node and the actor should be reconstructed
# on a different node.
kill_node(object_store_socket)
# Call increase again.
# Check that the actor is reconstructed and value is correct.
assert ray.get(actor.increase.remote()) == 4 + i
# Check that the actor is now on a different node.
assert object_store_socket != ray.get(
actor.get_object_store_socket.remote())
    # Kill the node again.
object_store_socket = ray.get(actor.get_object_store_socket.remote())
kill_node(object_store_socket)
# The actor has exceeded max reconstructions, and this task should fail.
with pytest.raises(ray.worker.RayTaskError):
ray.get(actor.increase.remote())
def test_multiple_actor_reconstruction(head_node_cluster):
    # This test can be made more stressful by increasing the numbers below.
    # The total number of actors created will be
    # num_actors_at_a_time * (num_local_schedulers + 1).
num_local_schedulers = 5
num_actors_at_a_time = 3
num_function_calls_at_a_time = 10
worker_nodes = [
head_node_cluster.add_node(
resources={"CPU": 3},
_internal_config=json.dumps({
"initial_reconstruction_timeout_milliseconds": 200,
"num_heartbeats_timeout": 10,
})) for _ in range(num_local_schedulers)
]
@ray.remote(max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION)
class SlowCounter(object):
def __init__(self):
self.x = 0
def inc(self, duration):
time.sleep(duration)
self.x += 1
return self.x
# Create some initial actors.
actors = [SlowCounter.remote() for _ in range(num_actors_at_a_time)]
# Wait for the actors to start up.
time.sleep(1)
# This is a mapping from actor handles to object IDs returned by
# methods on that actor.
result_ids = collections.defaultdict(lambda: [])
# In a loop we are going to create some actors, run some methods, kill
# a local scheduler, and run some more methods.
for node in worker_nodes:
# Create some actors.
actors.extend(
[SlowCounter.remote() for _ in range(num_actors_at_a_time)])
# Run some methods.
for j in range(len(actors)):
actor = actors[j]
for _ in range(num_function_calls_at_a_time):
result_ids[actor].append(actor.inc.remote(j**2 * 0.000001))
# Kill a node.
head_node_cluster.remove_node(node)
# Run some more methods.
for j in range(len(actors)):
actor = actors[j]
for _ in range(num_function_calls_at_a_time):
result_ids[actor].append(actor.inc.remote(j**2 * 0.000001))
# Get the results and check that they have the correct values.
for _, result_id_list in result_ids.items():
results = list(range(1, len(result_id_list) + 1))
assert ray.get(result_id_list) == results