2018-08-31 11:24:15 -07:00
|
|
|
from __future__ import absolute_import
|
|
|
|
from __future__ import division
|
|
|
|
from __future__ import print_function
|
2017-01-25 22:53:48 -08:00
|
|
|
|
2018-05-13 21:22:01 -07:00
|
|
|
import os
|
2018-08-31 11:24:15 -07:00
|
|
|
import pytest
|
2017-01-25 22:53:48 -08:00
|
|
|
import subprocess
|
|
|
|
import time
|
|
|
|
|
2018-05-24 23:35:25 -07:00
|
|
|
import ray
|
2018-10-02 00:08:47 -07:00
|
|
|
from ray.test.test_utils import (run_and_get_output, run_string_as_driver,
|
|
|
|
run_string_as_driver_nonblocking)
|
2018-09-02 17:34:48 -07:00
|
|
|
|
|
|
|
|
2018-08-31 11:24:15 -07:00
|
|
|
@pytest.fixture
def ray_start_head():
    """Start a two-CPU Ray head node and yield its Redis address.

    The address is parsed out of the ``redis_address="..."`` snippet found
    in the output of ``ray start --head --num-cpus=2``. After the test, the
    current driver is disconnected and the cluster is stopped.

    Yields:
        The text between the quotes of ``redis_address="..."``.

    Raises:
        RuntimeError: If the ``ray start`` output contains no
            ``redis_address="`` marker.
    """
    out = run_and_get_output(["ray", "start", "--head", "--num-cpus=2"])
    # Get the redis address from the output.
    redis_substring_prefix = "redis_address=\""
    prefix_location = out.find(redis_substring_prefix)
    if prefix_location == -1:
        # str.find returns -1 when the marker is absent; slicing with that
        # would silently hand the test a garbage address. Teardown code does
        # not run when fixture setup raises, so stop the cluster here first.
        subprocess.Popen(["ray", "stop"]).wait()
        raise RuntimeError(
            "Could not find {!r} in the output of 'ray start': {!r}".format(
                redis_substring_prefix, out))
    redis_address_location = prefix_location + len(redis_substring_prefix)
    # The address runs up to the closing quote.
    redis_address = out[redis_address_location:].split("\"")[0]

    yield redis_address

    try:
        # Disconnect from the Ray cluster.
        ray.shutdown()
    finally:
        # Kill the Ray cluster even if disconnecting raised, so that a
        # leftover cluster cannot interfere with later tests.
        subprocess.Popen(["ray", "stop"]).wait()
|
2017-07-13 14:53:57 -07:00
|
|
|
|
|
|
|
|
2018-08-31 11:24:15 -07:00
|
|
|
def test_error_isolation(ray_start_head):
    """Check that each driver only sees the errors of its own tasks.

    This driver runs a failing remote function and waits for the error to
    show up in ``ray.error_info()``. A second driver is then run which
    asserts that it starts with no errors, triggers its own (different)
    error and checks that it sees exactly that one. Finally this driver
    verifies that it still sees only its own error.

    Args:
        ray_start_head: Redis address yielded by the fixture of the same
            name.
    """
    redis_address = ray_start_head
    # Connect a driver to the Ray cluster.
    ray.init(redis_address=redis_address)

    # There shouldn't be any errors yet.
    assert len(ray.error_info()) == 0

    error_string1 = "error_string1"
    error_string2 = "error_string2"

    @ray.remote
    def f():
        raise Exception(error_string1)

    # Run a remote function that throws an error.
    with pytest.raises(Exception):
        ray.get(f.remote())

    # Wait for the error to appear in Redis. The wait is bounded so that a
    # lost (or duplicated) error fails the test instead of hanging the whole
    # test run forever.
    wait_timeout_s = 30
    deadline = time.time() + wait_timeout_s
    while len(ray.error_info()) != 1:
        assert time.time() < deadline, (
            "Timed out after {} seconds waiting for exactly one error, "
            "got {}.".format(wait_timeout_s, len(ray.error_info())))
        time.sleep(0.1)
        print("Waiting for error to appear.")

    # Make sure we got the error.
    assert len(ray.error_info()) == 1
    assert error_string1 in ray.error_info()[0]["message"]

    # Start another driver and make sure that it does not receive this
    # error. Make the other driver throw an error, and make sure it
    # receives that error.
    driver_script = """
import ray
import time

ray.init(redis_address="{}")

time.sleep(1)
assert len(ray.error_info()) == 0

@ray.remote
def f():
    raise Exception("{}")

try:
    ray.get(f.remote())
except Exception as e:
    pass

while len(ray.error_info()) != 1:
    print(len(ray.error_info()))
    time.sleep(0.1)
assert len(ray.error_info()) == 1

assert "{}" in ray.error_info()[0]["message"]

print("success")
""".format(redis_address, error_string2, error_string2)

    out = run_string_as_driver(driver_script)
    # Make sure the other driver succeeded.
    assert "success" in out

    # Make sure that the other error message doesn't show up for this
    # driver.
    assert len(ray.error_info()) == 1
    assert error_string1 in ray.error_info()[0]["message"]
|
2017-01-25 22:53:48 -08:00
|
|
|
|
|
|
|
|
2018-08-31 11:24:15 -07:00
|
|
|
def test_remote_function_isolation(ray_start_head):
    """Run same-named remote functions from two different drivers.

    A second driver defines its own ``f`` and ``g`` and calls them 10000
    times, checking it always gets ``[3, 4]``. Afterwards this driver
    defines ``f`` and ``g`` with different return values (and a different
    number of arguments for ``g``) and checks it always gets ``[1, 2]``.

    Args:
        ray_start_head: Redis address yielded by the fixture of the same
            name.
    """
    # Attach this process to the cluster started by the fixture.
    ray.init(redis_address=ray_start_head)

    # Source of the second driver. It registers its own "f" and "g" and
    # prints "success" once all of its calls returned the expected values.
    other_driver_template = """
import ray
import time
ray.init(redis_address="{}")
@ray.remote
def f():
    return 3
@ray.remote
def g(x, y):
    return 4
for _ in range(10000):
    result = ray.get([f.remote(), g.remote(0, 0)])
    assert result == [3, 4]
print("success")
"""
    other_driver_script = other_driver_template.format(ray_start_head)

    # The other driver runs to completion before this driver defines its
    # own versions of the functions.
    out = run_string_as_driver(other_driver_script)

    @ray.remote
    def f():
        return 1

    @ray.remote
    def g(x):
        return 2

    expected = [1, 2]
    for _ in range(10000):
        assert ray.get([f.remote(), g.remote(0)]) == expected

    # The second driver must have finished successfully as well.
    assert "success" in out
|
2017-02-16 11:30:35 -08:00
|
|
|
|
|
|
|
|
2018-08-31 11:24:15 -07:00
|
|
|
def test_driver_exiting_quickly(ray_start_head):
    """Run drivers that submit work and exit without waiting for it.

    One driver script creates an actor, the other submits a task; neither
    waits for a result before printing "success" and exiting. Both are run
    three times, and after every round the Ray processes must still be
    alive.

    Args:
        ray_start_head: Redis address yielded by the fixture of the same
            name.
    """
    ray.init(redis_address=ray_start_head)

    # Driver that creates an actor and exits right away.
    actor_driver_script = """
import ray
ray.init(redis_address="{}")
@ray.remote
class Foo(object):
    def __init__(self):
        pass
Foo.remote()
print("success")
""".format(ray_start_head)

    # Driver that submits a single task and exits right away.
    task_driver_script = """
import ray
ray.init(redis_address="{}")
@ray.remote
def f():
    return 1
f.remote()
print("success")
""".format(ray_start_head)

    # Run both short-lived drivers a few times and check that everything
    # is still alive afterwards.
    for _ in range(3):
        for script in (actor_driver_script, task_driver_script):
            # Each driver has to run to completion.
            assert "success" in run_string_as_driver(script)
        assert ray.services.all_processes_alive()
|
2017-07-13 14:53:57 -07:00
|
|
|
|
|
|
|
|
2018-09-02 17:34:48 -07:00
|
|
|
@pytest.fixture
def ray_start_head_with_resources():
    """Start a Ray head node with one CPU and one GPU; yield its address.

    The address is parsed out of the ``redis_address="..."`` snippet found
    in the output of ``ray start --head --num-cpus=1 --num-gpus=1``. The
    cluster is stopped after the test.

    Yields:
        The text between the quotes of ``redis_address="..."``.

    Raises:
        RuntimeError: If the ``ray start`` output contains no
            ``redis_address="`` marker.
    """
    out = run_and_get_output(
        ["ray", "start", "--head", "--num-cpus=1", "--num-gpus=1"])
    # Get the redis address from the output.
    redis_substring_prefix = "redis_address=\""
    marker_start = out.find(redis_substring_prefix)
    if marker_start < 0:
        # A missing marker makes str.find return -1, which would otherwise
        # be turned into a meaningless slice of the output. Setup failures
        # skip the teardown below, so stop the cluster before raising.
        subprocess.Popen(["ray", "stop"]).wait()
        raise RuntimeError(
            "Could not find {!r} in the output of 'ray start': {!r}".format(
                redis_substring_prefix, out))
    redis_address_location = marker_start + len(redis_substring_prefix)
    redis_address = out[redis_address_location:]
    # Cut the address off at the closing quote.
    redis_address = redis_address.split("\"")[0]

    yield redis_address

    # Kill the Ray cluster.
    subprocess.Popen(["ray", "stop"]).wait()
|
|
|
|
|
|
|
|
|
2018-09-07 16:11:32 +08:00
|
|
|
@pytest.mark.skipif(
    os.environ.get("RAY_USE_XRAY") != "1",
    reason="This test only works with xray.")
def test_drivers_release_resources(ray_start_head_with_resources):
    """Check that resources held by a driver are released when it goes away.

    A driver script first runs one plain task and one GPU task to
    completion, then starts many GPU actors and long-sleeping tasks and
    prints "success". The script is run repeatedly, both exiting normally
    and being killed while it sleeps; every run can only get past its first
    two ``ray.get`` calls if the previous run's resources were released.

    Args:
        ray_start_head_with_resources: Redis address yielded by the fixture
            of the same name.
    """
    redis_address = ray_start_head_with_resources

    # Define a driver that starts actors and tasks that hold on to
    # resources, and then exits.
    driver_script1 = """
import time
import ray

ray.init(redis_address="{}")

@ray.remote
def f(duration):
    time.sleep(duration)

@ray.remote(num_gpus=1)
def g(duration):
    time.sleep(duration)

@ray.remote(num_gpus=1)
class Foo(object):
    def __init__(self):
        pass

# Make sure some resources are available for us to run tasks.
ray.get(f.remote(0))
ray.get(g.remote(0))

# Start a bunch of actors and tasks that use resources. These should all be
# cleaned up when this driver exits.
foos = [Foo.remote() for _ in range(100)]
[f.remote(10 ** 6) for _ in range(100)]

print("success")
""".format(redis_address)

    # Same driver, but it flushes its output and then sleeps so that it is
    # still running when the test kills it.
    driver_script2 = (driver_script1 +
                      "import sys\nsys.stdout.flush()\ntime.sleep(10 ** 6)\n")

    def wait_for_success_output(process_handle, timeout=10):
        """Read lines from the process until one of them is "success".

        Args:
            process_handle: Handle whose ``stdout`` is read line by line.
            timeout: Number of seconds after which to give up. It is only
                checked between lines.

        Raises:
            Exception: If the output ends or the timeout passes before
                "success" is read.
        """
        start_time = time.time()
        while time.time() - start_time < timeout:
            raw_line = process_handle.stdout.readline()
            if not raw_line:
                # An empty read means end of output. Fail right away
                # instead of spinning on empty reads until the timeout.
                raise Exception(
                    "Process output ended before it printed success.")
            output_line = ray.utils.decode(raw_line).strip()
            print(output_line)
            if output_line == "success":
                return
        raise Exception("Timed out waiting for process to print success.")

    # Make sure we can run this driver repeatedly, which means that resources
    # are getting released in between.
    for _ in range(5):
        out = run_string_as_driver(driver_script1)
        # Make sure the first driver ran to completion.
        assert "success" in out
        # Also make sure that this works when the driver exits ungracefully.
        process_handle = run_string_as_driver_nonblocking(driver_script2)
        wait_for_success_output(process_handle)
        # Kill the process ungracefully.
        process_handle.kill()
        # Reap the killed child so it does not linger as a zombie.
        # NOTE(review): assumes the handle is a subprocess.Popen (it already
        # provides .stdout and .kill()) - confirm against test_utils.
        process_handle.wait()
|
|
|
|
|
|
|
|
|
2018-08-31 11:24:15 -07:00
|
|
|
def test_calling_start_ray_head():
    """Call ``ray start --head`` with various command line parameters.

    Every valid set of flags is started and then stopped again with
    ``ray stop``. Finally an invalid combination (``--head`` together with
    ``--redis-address``) is checked to raise.

    TODO(rkn): This test only tests the --head code path. We should also
    test the non-head node code path.
    """
    head_command = ["ray", "start", "--head"]
    stop_command = ["ray", "stop"]

    # Extra flags for each start/stop cycle, in the order they are run.
    flag_sets = [
        # No arguments.
        [],
        # A number of workers.
        ["--num-workers", "20"],
        # A redis port.
        ["--redis-port", "6379"],
        # A node IP address.
        ["--node-ip-address", "127.0.0.1"],
        # An object manager port.
        ["--object-manager-port", "12345"],
        # The number of CPUs.
        ["--num-cpus", "2"],
        # The number of GPUs.
        ["--num-gpus", "100"],
        # The max redis clients.
        ["--redis-max-clients", "100"],
    ]

    if "RAY_USE_NEW_GCS" not in os.environ:
        # Redis shard ports.
        flag_sets.append(["--redis-shard-ports", "6380,6381,6382"])
        # All arguments at once (this also passes redis shard ports).
        flag_sets.append([
            "--num-workers", "2", "--redis-port", "6379",
            "--redis-shard-ports", "6380,6381,6382",
            "--object-manager-port", "12345", "--num-cpus", "2",
            "--num-gpus", "0", "--redis-max-clients", "100", "--resources",
            "{\"Custom\": 1}"
        ])

    for extra_flags in flag_sets:
        run_and_get_output(head_command + extra_flags)
        subprocess.Popen(stop_command).wait()

    # Invalid arguments must make the start command fail.
    with pytest.raises(Exception):
        run_and_get_output(
            head_command + ["--redis-address", "127.0.0.1:6379"])
    subprocess.Popen(stop_command).wait()
|
2017-11-01 17:13:22 -07:00
|
|
|
|
|
|
|
|
2018-08-31 11:24:15 -07:00
|
|
|
@pytest.fixture
def ray_start_head_local():
    """Run a Ray head node addressed as "localhost" with Redis on port 6379.

    Nothing useful is yielded. After the test, the current driver is
    disconnected and the cluster is stopped.
    """
    # Bring up the Ray processes on this machine.
    start_command = [
        "ray", "start", "--head", "--node-ip-address=localhost",
        "--redis-port=6379"
    ]
    run_and_get_output(start_command)

    yield None

    # Teardown: detach from the cluster first, then shut it down.
    ray.shutdown()
    stop_process = subprocess.Popen(["ray", "stop"])
    stop_process.wait()
|
2017-11-01 17:13:22 -07:00
|
|
|
|
|
|
|
|
2018-08-31 11:24:15 -07:00
|
|
|
def test_using_hostnames(ray_start_head_local):
    """Connect using the hostname "localhost" and run a trivial task.

    Args:
        ray_start_head_local: Unused value; requesting the fixture makes
            sure the cluster is running.
    """
    ray.init(node_ip_address="localhost", redis_address="localhost:6379")

    @ray.remote
    def f():
        return 1

    # A round trip through the cluster proves the connection works.
    object_id = f.remote()
    assert ray.get(object_id) == 1
|
2018-03-21 11:49:53 -07:00
|
|
|
|
2018-08-31 11:24:15 -07:00
|
|
|
|
|
|
|
@pytest.fixture
def ray_start_regular():
    """Call ``ray.init(num_cpus=1)`` and yield whatever it returns.

    ``ray.shutdown()`` is called as teardown once the test is done.
    """
    # Start the Ray processes and hand the result to the test.
    yield ray.init(num_cpus=1)
    # Everything after the yield runs as teardown code.
    ray.shutdown()
|
|
|
|
|
|
|
|
|
|
|
|
def test_connecting_in_local_case(ray_start_regular):
    """Connect a second driver to a cluster started via ``ray.init``.

    Args:
        ray_start_regular: Value yielded by the fixture of the same name;
            its "redis_address" entry is passed to the second driver.
    """
    redis_address = ray_start_regular["redis_address"]

    # A driver that does nothing but connect to Redis.
    connect_script = """
import ray
ray.init(redis_address="{}")
print("success")
""".format(redis_address)

    driver_output = run_string_as_driver(connect_script)
    # The second driver must have connected and finished.
    assert "success" in driver_output
|
2018-03-21 11:49:53 -07:00
|
|
|
|
2018-07-27 15:57:52 +08:00
|
|
|
|
2018-08-31 11:24:15 -07:00
|
|
|
def test_run_driver_twice(ray_start_regular):
    """Run the same Tune driver twice and check that both runs succeed.

    Regression test for issues 2165 and 2288, which both reported that a
    driver hangs when it is run for the second time:
    https://github.com/ray-project/ray/issues/2165
    https://github.com/ray-project/ray/issues/2288

    Args:
        ray_start_regular: Value yielded by the fixture of the same name;
            its "redis_address" entry is passed to the driver.
    """
    redis_address = ray_start_regular["redis_address"]

    # Literal braces of the experiment spec are doubled because the
    # template goes through str.format.
    driver_template = """
import ray
import ray.tune as tune
import os
import time

def train_func(config, reporter): # add a reporter arg
    for i in range(2):
        time.sleep(0.1)
        reporter(timesteps_total=i, mean_accuracy=i+97) # report metrics

ray.init(redis_address="{}")
ray.tune.register_trainable("train_func", train_func)

tune.run_experiments({{
    "my_experiment": {{
        "run": "train_func",
        "stop": {{"mean_accuracy": 99}},
        "config": {{
            "layer1": {{
                "class_name": tune.grid_search(["a"]),
                "config": {{"lr": tune.grid_search([1, 2])}}
            }},
        }},
        "local_dir": os.path.expanduser("~/tmp")
    }}
}})
print("success")
"""
    driver_script = driver_template.format(redis_address)

    # The second run is the one that used to hang.
    for _ in range(2):
        assert "success" in run_string_as_driver(driver_script)
|