2019-03-08 15:29:39 -08:00
|
|
|
# This workload tests many drivers using the same cluster.
|
2021-06-16 21:37:17 +01:00
|
|
|
import json
|
|
|
|
import os
|
2019-03-08 15:29:39 -08:00
|
|
|
import time
|
2022-01-31 13:26:02 -08:00
|
|
|
import argparse
|
2019-03-08 15:29:39 -08:00
|
|
|
|
|
|
|
import ray
|
2019-12-05 18:29:41 -08:00
|
|
|
from ray.cluster_utils import Cluster
|
2021-08-18 20:56:33 -07:00
|
|
|
from ray._private.test_utils import run_string_as_driver
|
2019-03-08 15:29:39 -08:00
|
|
|
|
2021-06-16 21:37:17 +01:00
|
|
|
|
|
|
|
def update_progress(result):
    """Timestamp ``result`` and write it out as JSON.

    The output path comes from the ``TEST_OUTPUT_JSON`` environment
    variable and falls back to ``/tmp/release_test_output.json``.

    Args:
        result: Mapping of progress values. It is mutated in place: a
            ``"last_update"`` key is set to the current ``time.time()``.
    """
    result["last_update"] = time.time()
    output_path = os.getenv("TEST_OUTPUT_JSON", "/tmp/release_test_output.json")
    # The file is rewritten from scratch on every call.
    with open(output_path, "wt") as output_file:
        json.dump(result, output_file)
|
|
|
|
|
|
|
|
|
2019-03-08 15:29:39 -08:00
|
|
|
# Sizing of the simulated cluster. These names are reused further down when
# the nodes are created and when drivers are scheduled.
num_nodes = 4
num_redis_shards = 5
object_store_memory = 10 ** 8
redis_max_memory = 10 ** 8

message = (
    "Make sure there is enough memory on this machine to run this "
    "workload. We divide the system memory by 2 to provide a buffer."
)

# Total memory the object stores and redis shards may claim, checked against
# half of the machine's memory so there is headroom left over.
_required_memory = (
    num_nodes * object_store_memory + num_redis_shards * redis_max_memory
)
_memory_budget = ray._private.utils.get_system_memory() / 2
assert _required_memory < _memory_budget, message
|
2019-03-08 15:29:39 -08:00
|
|
|
|
|
|
|
# Simulate a cluster on one machine.
cluster = Cluster()
for node_index in range(num_nodes):
    # Only the first node gets the fixed redis port and the redis shards;
    # every other node passes None for both.
    is_first_node = node_index == 0
    cluster.add_node(
        redis_port=6379 if is_first_node else None,
        num_redis_shards=num_redis_shards if is_first_node else None,
        num_cpus=4,
        num_gpus=0,
        # A custom resource named after the node index, so that work can be
        # directed at this particular node by requesting that resource.
        resources={str(node_index): 5},
        object_store_memory=object_store_memory,
        redis_max_memory=redis_max_memory,
        dashboard_host="0.0.0.0",
    )

# Connect this process to the cluster that was just assembled.
ray.init(address=cluster.address)
|
2019-03-08 15:29:39 -08:00
|
|
|
|
|
|
|
# Run the workload.
|
|
|
|
|
|
|
|
# Define a driver script that runs a few tasks and actors on each node in the
# cluster.
#
# The template has two positional ``{}`` fields: the cluster address and the
# node count. Doubled braces (``{{`` / ``}}``) are literal braces that
# ``str.format`` leaves in the generated script.
_DRIVER_SCRIPT_TEMPLATE = """
import ray

ray.init(address="{}")

num_nodes = {}


@ray.remote
def f():
    return 1


@ray.remote
class Actor(object):
    def method(self):
        return 1


for _ in range(5):
    for i in range(num_nodes):
        assert (ray.get(
            f._remote(args=[],
            kwargs={{}},
            resources={{str(i): 1}})) == 1)
        actor = Actor._remote(
            args=[], kwargs={{}}, resources={{str(i): 1}})
        assert ray.get(actor.method.remote()) == 1

# Tests datasets doesn't leak workers.
ray.data.range(100).map(lambda x: x).take()

print("success")
"""

driver_script = _DRIVER_SCRIPT_TEMPLATE.format(cluster.address, num_nodes)
|
2019-03-08 15:29:39 -08:00
|
|
|
|
|
|
|
|
|
|
|
@ray.remote
def run_driver():
    """Run ``driver_script`` as a separate driver and verify it succeeded.

    The script prints ``success`` as its last step, so the check looks for
    that marker in whatever ``run_string_as_driver`` returns.

    Raises:
        AssertionError: If the marker is missing. The message carries the
            captured driver output so the failure can be diagnosed.
    """
    output = run_string_as_driver(driver_script, encode="utf-8")
    assert "success" in output, (
        "Driver script did not report success. Captured output:\n"
        "{}".format(output)
    )
|
|
|
|
|
|
|
|
|
|
|
|
# Parse the command line before submitting any driver tasks, so that
# ``--help`` or an invalid argument exits without launching work first.
parser = argparse.ArgumentParser(prog="Many Drivers long running tests")
parser.add_argument(
    "--iteration-num", type=int, help="How many iterations to run", required=False
)
parser.add_argument(
    "--smoke-test",
    action="store_true",
    help="Whether or not the test is smoke test.",
    default=False,
)
args = parser.parse_args()

# ``None`` (flag omitted) means no iteration limit. A smoke test always uses
# 400 iterations, regardless of ``--iteration-num``.
iteration_num = args.iteration_num
if args.smoke_test:
    iteration_num = 400

iteration = 0
# Start one driver per node. Each task requests a small amount of the custom
# resource named after a node index and reserves no CPU.
running_ids = [
    run_driver._remote(args=[], kwargs={}, num_cpus=0, resources={str(i): 0.01})
    for i in range(num_nodes)
]
start_time = time.time()
previous_time = start_time
|
2019-03-08 15:29:39 -08:00
|
|
|
# Run exactly ``iteration_num`` iterations, or forever when it is None.
# (Comparing with ``iteration_num < iteration`` ran one iteration too many.)
while iteration_num is None or iteration < iteration_num:
    # Wait for a driver to finish and start a new driver.
    [ready_id], running_ids = ray.wait(running_ids, num_returns=1)
    # Fetch the result of the finished driver task.
    ray.get(ready_id)

    # Replacement drivers rotate over the per-node custom resources.
    running_ids.append(
        run_driver._remote(
            args=[],
            kwargs={},
            num_cpus=0,
            resources={str(iteration % num_nodes): 0.01},
        )
    )

    new_time = time.time()
    iteration_time = new_time - previous_time
    elapsed_time = new_time - start_time
    print(
        "Iteration {}:\n"
        " - Iteration time: {}.\n"
        " - Absolute time: {}.\n"
        " - Total elapsed time: {}.".format(
            iteration, iteration_time, new_time, elapsed_time
        )
    )
    # Persist the same numbers to the progress file.
    update_progress(
        {
            "iteration": iteration,
            "iteration_time": iteration_time,
            "absolute_time": new_time,
            "elapsed_time": elapsed_time,
        }
    )
    previous_time = new_time
    iteration += 1
|