Mirror of https://github.com/vale981/ray, synced 2025-03-05 10:01:43 -05:00
[Core][State Observability] Nightly release test for state API (#26610)
* Initial
* Correctness test skeleton
* Added limit for listing
* Updated grpc config
* no more waiting
* metrics
* Updated constant and add test
* renamed
* actors
* actors
* actors
* dada
* actor dead?
* Script
* correct test name
* limit
* Added timeout
* release test /2
* Merged
* format+doc
* wip

Signed-off-by: rickyyx <ricky@anyscale.com>

* revert packag-lock

Signed-off-by: rickyyx <rickyx@anyscale.com>

* wip
* results

Signed-off-by: rickyx <rickyx@anyscale.com>
Signed-off-by: rickyyx <rickyx@anyscale.com>
Signed-off-by: rickyyx <ricky@anyscale.com>
Signed-off-by: rickyx <rickyx@anyscale.com>
Co-authored-by: rickyyx <ricky@anyscale.com>
Parent commit: 0dceddb912
This commit: 5ea4747448
4 changed files with 985 additions and 0 deletions
python/ray/_private/state_api_test_utils.py (new file, 294 lines)
@@ -0,0 +1,294 @@
import asyncio
from copy import deepcopy
from collections import defaultdict
import concurrent.futures
from dataclasses import dataclass, field
import logging
import numpy as np
import pprint
import time
import traceback
from typing import Callable, Dict, List, Optional

import ray
from ray.actor import ActorHandle


@dataclass
class StateAPIMetric:
    latency_sec: float
    result_size: int


@dataclass
class StateAPICallSpec:
    api: Callable
    verify_cb: Callable
    kwargs: Dict = field(default_factory=dict)


@dataclass
class StateAPIStats:
    pending_calls: int = 0
    total_calls: int = 0
    calls: Dict = field(default_factory=lambda: defaultdict(list))


GLOBAL_STATE_STATS = StateAPIStats()

STATE_LIST_LIMIT = int(1e6)  # 1M
STATE_LIST_TIMEOUT = 600  # 10min


def invoke_state_api(
    verify_cb: Callable,
    state_api_fn: Callable,
    state_stats: StateAPIStats = GLOBAL_STATE_STATS,
    key_suffix: Optional[str] = None,
    **kwargs,
):
    """Invoke a state API.

    Args:
        - verify_cb: Callback that takes in the response from `state_api_fn` and
            returns a boolean, indicating the correctness of the results.
        - state_api_fn: Function of the state API.
        - state_stats: Stats object in which the call metrics are recorded.
        - key_suffix: Optional suffix appended to the stats key.
        - kwargs: Keyword arguments to be forwarded to the `state_api_fn`.
    """
    if "timeout" not in kwargs:
        kwargs["timeout"] = STATE_LIST_TIMEOUT

    # Suppress missing output warning
    kwargs["raise_on_missing_output"] = False

    res = None
    try:
        state_stats.total_calls += 1
        state_stats.pending_calls += 1

        t_start = time.perf_counter()
        res = state_api_fn(**kwargs)
        t_end = time.perf_counter()

        metric = StateAPIMetric(t_end - t_start, len(res))
        if key_suffix:
            key = f"{state_api_fn.__name__}_{key_suffix}"
        else:
            key = state_api_fn.__name__
        state_stats.calls[key].append(metric)
        assert verify_cb(res), f"Calling State API failed. len(res)=({len(res)}): {res}"
    except Exception as e:
        traceback.print_exc()
        assert (
            False
        ), f"Calling {state_api_fn.__name__}({kwargs}) failed with {repr(e)}."
    finally:
        state_stats.pending_calls -= 1

    return res


def aggregate_perf_results(state_stats: StateAPIStats = GLOBAL_STATE_STATS):
    """Aggregate stats of state API calls.

    Return:
        A dict with the below fields:
        - max_{api_key_name}_latency_sec:
            Max latency of calls to {api_key_name}.
        - {api_key_name}_result_size_with_max_latency:
            The size of the result (or the number of bytes for the get_log API)
            for the max-latency invocation.
        - avg/p99/p95/p50_{api_key_name}_latency_sec:
            Percentile latency stats.
        - avg_state_api_latency_sec:
            The average latency across all the state APIs tracked.
    """
    # Deep copy to prevent the stats from being modified while iterating.
    state_stats = deepcopy(state_stats)
    perf_result = {}
    for api_key_name, metrics in state_stats.calls.items():
        # Per-API aggregation
        # Max latency
        latency_key = f"max_{api_key_name}_latency_sec"
        size_key = f"{api_key_name}_result_size_with_max_latency"
        metric = max(metrics, key=lambda metric: metric.latency_sec)

        perf_result[latency_key] = metric.latency_sec
        perf_result[size_key] = metric.result_size

        latency_list = np.array([metric.latency_sec for metric in metrics])
        # avg latency
        key = f"avg_{api_key_name}_latency_sec"
        perf_result[key] = np.average(latency_list)

        # p99 latency
        key = f"p99_{api_key_name}_latency_sec"
        perf_result[key] = np.percentile(latency_list, 99)

        # p95 latency
        key = f"p95_{api_key_name}_latency_sec"
        perf_result[key] = np.percentile(latency_list, 95)

        # p50 latency
        key = f"p50_{api_key_name}_latency_sec"
        perf_result[key] = np.percentile(latency_list, 50)

    all_state_api_latency = sum(
        metric.latency_sec
        for metric_samples in state_stats.calls.values()
        for metric in metric_samples
    )

    perf_result["avg_state_api_latency_sec"] = (
        (all_state_api_latency / state_stats.total_calls)
        if state_stats.total_calls != 0
        else -1
    )

    return perf_result


@ray.remote
class StateAPIGeneratorActor:
    def __init__(
        self,
        apis: List[StateAPICallSpec],
        call_interval_s: float = 5.0,
        print_interval_s: float = 20.0,
        wait_after_stop: bool = True,
    ) -> None:
        """An actor that periodically issues state API calls.

        Args:
            - apis: List of StateAPICallSpec.
            - call_interval_s: State APIs in `apis` will be issued
                every `call_interval_s` seconds.
            - print_interval_s: How frequently state API stats will be dumped.
            - wait_after_stop: When true, a call to `ray.get(actor.stop.remote())`
                will wait for all pending state APIs to return.
                Setting it to `False` might miss some long-running state API calls.
        """
        # Configs
        self._apis = apis
        self._call_interval_s = call_interval_s
        self._print_interval_s = print_interval_s
        self._wait_after_cancel = wait_after_stop
        self._logger = logging.getLogger(self.__class__.__name__)

        # States
        self._tasks = None
        self._fut_queue = None
        self._executor = None
        self._loop = None
        self._stopping = False
        self._stopped = False
        self._stats = StateAPIStats()

    async def start(self):
        # Run the periodic api generator
        self._fut_queue = asyncio.Queue()
        self._executor = concurrent.futures.ThreadPoolExecutor()

        self._tasks = [
            asyncio.ensure_future(awt)
            for awt in [
                self._run_generator(),
                self._run_result_waiter(),
                self._run_stats_reporter(),
            ]
        ]
        await asyncio.gather(*self._tasks)

    def call(self, fn, verify_cb, **kwargs):
        def run_fn():
            try:
                self._logger.debug(f"calling {fn.__name__}({kwargs})")
                return invoke_state_api(
                    verify_cb, fn, state_stats=self._stats, **kwargs
                )
            except Exception as e:
                self._logger.warning(f"{fn.__name__}({kwargs}) failed with: {repr(e)}")
                return None

        fut = asyncio.get_running_loop().run_in_executor(self._executor, run_fn)
        return fut

    async def _run_stats_reporter(self):
        while not self._stopped:
            # Keep the reporter running until all pending APIs finish and
            # `self._stopped` becomes True.
            self._logger.info(pprint.pformat(aggregate_perf_results(self._stats)))
            try:
                await asyncio.sleep(self._print_interval_s)
            except asyncio.CancelledError:
                self._logger.info(
                    "_run_stats_reporter cancelled, waiting for all "
                    f"{self._stats.pending_calls} pending API calls to return..."
                )

    async def _run_generator(self):
        try:
            while not self._stopping:
                # Run the state API in another thread
                for api_spec in self._apis:
                    fut = self.call(api_spec.api, api_spec.verify_cb, **api_spec.kwargs)
                    self._fut_queue.put_nowait(fut)

                await asyncio.sleep(self._call_interval_s)
        except asyncio.CancelledError:
            # Stop running
            self._logger.info("_run_generator cancelled, now stopping...")
            return

    async def _run_result_waiter(self):
        try:
            while not self._stopping:
                fut = await self._fut_queue.get()
                await fut
        except asyncio.CancelledError:
            self._logger.info(
                f"_run_result_waiter cancelled, cancelling {self._fut_queue.qsize()} "
                "pending futures..."
            )
            while not self._fut_queue.empty():
                fut = self._fut_queue.get_nowait()
                if self._wait_after_cancel:
                    await fut
                else:
                    # Ignore the queued futures if we are not
                    # waiting on them after stop() is called
                    fut.cancel()
            return

    def get_stats(self):
        # aggregate_perf_results() deep-copies the stats to prevent a race
        # between reporting and modifying them.
        return aggregate_perf_results(self._stats)

    def ready(self):
        pass

    def stop(self):
        self._stopping = True
        self._logger.debug(f"calling stop, cancelling {len(self._tasks)} tasks")
        for task in self._tasks:
            task.cancel()

        # This will block the stop() function until all futures are cancelled
        # if _wait_after_cancel=True. When _wait_after_cancel=False, it will still
        # wait for any in-progress futures.
        # See: https://docs.python.org/3.8/library/concurrent.futures.html
        self._executor.shutdown(wait=self._wait_after_cancel)
        self._stopped = True


def periodic_invoke_state_apis_with_actor(*args, **kwargs) -> ActorHandle:
    current_node_ip = ray._private.worker.global_worker.node_ip_address
    # Schedule the actor on the current node.
    actor = StateAPIGeneratorActor.options(
        resources={f"node:{current_node_ip}": 0.001}
    ).remote(*args, **kwargs)
    print("Waiting for state api actor to be ready...")
    ray.get(actor.ready.remote())
    print("State api actor is ready now.")
    actor.start.remote()
    return actor
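For context, a minimal usage sketch (an editor's illustration, not part of this commit) of how these utilities are meant to be combined from a driver, assuming a running Ray cluster with the experimental state API available at the version this commit targets:

    import ray
    from ray.experimental.state.api import list_actors
    from ray._private.state_api_test_utils import (
        StateAPICallSpec,
        invoke_state_api,
        periodic_invoke_state_apis_with_actor,
    )

    ray.init(address="auto")

    # One-off call: verify the response and record its latency in the stats.
    invoke_state_api(lambda res: res is not None, list_actors, key_suffix="demo")

    # Background load: a dedicated actor issues list_actors every 5 seconds.
    caller = periodic_invoke_state_apis_with_actor(
        apis=[StateAPICallSpec(list_actors, lambda res: res is not None)],
        call_interval_s=5.0,
    )
    # ... run the workload under test here ...
    ray.get(caller.stop.remote())
    # Aggregated latency percentiles collected by the caller actor.
    print(ray.get(caller.get_stats.remote()))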
release/nightly_tests/stress_tests/test_state_api_scale.py (new file, 423 lines)
@@ -0,0 +1,423 @@
import click
import json
import ray
from ray._private.ray_constants import LOG_PREFIX_ACTOR_NAME
from ray._private.state_api_test_utils import (
    STATE_LIST_LIMIT,
    StateAPIMetric,
    aggregate_perf_results,
    invoke_state_api,
    GLOBAL_STATE_STATS,
)

import ray._private.test_utils as test_utils
import tqdm
import asyncio
import time
import os

from ray.experimental.state.api import (
    get_log,
    list_actors,
    list_objects,
    list_tasks,
)

GiB = 1024 * 1024 * 1024
MiB = 1024 * 1024


# We set num_cpus to zero because this actor will mostly just block on I/O.
@ray.remote(num_cpus=0)
class SignalActor:
    def __init__(self):
        self.ready_event = asyncio.Event()

    def send(self, clear=False):
        self.ready_event.set()
        if clear:
            self.ready_event.clear()

    async def wait(self, should_wait=True):
        if should_wait:
            await self.ready_event.wait()


def invoke_state_api_n(*args, **kwargs):
    NUM_API_CALL_SAMPLES = 10
    for _ in range(NUM_API_CALL_SAMPLES):
        invoke_state_api(*args, **kwargs)


def test_many_tasks(num_tasks: int):
    if num_tasks == 0:
        print("Skipping test with no tasks")
        return
    # No running tasks
    invoke_state_api(
        lambda res: len(res) == 0,
        list_tasks,
        filters=[("name", "=", "pi4_sample()"), ("scheduling_state", "=", "RUNNING")],
        key_suffix="0",
        limit=STATE_LIST_LIMIT,
    )

    # Task definition adopted from:
    # https://docs.ray.io/en/master/ray-core/examples/highly_parallel.html
    from random import random

    SAMPLES = 100

    @ray.remote
    def pi4_sample(signal):
        in_count = 0
        for _ in range(SAMPLES):
            x, y = random(), random()
            if x * x + y * y <= 1:
                in_count += 1
        # Block on signal
        ray.get(signal.wait.remote())
        return in_count

    results = []
    signal = SignalActor.remote()
    for _ in tqdm.trange(num_tasks, desc="Launching tasks"):
        results.append(pi4_sample.remote(signal))

    invoke_state_api_n(
        lambda res: len(res) == num_tasks,
        list_tasks,
        filters=[("name", "=", "pi4_sample()")],
        key_suffix=f"{num_tasks}",
        limit=STATE_LIST_LIMIT,
    )

    print("Waiting for tasks to finish...")
    ray.get(signal.send.remote())
    ray.get(results)

    # Clean up
    # All compute tasks are done; only the signal actor remains.
    invoke_state_api(
        lambda res: len(res) == 0,
        list_tasks,
        filters=[("name", "=", "pi4_sample()"), ("scheduling_state", "=", "RUNNING")],
        key_suffix="0",
        limit=STATE_LIST_LIMIT,
    )

    del signal


def test_many_actors(num_actors: int):
    if num_actors == 0:
        print("Skipping test with no actors")
        return

    @ray.remote
    class TestActor:
        def running(self):
            return True

        def exit(self):
            ray.actor.exit_actor()

    actor_class_name = TestActor.__ray_metadata__.class_name

    invoke_state_api(
        lambda res: len(res) == 0,
        list_actors,
        filters=[("state", "=", "ALIVE"), ("class_name", "=", actor_class_name)],
        key_suffix="0",
        limit=STATE_LIST_LIMIT,
    )

    actors = [
        TestActor.remote() for _ in tqdm.trange(num_actors, desc="Launching actors...")
    ]

    waiting_actors = [actor.running.remote() for actor in actors]
    print("Waiting for actors to finish...")
    ray.get(waiting_actors)

    invoke_state_api_n(
        lambda res: len(res) == num_actors,
        list_actors,
        filters=[("state", "=", "ALIVE"), ("class_name", "=", actor_class_name)],
        key_suffix=f"{num_actors}",
        limit=STATE_LIST_LIMIT,
    )

    exiting_actors = [actor.exit.remote() for actor in actors]
    for _ in tqdm.trange(len(actors), desc="Destroying actors..."):
        _exited, exiting_actors = ray.wait(exiting_actors)

    invoke_state_api(
        lambda res: len(res) == 0,
        list_actors,
        filters=[("state", "=", "ALIVE"), ("class_name", "=", actor_class_name)],
        key_suffix="0",
        limit=STATE_LIST_LIMIT,
    )


def test_many_objects(num_objects, num_actors):
    if num_objects == 0:
        print("Skipping test with no objects")
        return

    @ray.remote(num_cpus=0.1)
    class ObjectActor:
        def __init__(self):
            self.objs = []

        def create_objs(self, num_objects):
            import os

            for _ in range(num_objects):
                # Object size shouldn't matter here.
                self.objs.append(ray.put(bytearray(os.urandom(1024))))

            return self.objs

        def exit(self):
            ray.actor.exit_actor()

    actors = [
        ObjectActor.remote() for _ in tqdm.trange(num_actors, desc="Creating actors...")
    ]

    # Split the objects across multiple actors for creation.
    # Credit: https://stackoverflow.com/a/2135920
    def _split(a, n):
        k, m = divmod(len(a), n)
        return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))

    num_objs_per_actor = [len(objs) for objs in _split(range(num_objects), num_actors)]

    waiting_actors = [
        actor.create_objs.remote(num_objs)
        for actor, num_objs in zip(actors, num_objs_per_actor)
    ]

    total_objs_created = 0
    for _ in tqdm.trange(num_actors, desc="Waiting for actors to create objects..."):
        objs, waiting_actors = ray.wait(waiting_actors)
        total_objs_created += len(ray.get(*objs))

    assert (
        total_objs_created == num_objects
    ), "Expect correct number of objects created."

    invoke_state_api_n(
        lambda res: len(res) == num_objects,
        list_objects,
        filters=[
            ("reference_type", "=", "LOCAL_REFERENCE"),
            ("type", "=", "Worker"),
        ],
        key_suffix=f"{num_objects}",
        limit=STATE_LIST_LIMIT,
    )

    exiting_actors = [actor.exit.remote() for actor in actors]
    for _ in tqdm.trange(len(actors), desc="Destroying actors..."):
        _exited, exiting_actors = ray.wait(exiting_actors)


def test_large_log_file(log_file_size_byte: int):
    if log_file_size_byte == 0:
        print("Skipping test with 0 log file size")
        return

    import sys
    import string
    import random
    import hashlib

    @ray.remote
    class LogActor:
        def write_log(self, log_file_size_byte: int):
            ctx = hashlib.md5()
            prefix = f"{LOG_PREFIX_ACTOR_NAME}LogActor\n"
            ctx.update(prefix.encode())
            while log_file_size_byte > 0:
                n = min(log_file_size_byte, 4 * MiB)
                chunk = "".join(random.choices(string.ascii_letters, k=n))
                sys.stdout.writelines([chunk])
                ctx.update(chunk.encode())
                log_file_size_byte -= n

            sys.stdout.flush()
            return ctx.hexdigest(), ray.get_runtime_context().node_id.hex()

    actor = LogActor.remote()
    expected_hash, node_id = ray.get(
        actor.write_log.remote(log_file_size_byte=log_file_size_byte)
    )
    assert expected_hash is not None, "Empty checksum from the log actor"
    assert node_id is not None, "Empty node id from the log actor"

    # Retrieve the log and compare the checksum
    ctx = hashlib.md5()

    time_taken = 0
    t_start = time.perf_counter()
    for s in get_log(actor_id=actor._actor_id.hex(), tail=-1):
        t_end = time.perf_counter()
        time_taken += t_end - t_start
        # Hashing time is not included;
        ctx.update(s.encode())
        # only the get_log iterator's performance is timed.
        t_start = time.perf_counter()

    assert expected_hash == ctx.hexdigest(), "Log file checksum mismatch"

    metric = StateAPIMetric(time_taken, log_file_size_byte)
    GLOBAL_STATE_STATS.calls["get_log"].append(metric)


def _parse_input(
    num_tasks_str: str, num_actors_str: str, num_objects_str: str, log_file_sizes: str
):
    def _split_to_int(s):
        tokens = s.split(",")
        return [int(token) for token in tokens]

    return (
        _split_to_int(num_tasks_str),
        _split_to_int(num_actors_str),
        _split_to_int(num_objects_str),
        _split_to_int(log_file_sizes),
    )


def no_resource_leaks():
    return test_utils.no_resource_leaks_excluding_node_resources()


@click.command()
@click.option(
    "--num-tasks",
    required=False,
    default="1,100,1000,10000",
    type=str,
    help="Number of tasks to launch.",
)
@click.option(
    "--num-actors",
    required=False,
    default="1,100,1000,5000",
    type=str,
    help="Number of actors to launch.",
)
@click.option(
    "--num-objects",
    required=False,
    default="100,1000,10000,50000",
    type=str,
    help="Number of objects to create.",
)
@click.option(
    "--num-actors-for-objects",
    required=False,
    default=16,
    type=int,
    help="Number of actors to use for object creation.",
)
@click.option(
    "--log-file-size-byte",
    required=False,
    default=f"{256*MiB},{1*GiB},{4*GiB}",
    type=str,
    help="Sizes of the log files to generate, in bytes.",
)
@click.option(
    "--smoke-test",
    is_flag=True,
    type=bool,
    default=False,
    help="If set, it's a smoke test",
)
def test(
    num_tasks,
    num_actors,
    num_objects,
    num_actors_for_objects,
    log_file_size_byte,
    smoke_test,
):
    ray.init(address="auto", log_to_driver=False)

    if smoke_test:
        num_tasks = "100"
        num_actors = "10"
        num_objects = "100"
        log_file_size_byte = f"{16*MiB}"

    # Parse the input
    num_tasks_arr, num_actors_arr, num_objects_arr, log_file_size_arr = _parse_input(
        num_tasks, num_actors, num_objects, log_file_size_byte
    )

    test_utils.wait_for_condition(no_resource_leaks)
    monitor_actor = test_utils.monitor_memory_usage()
    start_time = time.perf_counter()
    # Run some long-running tasks
    for n in num_tasks_arr:
        print(f"\nRunning with many tasks={n}")
        test_many_tasks(num_tasks=n)
        print(f"\ntest_many_tasks({n}) PASS")

    # Run many actors
    for n in num_actors_arr:
        print(f"\nRunning with many actors={n}")
        test_many_actors(num_actors=n)
        print(f"\ntest_many_actors({n}) PASS")

    # Create many objects
    for n in num_objects_arr:
        print(f"\nRunning with many objects={n}")
        test_many_objects(num_objects=n, num_actors=num_actors_for_objects)
        print(f"\ntest_many_objects({n}) PASS")

    # Create large logs
    for n in log_file_size_arr:
        print(f"\nRunning with large file={n} bytes")
        test_large_log_file(log_file_size_byte=n)
        print(f"\ntest_large_log_file({n} bytes) PASS")

    print("\n\nPASS")
    end_time = time.perf_counter()

    # Collect memory usage
    ray.get(monitor_actor.stop_run.remote())
    used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())
    print(f"Peak memory usage: {round(used_gb, 2)}GB")
    print(f"Peak memory usage per process:\n {usage}")
    del monitor_actor

    state_perf_result = aggregate_perf_results()
    results = {
        "time": end_time - start_time,
        "success": "1",
        "_peak_memory": round(used_gb, 2),
        "_peak_process_memory": usage,
        "perf_metrics": [
            {
                "perf_metric_name": "avg_state_api_latency_sec",
                "perf_metric_value": state_perf_result["avg_state_api_latency_sec"],
                "perf_metric_type": "LATENCY",
            }
        ],
    }
    if "TEST_OUTPUT_JSON" in os.environ:
        out_file = open(os.environ["TEST_OUTPUT_JSON"], "w")
        json.dump(results, out_file)

    results.update(state_perf_result)
    print(json.dumps(results, indent=2))


if __name__ == "__main__":
    test()
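For reference, the release test configuration added at the bottom of this commit invokes this script as `python stress_tests/test_state_api_scale.py` for the full run and `python stress_tests/test_state_api_scale.py --smoke-test` for the reduced smoke test; the comma-separated options above (for example the default `--num-tasks=1,100,1000,10000`) control the scales swept in a full run.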
release/nightly_tests/stress_tests/test_state_api_with_other_tests.py (new file, 215 lines)

@@ -0,0 +1,215 @@
import time
from typing import Dict, List, Optional
import click
import json
import os

import ray

from ray.experimental.state.api import (
    list_actors,
    list_nodes,
    list_objects,
    list_tasks,
    summarize_actors,
    summarize_objects,
    summarize_tasks,
)

import ray._private.test_utils as test_utils

from ray._private.state_api_test_utils import (
    StateAPICallSpec,
    periodic_invoke_state_apis_with_actor,
    STATE_LIST_LIMIT,
)


def download_release_test(test_file_path: str) -> Optional[str]:
    """Download the release test file from GitHub.

    It is currently assumed that individual release tests are independent from
    each other and isolated in their own files.

    This always downloads the file into the current working directory so that
    this Python script can invoke the release test without path imports.

    Args:
        test_file_path: File path relative to the `/release` folder.

    Return:
        Basename (file name) of the test file path if downloaded successfully,
        otherwise None.
    """
    import urllib.request as rq
    import urllib.parse as parse

    RAW_RAY_GITHUB_URL = (
        "https://raw.githubusercontent.com/ray-project/ray/master/release/"
    )
    file_name = os.path.basename(test_file_path)
    try:
        rq.urlretrieve(parse.urljoin(RAW_RAY_GITHUB_URL, test_file_path), file_name)
        return file_name
    except Exception as e:
        print(f"Failed to retrieve {test_file_path}:\n{e}")
        return None


def cleanup_release_test(test_file_name: str) -> bool:
    try:
        os.remove(test_file_name)
        return True
    except Exception as e:
        print(f"Failed to remove file {test_file_name}:\n{e}")
        return False


def run_release_test_in_subprocess(test_file: str, args: List[str]) -> bool:
    import subprocess as sp

    # Run the test in a subprocess
    cmds = ["python", test_file, *args]

    print(f"Running: {' '.join(cmds)}")
    proc = None
    try:
        proc = sp.run(cmds, check=True, text=True, capture_output=True)
        proc.check_returncode()
        return True
    except sp.CalledProcessError as e:
        print(f"Failed to run: {' '.join(cmds)}")
        print(e)
        print(e.stdout)
        print(e.stderr)
        return False


def run_test(test_name: str, test_args: List[str]):

    monitor_actor = test_utils.monitor_memory_usage()

    start = time.perf_counter()
    run_release_test_in_subprocess(test_name, test_args)
    end = time.perf_counter()
    # Collect memory usage
    ray.get(monitor_actor.stop_run.remote())
    used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())

    results = {
        "duration": end - start,
        "peak_memory": round(used_gb, 2),
        "peak_process_memory": usage,
    }
    return results


def run_test_with_state_api(
    test_name: str,
    test_args: List[str],
    apis: List[StateAPICallSpec],
    call_interval_s: int = 3,
    print_interval_s: int = 15,
) -> Dict:

    start_time = time.perf_counter()

    # Stage 1: Run with state APIs
    api_caller = periodic_invoke_state_apis_with_actor(
        apis=apis, call_interval_s=call_interval_s, print_interval_s=print_interval_s
    )

    stats_with_state_apis = run_test(test_name, test_args)
    ray.get(api_caller.stop.remote())
    print(json.dumps(ray.get(api_caller.get_stats.remote()), indent=2))

    # Stage 2: Run without the API generator
    stats_without_state_apis = run_test(test_name, test_args)
    end_time = time.perf_counter()

    # Dump the results
    results = {
        "time": end_time - start_time,
        "success": "1",
        "perf_metrics": [
            {
                "perf_metric_name": "state_api_extra_latency_sec",
                "perf_metric_value": stats_with_state_apis["duration"]
                - stats_without_state_apis["duration"],
                "perf_metric_type": "LATENCY",
            },
            {
                "perf_metric_name": "state_api_extra_latency_sec_percentage",
                "perf_metric_value": (
                    stats_with_state_apis["duration"]
                    / stats_without_state_apis["duration"]
                    - 1
                )
                * 100,
                "perf_metric_type": "LATENCY",
            },
            {
                "perf_metric_name": "state_api_extra_mem",
                "perf_metric_value": stats_with_state_apis["peak_memory"]
                - stats_without_state_apis["peak_memory"],
                "perf_metric_type": "MEMORY",
            },
        ],
    }

    return results


@click.command()
@click.argument(
    "test_path",
)
@click.option(
    "--test-args",
    type=str,
)
@click.option(
    "--call-interval-s", type=int, default=3, help="Interval of state API calls."
)
def test(
    test_path,
    test_args,
    call_interval_s,
):

    # Set up state API calling methods
    def not_none(res):
        return res is not None

    apis = [
        StateAPICallSpec(list_nodes, not_none, {"limit": STATE_LIST_LIMIT}),
        StateAPICallSpec(list_objects, not_none, {"limit": STATE_LIST_LIMIT}),
        StateAPICallSpec(list_tasks, not_none, {"limit": STATE_LIST_LIMIT}),
        StateAPICallSpec(list_actors, not_none, {"limit": STATE_LIST_LIMIT}),
        StateAPICallSpec(summarize_tasks, not_none),
        StateAPICallSpec(summarize_actors, not_none),
        StateAPICallSpec(summarize_objects, not_none),
    ]

    # Set up the benchmark test by downloading the release test file directly
    test_name = download_release_test(test_path)
    assert test_name is not None, f"Failed to retrieve release test: {test_path}"

    ray.init()
    results = run_test_with_state_api(
        test_name,
        test_args.split(),
        apis,
        call_interval_s=call_interval_s,
    )

    if "TEST_OUTPUT_JSON" in os.environ:
        # This will overwrite any other release test results.
        out_file = open(os.environ["TEST_OUTPUT_JSON"], "w")
        json.dump(results, out_file)
    print(json.dumps(results, indent=2))

    assert cleanup_release_test(test_name)


if __name__ == "__main__":
    test()
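As wired up in the release test configuration below (the shuffle_20gb_with_state_api entry), this wrapper takes the target release test path and its arguments, for example:

    python stress_tests/test_state_api_with_other_tests.py \
        nightly_tests/shuffle/shuffle_test.py \
        --test-args="--num-partitions=100 --partition-size=200e6"

It runs the target test twice, once with the periodic state API caller active and once without, and reports the latency and memory overhead as perf metrics.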
release/release_tests.yaml

@@ -3572,6 +3572,59 @@
  wait_for_nodes:
    num_nodes: 5

- name: stress_test_state_api_scale
  group: core-daily-test
  working_dir: nightly_tests
  legacy:
    test_name: stress_test_state_api_scale
    test_suite: nightly_tests
  stable: false

  frequency: nightly
  team: core
  cluster:
    cluster_env: stress_tests/stress_tests_app_config.yaml
    cluster_compute: stress_tests/stress_tests_compute.yaml

  run:
    timeout: 3600
    script: python stress_tests/test_state_api_scale.py

  type: sdk_command
  file_manager: sdk

  smoke_test:
    frequency: multi
    cluster:
      app_config: stress_tests/stress_tests_app_config.yaml
      cluster_compute: stress_tests/smoke_test_compute.yaml

    run:
      timeout: 3600
      script: python stress_tests/test_state_api_scale.py --smoke-test


- name: shuffle_20gb_with_state_api
  group: core-daily-test
  working_dir: nightly_tests
  legacy:
    test_name: shuffle_20gb_with_state_api
    test_suite: nightly_tests
  stable: false

  frequency: nightly
  team: core
  cluster:
    cluster_env: shuffle/shuffle_app_config.yaml
    cluster_compute: shuffle/shuffle_compute_single.yaml

  run:
    timeout: 1000
    script: python stress_tests/test_state_api_with_other_tests.py
      nightly_tests/shuffle/shuffle_test.py --test-args="--num-partitions=100 --partition-size=200e6"

  type: sdk_command
  file_manager: sdk

- name: stress_test_many_tasks
  group: core-daily-test
  working_dir: nightly_tests