Mirror of https://github.com/vale981/ray (synced 2025-03-05 18:11:42 -05:00)
[Core][State Observability] Nightly release test for state API (#26610)
* Initial
* Correctness test skeleton
* Added limit for listing
* Updated grpc config
* no more waiting
* metrics
* Updated constant and add test
* renamed
* actors
* actors
* actors
* dada
* actor dead?
* Script
* correct test name
* limit
* Added timeout
* release test /2
* Merged
* format+doc
* wip
* revert package-lock
* wip
* results

Signed-off-by: rickyyx <ricky@anyscale.com>
Signed-off-by: rickyyx <rickyx@anyscale.com>
Signed-off-by: rickyx <rickyx@anyscale.com>
Co-authored-by: rickyyx <ricky@anyscale.com>
Parent: 0dceddb912
Commit: 5ea4747448
4 changed files with 985 additions and 0 deletions
294  python/ray/_private/state_api_test_utils.py  Normal file
@@ -0,0 +1,294 @@
import asyncio
from copy import deepcopy
from collections import defaultdict
import concurrent.futures
from dataclasses import dataclass, field
import logging
import numpy as np
import pprint
import time
import traceback
from typing import Callable, Dict, List, Optional

import ray
from ray.actor import ActorHandle


@dataclass
class StateAPIMetric:
    latency_sec: float
    result_size: int


@dataclass
class StateAPICallSpec:
    api: Callable
    verify_cb: Callable
    kwargs: Dict = field(default_factory=dict)


@dataclass
class StateAPIStats:
    pending_calls: int = 0
    total_calls: int = 0
    calls: Dict = field(default_factory=lambda: defaultdict(list))


GLOBAL_STATE_STATS = StateAPIStats()

STATE_LIST_LIMIT = int(1e6)  # 1m
STATE_LIST_TIMEOUT = 600  # 10min


def invoke_state_api(
    verify_cb: Callable,
    state_api_fn: Callable,
    state_stats: StateAPIStats = GLOBAL_STATE_STATS,
    key_suffix: Optional[str] = None,
    **kwargs,
):
    """Invoke a State API

    Args:
        - verify_cb: Callback that takes in the response from `state_api_fn` and
            returns a boolean, indicating the correctness of the results.
        - state_api_fn: Function of the state API
        - state_stats: Stats
        - key_suffix: Optional suffix appended to the stats key for this call.
        - kwargs: Keyword arguments to be forwarded to the `state_api_fn`
    """
    if "timeout" not in kwargs:
        kwargs["timeout"] = STATE_LIST_TIMEOUT

    # Suppress missing output warning
    kwargs["raise_on_missing_output"] = False

    res = None
    try:
        state_stats.total_calls += 1
        state_stats.pending_calls += 1

        t_start = time.perf_counter()
        res = state_api_fn(**kwargs)
        t_end = time.perf_counter()

        metric = StateAPIMetric(t_end - t_start, len(res))
        if key_suffix:
            key = f"{state_api_fn.__name__}_{key_suffix}"
        else:
            key = state_api_fn.__name__
        state_stats.calls[key].append(metric)
        assert verify_cb(res), f"Calling State API failed. len(res)=({len(res)}): {res}"
    except Exception as e:
        traceback.print_exc()
        assert (
            False
        ), f"Calling {state_api_fn.__name__}({kwargs}) failed with {repr(e)}."
    finally:
        state_stats.pending_calls -= 1

    return res


def aggregate_perf_results(state_stats: StateAPIStats = GLOBAL_STATE_STATS):
    """Aggregate stats of state API calls

    Return:
        This returns a dict with the below fields:
            - max_{api_key_name}_latency_sec:
                Max latency of calls to {api_key_name}
            - {api_key_name}_result_size_with_max_latency:
                The size of the result (or the number of bytes for the get_log API)
                for the max-latency invocation
            - avg/p99/p95/p50_{api_key_name}_latency_sec:
                The percentile latency stats
            - avg_state_api_latency_sec:
                The average latency of all the state APIs tracked
    """
    # Deep copy so the stats dict is not modified while we iterate over it
    state_stats = deepcopy(state_stats)
    perf_result = {}
    for api_key_name, metrics in state_stats.calls.items():
        # Per-api aggregation
        # Max latency
        latency_key = f"max_{api_key_name}_latency_sec"
        size_key = f"{api_key_name}_result_size_with_max_latency"
        metric = max(metrics, key=lambda metric: metric.latency_sec)

        perf_result[latency_key] = metric.latency_sec
        perf_result[size_key] = metric.result_size

        latency_list = np.array([metric.latency_sec for metric in metrics])
        # avg latency
        key = f"avg_{api_key_name}_latency_sec"
        perf_result[key] = np.average(latency_list)

        # p99 latency
        key = f"p99_{api_key_name}_latency_sec"
        perf_result[key] = np.percentile(latency_list, 99)

        # p95 latency
        key = f"p95_{api_key_name}_latency_sec"
        perf_result[key] = np.percentile(latency_list, 95)

        # p50 latency
        key = f"p50_{api_key_name}_latency_sec"
        perf_result[key] = np.percentile(latency_list, 50)

    all_state_api_latency = sum(
        metric.latency_sec
        for metric_samples in state_stats.calls.values()
        for metric in metric_samples
    )

    perf_result["avg_state_api_latency_sec"] = (
        (all_state_api_latency / state_stats.total_calls)
        if state_stats.total_calls != 0
        else -1
    )

    return perf_result


@ray.remote
class StateAPIGeneratorActor:
    def __init__(
        self,
        apis: List[StateAPICallSpec],
        call_interval_s: float = 5.0,
        print_interval_s: float = 20.0,
        wait_after_stop: bool = True,
    ) -> None:
        """An actor that periodically issues state API calls

        Args:
            - apis: List of StateAPICallSpec
            - call_interval_s: State APIs in `apis` will be issued
                every `call_interval_s` seconds.
            - print_interval_s: How frequently state API stats will be dumped.
            - wait_after_stop: When true, a call to `ray.get(actor.stop.remote())`
                will wait for all pending state APIs to return.
                Setting it to `False` might miss some long-running state API calls.
        """
        # Configs
        self._apis = apis
        self._call_interval_s = call_interval_s
        self._print_interval_s = print_interval_s
        self._wait_after_cancel = wait_after_stop
        self._logger = logging.getLogger(self.__class__.__name__)

        # States
        self._tasks = None
        self._fut_queue = None
        self._executor = None
        self._loop = None
        self._stopping = False
        self._stopped = False
        self._stats = StateAPIStats()

    async def start(self):
        # Run the periodic api generator
        self._fut_queue = asyncio.Queue()
        self._executor = concurrent.futures.ThreadPoolExecutor()

        self._tasks = [
            asyncio.ensure_future(awt)
            for awt in [
                self._run_generator(),
                self._run_result_waiter(),
                self._run_stats_reporter(),
            ]
        ]
        await asyncio.gather(*self._tasks)

    def call(self, fn, verify_cb, **kwargs):
        def run_fn():
            try:
                self._logger.debug(f"calling {fn.__name__}({kwargs})")
                return invoke_state_api(
                    verify_cb, fn, state_stats=self._stats, **kwargs
                )
            except Exception as e:
                self._logger.warning(f"{fn.__name__}({kwargs}) failed with: {repr(e)}")
                return None

        fut = asyncio.get_running_loop().run_in_executor(self._executor, run_fn)
        return fut

    async def _run_stats_reporter(self):
        while not self._stopped:
            # Keep the reporter running until all pending apis finish and the bool
            # `self._stopped` is then True
            self._logger.info(pprint.pprint(aggregate_perf_results(self._stats)))
            try:
                await asyncio.sleep(self._print_interval_s)
            except asyncio.CancelledError:
                self._logger.info(
                    "_run_stats_reporter cancelled, "
                    f"waiting for all {self._stats.pending_calls} pending api calls to return..."
                )

    async def _run_generator(self):
        try:
            while not self._stopping:
                # Run the state API in another thread
                for api_spec in self._apis:
                    fut = self.call(api_spec.api, api_spec.verify_cb, **api_spec.kwargs)
                    self._fut_queue.put_nowait(fut)

                await asyncio.sleep(self._call_interval_s)
        except asyncio.CancelledError:
            # Stop running
            self._logger.info("_run_generator cancelled, now stopping...")
            return

    async def _run_result_waiter(self):
        try:
            while not self._stopping:
                fut = await self._fut_queue.get()
                await fut
        except asyncio.CancelledError:
            self._logger.info(
                f"_run_result_waiter cancelled, cancelling {self._fut_queue.qsize()} "
                "pending futures..."
            )
            while not self._fut_queue.empty():
                fut = self._fut_queue.get_nowait()
                if self._wait_after_cancel:
                    await fut
                else:
                    # Ignore the queued futures if we are not
                    # waiting on them after stop() is called
                    fut.cancel()
            return

    def get_stats(self):
        # deep copy to prevent race between reporting and modifying stats
        return aggregate_perf_results(self._stats)

    def ready(self):
        pass

    def stop(self):
        self._stopping = True
        self._logger.debug(f"calling stop, canceling {len(self._tasks)} tasks")
        for task in self._tasks:
            task.cancel()

        # This will block the stop() function until all futures are cancelled
        # if _wait_after_cancel=True. When _wait_after_cancel=False, it will still
        # wait for any in-progress futures.
        # See: https://docs.python.org/3.8/library/concurrent.futures.html
        self._executor.shutdown(wait=self._wait_after_cancel)
        self._stopped = True


def periodic_invoke_state_apis_with_actor(*args, **kwargs) -> ActorHandle:
    current_node_ip = ray._private.worker.global_worker.node_ip_address
    # Schedule the actor on the current node.
    actor = StateAPIGeneratorActor.options(
        resources={f"node:{current_node_ip}": 0.001}
    ).remote(*args, **kwargs)
    print("Waiting for state api actor to be ready...")
    ray.get(actor.ready.remote())
    print("State api actor is ready now.")
    actor.start.remote()
    return actor
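For orientation, a minimal sketch of how these helpers are meant to be driven from a test script, mirroring the usage in the release tests later in this commit. The `list_actors` target, the `not_none` verification lambda, and the running Ray cluster are illustrative assumptions, not part of this file:

import ray
from ray.experimental.state.api import list_actors
from ray._private.state_api_test_utils import (
    STATE_LIST_LIMIT,
    StateAPICallSpec,
    invoke_state_api,
    periodic_invoke_state_apis_with_actor,
)

ray.init()  # assumes a local or already-running cluster

# One-off call: run list_actors, verify the result, and record a latency sample.
invoke_state_api(
    lambda res: res is not None,
    list_actors,
    key_suffix="smoke",
    limit=STATE_LIST_LIMIT,
)

# Background load: the generator actor issues the given APIs every few seconds
# until stop() is called, then get_stats() returns the aggregated latency stats.
caller = periodic_invoke_state_apis_with_actor(
    apis=[StateAPICallSpec(list_actors, lambda res: res is not None)],
    call_interval_s=5,
)
# ... run the workload under test here ...
ray.get(caller.stop.remote())
print(ray.get(caller.get_stats.remote()))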
423  release/nightly_tests/stress_tests/test_state_api_scale.py  Normal file
@@ -0,0 +1,423 @@
import click
import json
import ray
from ray._private.ray_constants import LOG_PREFIX_ACTOR_NAME
from ray._private.state_api_test_utils import (
    STATE_LIST_LIMIT,
    StateAPIMetric,
    aggregate_perf_results,
    invoke_state_api,
    GLOBAL_STATE_STATS,
)

import ray._private.test_utils as test_utils
import tqdm
import asyncio
import time
import os

from ray.experimental.state.api import (
    get_log,
    list_actors,
    list_objects,
    list_tasks,
)

GiB = 1024 * 1024 * 1024
MiB = 1024 * 1024


# We set num_cpus to zero because this actor will mostly just block on I/O.
@ray.remote(num_cpus=0)
class SignalActor:
    def __init__(self):
        self.ready_event = asyncio.Event()

    def send(self, clear=False):
        self.ready_event.set()
        if clear:
            self.ready_event.clear()

    async def wait(self, should_wait=True):
        if should_wait:
            await self.ready_event.wait()


def invoke_state_api_n(*args, **kwargs):
    NUM_API_CALL_SAMPLES = 10
    for _ in range(NUM_API_CALL_SAMPLES):
        invoke_state_api(*args, **kwargs)


def test_many_tasks(num_tasks: int):
    if num_tasks == 0:
        print("Skipping test with no tasks")
        return
    # No running tasks
    invoke_state_api(
        lambda res: len(res) == 0,
        list_tasks,
        filters=[("name", "=", "pi4_sample()"), ("scheduling_state", "=", "RUNNING")],
        key_suffix="0",
        limit=STATE_LIST_LIMIT,
    )

    # Task definition adopted from:
    # https://docs.ray.io/en/master/ray-core/examples/highly_parallel.html
    from random import random

    SAMPLES = 100

    @ray.remote
    def pi4_sample(signal):
        in_count = 0
        for _ in range(SAMPLES):
            x, y = random(), random()
            if x * x + y * y <= 1:
                in_count += 1
        # Block on signal
        ray.get(signal.wait.remote())
        return in_count

    results = []
    signal = SignalActor.remote()
    for _ in tqdm.trange(num_tasks, desc="Launching tasks"):
        results.append(pi4_sample.remote(signal))

    invoke_state_api_n(
        lambda res: len(res) == num_tasks,
        list_tasks,
        filters=[("name", "=", "pi4_sample()")],
        key_suffix=f"{num_tasks}",
        limit=STATE_LIST_LIMIT,
    )

    print("Waiting for tasks to finish...")
    ray.get(signal.send.remote())
    ray.get(results)

    # Clean up
    # All compute tasks done other than the signal actor
    invoke_state_api(
        lambda res: len(res) == 0,
        list_tasks,
        filters=[("name", "=", "pi4_sample()"), ("scheduling_state", "=", "RUNNING")],
        key_suffix="0",
        limit=STATE_LIST_LIMIT,
    )

    del signal


def test_many_actors(num_actors: int):
    if num_actors == 0:
        print("Skipping test with no actors")
        return

    @ray.remote
    class TestActor:
        def running(self):
            return True

        def exit(self):
            ray.actor.exit_actor()

    actor_class_name = TestActor.__ray_metadata__.class_name

    invoke_state_api(
        lambda res: len(res) == 0,
        list_actors,
        filters=[("state", "=", "ALIVE"), ("class_name", "=", actor_class_name)],
        key_suffix="0",
        limit=STATE_LIST_LIMIT,
    )

    actors = [
        TestActor.remote() for _ in tqdm.trange(num_actors, desc="Launching actors...")
    ]

    waiting_actors = [actor.running.remote() for actor in actors]
    print("Waiting for actors to finish...")
    ray.get(waiting_actors)

    invoke_state_api_n(
        lambda res: len(res) == num_actors,
        list_actors,
        filters=[("state", "=", "ALIVE"), ("class_name", "=", actor_class_name)],
        key_suffix=f"{num_actors}",
        limit=STATE_LIST_LIMIT,
    )

    exiting_actors = [actor.exit.remote() for actor in actors]
    for _ in tqdm.trange(len(actors), desc="Destroying actors..."):
        _exitted, exiting_actors = ray.wait(exiting_actors)

    invoke_state_api(
        lambda res: len(res) == 0,
        list_actors,
        filters=[("state", "=", "ALIVE"), ("class_name", "=", actor_class_name)],
        key_suffix="0",
        limit=STATE_LIST_LIMIT,
    )


def test_many_objects(num_objects, num_actors):
    if num_objects == 0:
        print("Skipping test with no objects")
        return

    @ray.remote(num_cpus=0.1)
    class ObjectActor:
        def __init__(self):
            self.objs = []

        def create_objs(self, num_objects):
            import os

            for _ in range(num_objects):
                # Object size shouldn't matter here.
                self.objs.append(ray.put(bytearray(os.urandom(1024))))

            return self.objs

        def exit(self):
            ray.actor.exit_actor()

    actors = [
        ObjectActor.remote() for _ in tqdm.trange(num_actors, desc="Creating actors...")
    ]

    # Splitting objects to multiple actors for creation,
    # credit: https://stackoverflow.com/a/2135920
    def _split(a, n):
        k, m = divmod(len(a), n)
        return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))

    num_objs_per_actor = [len(objs) for objs in _split(range(num_objects), num_actors)]

    waiting_actors = [
        actor.create_objs.remote(num_objs)
        for actor, num_objs in zip(actors, num_objs_per_actor)
    ]

    total_objs_created = 0
    for _ in tqdm.trange(num_actors, desc="Waiting for actors to create objects..."):
        objs, waiting_actors = ray.wait(waiting_actors)
        total_objs_created += len(ray.get(*objs))

    assert (
        total_objs_created == num_objects
    ), "Expect correct number of objects created."

    invoke_state_api_n(
        lambda res: len(res) == num_objects,
        list_objects,
        filters=[
            ("reference_type", "=", "LOCAL_REFERENCE"),
            ("type", "=", "Worker"),
        ],
        key_suffix=f"{num_objects}",
        limit=STATE_LIST_LIMIT,
    )

    exiting_actors = [actor.exit.remote() for actor in actors]
    for _ in tqdm.trange(len(actors), desc="Destroying actors..."):
        _exitted, exiting_actors = ray.wait(exiting_actors)


def test_large_log_file(log_file_size_byte: int):
    if log_file_size_byte == 0:
        print("Skipping test with 0 log file size")
        return

    import sys
    import string
    import random
    import hashlib

    @ray.remote
    class LogActor:
        def write_log(self, log_file_size_byte: int):
            ctx = hashlib.md5()
            prefix = f"{LOG_PREFIX_ACTOR_NAME}LogActor\n"
            ctx.update(prefix.encode())
            while log_file_size_byte > 0:
                n = min(log_file_size_byte, 4 * MiB)
                chunk = "".join(random.choices(string.ascii_letters, k=n))
                sys.stdout.writelines([chunk])
                ctx.update(chunk.encode())
                log_file_size_byte -= n

            sys.stdout.flush()
            return ctx.hexdigest(), ray.get_runtime_context().node_id.hex()

    actor = LogActor.remote()
    expected_hash, node_id = ray.get(
        actor.write_log.remote(log_file_size_byte=log_file_size_byte)
    )
    assert expected_hash is not None, "Empty checksum from the log actor"
    assert node_id is not None, "Empty node id from the log actor"

    # Retrieve the log and compare the checksum
    ctx = hashlib.md5()

    time_taken = 0
    t_start = time.perf_counter()
    for s in get_log(actor_id=actor._actor_id.hex(), tail=-1):
        t_end = time.perf_counter()
        time_taken += t_end - t_start
        # Not including this time
        ctx.update(s.encode())
        # Only time the iterator's performance
        t_start = time.perf_counter()

    assert expected_hash == ctx.hexdigest(), "Log file checksum mismatch"

    metric = StateAPIMetric(time_taken, log_file_size_byte)
    GLOBAL_STATE_STATS.calls["get_log"].append(metric)


def _parse_input(
    num_tasks_str: str, num_actors_str: str, num_objects_str: str, log_file_sizes: str
):
    def _split_to_int(s):
        tokens = s.split(",")
        return [int(token) for token in tokens]

    return (
        _split_to_int(num_tasks_str),
        _split_to_int(num_actors_str),
        _split_to_int(num_objects_str),
        _split_to_int(log_file_sizes),
    )


def no_resource_leaks():
    return test_utils.no_resource_leaks_excluding_node_resources()


@click.command()
@click.option(
    "--num-tasks",
    required=False,
    default="1,100,1000,10000",
    type=str,
    help="Number of tasks to launch.",
)
@click.option(
    "--num-actors",
    required=False,
    default="1,100,1000,5000",
    type=str,
    help="Number of actors to launch.",
)
@click.option(
    "--num-objects",
    required=False,
    default="100,1000,10000,50000",
    type=str,
    help="Number of objects to create.",
)
@click.option(
    "--num-actors-for-objects",
    required=False,
    default=16,
    type=int,
    help="Number of actors to use for object creation.",
)
@click.option(
    "--log-file-size-byte",
    required=False,
    default=f"{256*MiB},{1*GiB},{4*GiB}",
    type=str,
    help="Size of the log file to write, in bytes.",
)
@click.option(
    "--smoke-test",
    is_flag=True,
    type=bool,
    default=False,
    help="If set, it's a smoke test",
)
def test(
    num_tasks,
    num_actors,
    num_objects,
    num_actors_for_objects,
    log_file_size_byte,
    smoke_test,
):
    ray.init(address="auto", log_to_driver=False)

    if smoke_test:
        num_tasks = "100"
        num_actors = "10"
        num_objects = "100"
        log_file_size_byte = f"{16*MiB}"

    # Parse the input
    num_tasks_arr, num_actors_arr, num_objects_arr, log_file_size_arr = _parse_input(
        num_tasks, num_actors, num_objects, log_file_size_byte
    )

    test_utils.wait_for_condition(no_resource_leaks)
    monitor_actor = test_utils.monitor_memory_usage()
    start_time = time.perf_counter()
    # Run some long-running tasks
    for n in num_tasks_arr:
        print(f"\nRunning with many tasks={n}")
        test_many_tasks(num_tasks=n)
        print(f"\ntest_many_tasks({n}) PASS")

    # Run many actors
    for n in num_actors_arr:
        print(f"\nRunning with many actors={n}")
        test_many_actors(num_actors=n)
        print(f"\ntest_many_actors({n}) PASS")

    # Create many objects
    for n in num_objects_arr:
        print(f"\nRunning with many objects={n}")
        test_many_objects(num_objects=n, num_actors=num_actors_for_objects)
        print(f"\ntest_many_objects({n}) PASS")

    # Create large logs
    for n in log_file_size_arr:
        print(f"\nRunning with large file={n} bytes")
        test_large_log_file(log_file_size_byte=n)
        print(f"\ntest_large_log_file({n} bytes) PASS")

    print("\n\nPASS")
    end_time = time.perf_counter()

    # Collect mem usage
    ray.get(monitor_actor.stop_run.remote())
    used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())
    print(f"Peak memory usage: {round(used_gb, 2)}GB")
    print(f"Peak memory usage per process:\n {usage}")
    del monitor_actor

    state_perf_result = aggregate_perf_results()
    results = {
        "time": end_time - start_time,
        "success": "1",
        "_peak_memory": round(used_gb, 2),
        "_peak_process_memory": usage,
        "perf_metrics": [
            {
                "perf_metric_name": "avg_state_api_latency_sec",
                "perf_metric_value": state_perf_result["avg_state_api_latency_sec"],
                "perf_metric_type": "LATENCY",
            }
        ],
    }
    if "TEST_OUTPUT_JSON" in os.environ:
        out_file = open(os.environ["TEST_OUTPUT_JSON"], "w")
        json.dump(results, out_file)

    results.update(state_perf_result)
    print(json.dumps(results, indent=2))


if __name__ == "__main__":
    test()
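A quick illustration of how the comma-separated CLI strings above are expanded into the per-scale sweeps; the argument values shown are just the smoke-test defaults from this script:

# Each comma-separated CLI string becomes a list of scales to sweep over.
num_tasks, num_actors, num_objects, log_sizes = _parse_input(
    "100", "10", "100", f"{16 * MiB}"
)
# -> ([100], [10], [100], [16777216])
# With the full default "--num-tasks 1,100,1000,10000", the first list
# would instead be [1, 100, 1000, 10000], and each scale is run in turn.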
215  release/nightly_tests/stress_tests/test_state_api_with_other_tests.py  Normal file

@@ -0,0 +1,215 @@
import time
from typing import Dict, List, Optional
import click
import json
import os

import ray

from ray.experimental.state.api import (
    list_actors,
    list_nodes,
    list_objects,
    list_tasks,
    summarize_actors,
    summarize_objects,
    summarize_tasks,
)

import ray._private.test_utils as test_utils

from ray._private.state_api_test_utils import (
    StateAPICallSpec,
    periodic_invoke_state_apis_with_actor,
    STATE_LIST_LIMIT,
)


def download_release_test(test_file_path: str) -> Optional[str]:
    """Download the release test file from github.

    It is currently assumed that each release test is independent from the
    others and isolated in its own file.

    This always downloads the file into the current working directory so that
    this python script can invoke the release test w/o path imports.

    Args:
        test_file_path: File path relative to the `/release` folder.

    Return:
        Basename (file name) of the test file path if downloaded successfully.
    """
    import urllib.request as rq
    import urllib.parse as parse

    RAW_RAY_GITHUB_URL = (
        "https://raw.githubusercontent.com/ray-project/ray/master/release/"
    )
    file_name = os.path.basename(test_file_path)
    try:
        rq.urlretrieve(parse.urljoin(RAW_RAY_GITHUB_URL, test_file_path), file_name)
        return file_name
    except Exception as e:
        print(f"Failed to retrieve {test_file_path}:\n{e}")
        return None


def cleanup_release_test(test_file_name: str) -> bool:
    try:
        os.remove(test_file_name)
        return True
    except Exception as e:
        print(f"Failed to remove file {test_file_name}:\n{e}")
        return False


def run_release_test_in_subprocess(test_file: str, args: List[str]) -> bool:
    import subprocess as sp

    # Run the test in a subprocess
    cmds = ["python", test_file, *args]

    print(f"Running: {' '.join(cmds)}")
    proc = None
    try:
        proc = sp.run(cmds, check=True, text=True, capture_output=True)
        proc.check_returncode()
        return True
    except sp.CalledProcessError as e:
        print(f"Failed to run: {' '.join(cmds)}")
        print(e)
        print(e.stdout)
        print(e.stderr)
        return False


def run_test(test_name: str, test_args: List[str]):
    monitor_actor = test_utils.monitor_memory_usage()

    start = time.perf_counter()
    run_release_test_in_subprocess(test_name, test_args)
    end = time.perf_counter()
    # Collect mem usage
    ray.get(monitor_actor.stop_run.remote())
    used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())

    results = {
        "duration": end - start,
        "peak_memory": round(used_gb, 2),
        "peak_process_memory": usage,
    }
    return results


def run_test_with_state_api(
    test_name: str,
    test_args: List[str],
    apis: List[StateAPICallSpec],
    call_interval_s: int = 3,
    print_interval_s: int = 15,
) -> Dict:
    start_time = time.perf_counter()

    # Stage 1: Run with state APIs
    api_caller = periodic_invoke_state_apis_with_actor(
        apis=apis, call_interval_s=call_interval_s, print_interval_s=print_interval_s
    )

    stats_with_state_apis = run_test(test_name, test_args)
    ray.get(api_caller.stop.remote())
    print(json.dumps(ray.get(api_caller.get_stats.remote()), indent=2))

    # Stage 2: Run without the API generator
    stats_without_state_apis = run_test(test_name, test_args)
    end_time = time.perf_counter()

    # Dumping results
    results = {
        "time": end_time - start_time,
        "success": "1",
        "perf_metrics": [
            {
                "perf_metric_name": "state_api_extra_latency_sec",
                "perf_metric_value": stats_with_state_apis["duration"]
                - stats_without_state_apis["duration"],
                "perf_metric_type": "LATENCY",
            },
            {
                "perf_metric_name": "state_api_extra_latency_sec_percentage",
                "perf_metric_value": (
                    stats_with_state_apis["duration"]
                    / stats_without_state_apis["duration"]
                    - 1
                )
                * 100,
                "perf_metric_type": "LATENCY",
            },
            {
                "perf_metric_name": "state_api_extra_mem",
                "perf_metric_value": stats_with_state_apis["peak_memory"]
                - stats_without_state_apis["peak_memory"],
                "perf_metric_type": "MEMORY",
            },
        ],
    }

    return results


@click.command()
@click.argument(
    "test_path",
)
@click.option(
    "--test-args",
    type=str,
)
@click.option(
    "--call-interval-s", type=int, default=3, help="interval of state api calls"
)
def test(
    test_path,
    test_args,
    call_interval_s,
):
    # Set up state API calling methods
    def not_none(res):
        return res is not None

    apis = [
        StateAPICallSpec(list_nodes, not_none, {"limit": STATE_LIST_LIMIT}),
        StateAPICallSpec(list_objects, not_none, {"limit": STATE_LIST_LIMIT}),
        StateAPICallSpec(list_tasks, not_none, {"limit": STATE_LIST_LIMIT}),
        StateAPICallSpec(list_actors, not_none, {"limit": STATE_LIST_LIMIT}),
        StateAPICallSpec(summarize_tasks, not_none),
        StateAPICallSpec(summarize_actors, not_none),
        StateAPICallSpec(summarize_objects, not_none),
    ]

    # Set up the benchmark test by downloading the release test file directly
    test_name = download_release_test(test_path)
    assert test_name is not None, f"Failed to retrieve release test: {test_path}"

    ray.init()
    results = run_test_with_state_api(
        test_name,
        test_args.split(),
        apis,
        call_interval_s=call_interval_s,
    )

    if "TEST_OUTPUT_JSON" in os.environ:
        # This will overwrite all other release test results
        out_file = open(os.environ["TEST_OUTPUT_JSON"], "w")
        json.dump(results, out_file)
    print(json.dumps(results, indent=2))

    assert cleanup_release_test(test_name)


if __name__ == "__main__":
    test()
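To make the reported overhead metrics concrete, a hypothetical illustration of the computation in run_test_with_state_api; the duration and memory figures below are made up, not measured results:

# Stage 1 runs the workload while the generator actor hammers the state APIs;
# stage 2 runs the same workload without it. The deltas become the perf metrics.
stats_with_state_apis = {"duration": 330.0, "peak_memory": 12.5}
stats_without_state_apis = {"duration": 300.0, "peak_memory": 12.1}

extra_latency_sec = (
    stats_with_state_apis["duration"] - stats_without_state_apis["duration"]
)  # 30.0 seconds of extra wall time
extra_latency_pct = (
    stats_with_state_apis["duration"] / stats_without_state_apis["duration"] - 1
) * 100  # 10.0 percent slowdown
extra_mem_gb = (
    stats_with_state_apis["peak_memory"] - stats_without_state_apis["peak_memory"]
)  # ~0.4 GB of extra peak memory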
@@ -3572,6 +3572,59 @@
    wait_for_nodes:
      num_nodes: 5

- name: stress_test_state_api_scale
  group: core-daily-test
  working_dir: nightly_tests
  legacy:
    test_name: stress_test_state_api_scale
    test_suite: nightly_tests
  stable: false

  frequency: nightly
  team: core
  cluster:
    cluster_env: stress_tests/stress_tests_app_config.yaml
    cluster_compute: stress_tests/stress_tests_compute.yaml

  run:
    timeout: 3600
    script: python stress_tests/test_state_api_scale.py
    type: sdk_command
    file_manager: sdk

  smoke_test:
    frequency: multi
    cluster:
      app_config: stress_tests/stress_tests_app_config.yaml
      cluster_compute: stress_tests/smoke_test_compute.yaml

    run:
      timeout: 3600
      script: python stress_tests/test_state_api_scale.py --smoke-test


- name: shuffle_20gb_with_state_api
  group: core-daily-test
  working_dir: nightly_tests
  legacy:
    test_name: shuffle_20gb_with_state_api
    test_suite: nightly_tests
  stable: false

  frequency: nightly
  team: core
  cluster:
    cluster_env: shuffle/shuffle_app_config.yaml
    cluster_compute: shuffle/shuffle_compute_single.yaml

  run:
    timeout: 1000
    script: python stress_tests/test_state_api_with_other_tests.py
      nightly_tests/shuffle/shuffle_test.py --test-args="--num-partitions=100 --partition-size=200e6"

    type: sdk_command
    file_manager: sdk

- name: stress_test_many_tasks
  group: core-daily-test
  working_dir: nightly_tests