[Core][State Observability] Nightly release test for state API (#26610)

* Initial

* Correctness test skeleton

* Added limit for listing

* Updated grpc config

* no more waiting

* metrics

* Updated constant and add test

* renamed

* actors

* actors

* actors

* dada

* actor dead?

* Script

* correct test name

* limit

* Added timeout

* release test /2

* Merged

* format+doc

* wip

Signed-off-by: rickyyx <ricky@anyscale.com>

* revert package-lock

Signed-off-by: rickyyx <rickyx@anyscale.com>

* wip

* results

Signed-off-by: rickyx <rickyx@anyscale.com>

Signed-off-by: rickyyx <rickyx@anyscale.com>
Signed-off-by: rickyyx <ricky@anyscale.com>
Signed-off-by: rickyx <rickyx@anyscale.com>
Co-authored-by: rickyyx <ricky@anyscale.com>
Ricky Xu 2022-08-11 07:01:01 -07:00 committed by GitHub
parent 0dceddb912
commit 5ea4747448
4 changed files with 985 additions and 0 deletions

python/ray/_private/state_api_test_utils.py

@@ -0,0 +1,294 @@
import asyncio
from copy import deepcopy
from collections import defaultdict
import concurrent.futures
from dataclasses import dataclass, field
import logging
import numpy as np
import pprint
import time
import traceback
from typing import Callable, Dict, List, Optional
import ray
from ray.actor import ActorHandle
@dataclass
class StateAPIMetric:
latency_sec: float
result_size: int
@dataclass
class StateAPICallSpec:
api: Callable
verify_cb: Callable
kwargs: Dict = field(default_factory=dict)
@dataclass
class StateAPIStats:
pending_calls: int = 0
total_calls: int = 0
calls: Dict = field(default_factory=lambda: defaultdict(list))
GLOBAL_STATE_STATS = StateAPIStats()
STATE_LIST_LIMIT = int(1e6) # 1m
STATE_LIST_TIMEOUT = 600 # 10min
def invoke_state_api(
verify_cb: Callable,
state_api_fn: Callable,
state_stats: StateAPIStats = GLOBAL_STATE_STATS,
key_suffix: Optional[str] = None,
**kwargs,
):
"""Invoke a State API
Args:
- verify_cb: Callback that takes in the response from `state_api_fn` and
returns a boolean, indicating the correctness of the results.
        - state_api_fn: The state API function to invoke.
        - state_stats: A `StateAPIStats` object in which call metrics are recorded.
        - key_suffix: Optional suffix appended to the API name to form the metric key.
- kwargs: Keyword arguments to be forwarded to the `state_api_fn`
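
    Example:
        An illustrative call (mirroring the usage in the scale test below),
        assuming `list_actors` is imported from `ray.experimental.state.api`:

            invoke_state_api(
                lambda res: len(res) > 0,
                list_actors,
                filters=[("state", "=", "ALIVE")],
            )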
"""
if "timeout" not in kwargs:
kwargs["timeout"] = STATE_LIST_TIMEOUT
# Suppress missing output warning
kwargs["raise_on_missing_output"] = False
res = None
try:
state_stats.total_calls += 1
state_stats.pending_calls += 1
t_start = time.perf_counter()
res = state_api_fn(**kwargs)
t_end = time.perf_counter()
metric = StateAPIMetric(t_end - t_start, len(res))
if key_suffix:
key = f"{state_api_fn.__name__}_{key_suffix}"
else:
key = state_api_fn.__name__
state_stats.calls[key].append(metric)
        assert verify_cb(
            res
        ), f"State API result verification failed. len(res)={len(res)}: {res}"
except Exception as e:
traceback.print_exc()
assert (
False
), f"Calling {state_api_fn.__name__}({kwargs}) failed with {repr(e)}."
finally:
state_stats.pending_calls -= 1
return res
def aggregate_perf_results(state_stats: StateAPIStats = GLOBAL_STATE_STATS):
"""Aggregate stats of state API calls
Return:
This returns a dict of below fields:
- max_{api_key_name}_latency_sec:
Max latency of call to {api_key_name}
- {api_key_name}_result_size_with_max_latency:
The size of the result (or the number of bytes for get_log API)
for the max latency invocation
- avg/p99/p95/p50_{api_key_name}_latency_sec:
The percentile latency stats
- avg_state_api_latency_sec:
The average latency of all the state apis tracked
"""
    # Deep copy to prevent the stats from being mutated while we iterate over them
state_stats = deepcopy(state_stats)
perf_result = {}
for api_key_name, metrics in state_stats.calls.items():
# Per api aggregation
# Max latency
latency_key = f"max_{api_key_name}_latency_sec"
size_key = f"{api_key_name}_result_size_with_max_latency"
metric = max(metrics, key=lambda metric: metric.latency_sec)
perf_result[latency_key] = metric.latency_sec
perf_result[size_key] = metric.result_size
latency_list = np.array([metric.latency_sec for metric in metrics])
# avg latency
key = f"avg_{api_key_name}_latency_sec"
perf_result[key] = np.average(latency_list)
# p99 latency
key = f"p99_{api_key_name}_latency_sec"
perf_result[key] = np.percentile(latency_list, 99)
# p95 latency
key = f"p95_{api_key_name}_latency_sec"
perf_result[key] = np.percentile(latency_list, 95)
# p50 latency
key = f"p50_{api_key_name}_latency_sec"
perf_result[key] = np.percentile(latency_list, 50)
all_state_api_latency = sum(
metric.latency_sec
for metric_samples in state_stats.calls.values()
for metric in metric_samples
)
perf_result["avg_state_api_latency_sec"] = (
(all_state_api_latency / state_stats.total_calls)
if state_stats.total_calls != 0
else -1
)
return perf_result
@ray.remote
class StateAPIGeneratorActor:
def __init__(
self,
apis: List[StateAPICallSpec],
call_interval_s: float = 5.0,
print_interval_s: float = 20.0,
wait_after_stop: bool = True,
) -> None:
"""An actor that periodically issues state API
Args:
- apis: List of StateAPICallSpec
- call_interval_s: State apis in the `apis` will be issued
every `call_interval_s` seconds.
- print_interval_s: How frequent state api stats will be dumped.
- wait_after_stop: When true, call to `ray.get(actor.stop.remote())`
will wait for all pending state APIs to return.
Setting it to `False` might miss some long-running state apis calls.
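
        Example:
            An illustrative sketch of driving the actor by hand (`api_spec` is
            a stand-in StateAPICallSpec); the helper
            `periodic_invoke_state_apis_with_actor` below wraps this pattern:

                actor = StateAPIGeneratorActor.remote(apis=[api_spec])
                ray.get(actor.ready.remote())
                actor.start.remote()
                ...  # run the workload under test
                ray.get(actor.stop.remote())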
"""
# Configs
self._apis = apis
self._call_interval_s = call_interval_s
self._print_interval_s = print_interval_s
self._wait_after_cancel = wait_after_stop
self._logger = logging.getLogger(self.__class__.__name__)
# States
self._tasks = None
self._fut_queue = None
self._executor = None
self._loop = None
self._stopping = False
self._stopped = False
self._stats = StateAPIStats()
async def start(self):
# Run the periodic api generator
self._fut_queue = asyncio.Queue()
self._executor = concurrent.futures.ThreadPoolExecutor()
self._tasks = [
asyncio.ensure_future(awt)
for awt in [
self._run_generator(),
self._run_result_waiter(),
self._run_stats_reporter(),
]
]
await asyncio.gather(*self._tasks)
def call(self, fn, verify_cb, **kwargs):
def run_fn():
try:
self._logger.debug(f"calling {fn.__name__}({kwargs})")
return invoke_state_api(
verify_cb, fn, state_stats=self._stats, **kwargs
)
except Exception as e:
self._logger.warning(f"{fn.__name__}({kwargs}) failed with: {repr(e)}")
return None
fut = asyncio.get_running_loop().run_in_executor(self._executor, run_fn)
return fut
async def _run_stats_reporter(self):
while not self._stopped:
            # Keep the reporter running until all pending calls have finished
            # and `self._stopped` has been set by stop()
            self._logger.info(pprint.pformat(aggregate_perf_results(self._stats)))
try:
await asyncio.sleep(self._print_interval_s)
except asyncio.CancelledError:
                self._logger.info(
                    "_run_stats_reporter cancelled, waiting for "
                    f"{self._stats.pending_calls} pending API calls to return..."
                )
async def _run_generator(self):
try:
while not self._stopping:
# Run the state API in another thread
for api_spec in self._apis:
fut = self.call(api_spec.api, api_spec.verify_cb, **api_spec.kwargs)
self._fut_queue.put_nowait(fut)
await asyncio.sleep(self._call_interval_s)
except asyncio.CancelledError:
# Stop running
self._logger.info("_run_generator cancelled, now stopping...")
return
async def _run_result_waiter(self):
try:
while not self._stopping:
fut = await self._fut_queue.get()
await fut
except asyncio.CancelledError:
self._logger.info(
f"_run_result_waiter cancelled, cancelling {self._fut_queue.qsize()} "
"pending futures..."
)
while not self._fut_queue.empty():
fut = self._fut_queue.get_nowait()
if self._wait_after_cancel:
await fut
else:
                    # Ignore the queued futures if we are not
                    # waiting on them after stop() is called
fut.cancel()
return
def get_stats(self):
# deep copy to prevent race between reporting and modifying stats
return aggregate_perf_results(self._stats)
def ready(self):
pass
def stop(self):
self._stopping = True
self._logger.debug(f"calling stop, canceling {len(self._tasks)} tasks")
for task in self._tasks:
task.cancel()
# This will block the stop() function until all futures are cancelled
# if _wait_after_cancel=True. When _wait_after_cancel=False, it will still
# wait for any in-progress futures.
# See: https://docs.python.org/3.8/library/concurrent.futures.html
self._executor.shutdown(wait=self._wait_after_cancel)
self._stopped = True
def periodic_invoke_state_apis_with_actor(*args, **kwargs) -> ActorHandle:
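    """Create a StateAPIGeneratorActor pinned to the current node and start it.

    `*args` and `**kwargs` are forwarded to the StateAPIGeneratorActor
    constructor. The returned handle can later be stopped with
    `ray.get(actor.stop.remote())`.
    """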
current_node_ip = ray._private.worker.global_worker.node_ip_address
# Schedule the actor on the current node.
actor = StateAPIGeneratorActor.options(
resources={f"node:{current_node_ip}": 0.001}
).remote(*args, **kwargs)
print("Waiting for state api actor to be ready...")
ray.get(actor.ready.remote())
print("State api actor is ready now.")
actor.start.remote()
return actor

release/nightly_tests/stress_tests/test_state_api_scale.py

@@ -0,0 +1,423 @@
import click
import json
import ray
from ray._private.ray_constants import LOG_PREFIX_ACTOR_NAME
from ray._private.state_api_test_utils import (
STATE_LIST_LIMIT,
StateAPIMetric,
aggregate_perf_results,
invoke_state_api,
GLOBAL_STATE_STATS,
)
import ray._private.test_utils as test_utils
import tqdm
import asyncio
import time
import os
from ray.experimental.state.api import (
get_log,
list_actors,
list_objects,
list_tasks,
)
GiB = 1024 * 1024 * 1024
MiB = 1024 * 1024
# We set num_cpus to zero because this actor will mostly just block on I/O.
@ray.remote(num_cpus=0)
class SignalActor:
def __init__(self):
self.ready_event = asyncio.Event()
def send(self, clear=False):
self.ready_event.set()
if clear:
self.ready_event.clear()
async def wait(self, should_wait=True):
if should_wait:
await self.ready_event.wait()
def invoke_state_api_n(*args, **kwargs):
NUM_API_CALL_SAMPLES = 10
for _ in range(NUM_API_CALL_SAMPLES):
invoke_state_api(*args, **kwargs)
def test_many_tasks(num_tasks: int):
if num_tasks == 0:
print("Skipping test with no tasks")
return
# No running tasks
invoke_state_api(
lambda res: len(res) == 0,
list_tasks,
filters=[("name", "=", "pi4_sample()"), ("scheduling_state", "=", "RUNNING")],
key_suffix="0",
limit=STATE_LIST_LIMIT,
)
# Task definition adopted from:
# https://docs.ray.io/en/master/ray-core/examples/highly_parallel.html
from random import random
SAMPLES = 100
@ray.remote
def pi4_sample(signal):
in_count = 0
for _ in range(SAMPLES):
x, y = random(), random()
if x * x + y * y <= 1:
in_count += 1
# Block on signal
ray.get(signal.wait.remote())
return in_count
results = []
signal = SignalActor.remote()
for _ in tqdm.trange(num_tasks, desc="Launching tasks"):
results.append(pi4_sample.remote(signal))
invoke_state_api_n(
lambda res: len(res) == num_tasks,
list_tasks,
filters=[("name", "=", "pi4_sample()")],
key_suffix=f"{num_tasks}",
limit=STATE_LIST_LIMIT,
)
print("Waiting for tasks to finish...")
ray.get(signal.send.remote())
ray.get(results)
# Clean up
# All compute tasks done other than the signal actor
invoke_state_api(
lambda res: len(res) == 0,
list_tasks,
filters=[("name", "=", "pi4_sample()"), ("scheduling_state", "=", "RUNNING")],
key_suffix="0",
limit=STATE_LIST_LIMIT,
)
del signal
def test_many_actors(num_actors: int):
if num_actors == 0:
print("Skipping test with no actors")
return
@ray.remote
class TestActor:
def running(self):
return True
def exit(self):
ray.actor.exit_actor()
actor_class_name = TestActor.__ray_metadata__.class_name
invoke_state_api(
lambda res: len(res) == 0,
list_actors,
filters=[("state", "=", "ALIVE"), ("class_name", "=", actor_class_name)],
key_suffix="0",
limit=STATE_LIST_LIMIT,
)
actors = [
TestActor.remote() for _ in tqdm.trange(num_actors, desc="Launching actors...")
]
waiting_actors = [actor.running.remote() for actor in actors]
print("Waiting for actors to finish...")
ray.get(waiting_actors)
invoke_state_api_n(
lambda res: len(res) == num_actors,
list_actors,
filters=[("state", "=", "ALIVE"), ("class_name", "=", actor_class_name)],
key_suffix=f"{num_actors}",
limit=STATE_LIST_LIMIT,
)
exiting_actors = [actor.exit.remote() for actor in actors]
for _ in tqdm.trange(len(actors), desc="Destroying actors..."):
        _exited, exiting_actors = ray.wait(exiting_actors)
invoke_state_api(
lambda res: len(res) == 0,
list_actors,
filters=[("state", "=", "ALIVE"), ("class_name", "=", actor_class_name)],
key_suffix="0",
limit=STATE_LIST_LIMIT,
)
def test_many_objects(num_objects, num_actors):
if num_objects == 0:
print("Skipping test with no objects")
return
@ray.remote(num_cpus=0.1)
class ObjectActor:
def __init__(self):
self.objs = []
def create_objs(self, num_objects):
import os
for _ in range(num_objects):
# Object size shouldn't matter here.
self.objs.append(ray.put(bytearray(os.urandom(1024))))
return self.objs
def exit(self):
ray.actor.exit_actor()
actors = [
ObjectActor.remote() for _ in tqdm.trange(num_actors, desc="Creating actors...")
]
# Splitting objects to multiple actors for creation,
# credit: https://stackoverflow.com/a/2135920
def _split(a, n):
k, m = divmod(len(a), n)
return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))
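
    # For example (illustrative): _split(range(10), 3) yields chunks of sizes
    # [4, 3, 3], so the objects are distributed as evenly as possible.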
num_objs_per_actor = [len(objs) for objs in _split(range(num_objects), num_actors)]
waiting_actors = [
actor.create_objs.remote(num_objs)
for actor, num_objs in zip(actors, num_objs_per_actor)
]
total_objs_created = 0
    for _ in tqdm.trange(num_actors, desc="Waiting for actors to create objects..."):
objs, waiting_actors = ray.wait(waiting_actors)
total_objs_created += len(ray.get(*objs))
assert (
total_objs_created == num_objects
), "Expect correct number of objects created."
invoke_state_api_n(
lambda res: len(res) == num_objects,
list_objects,
filters=[
("reference_type", "=", "LOCAL_REFERENCE"),
("type", "=", "Worker"),
],
key_suffix=f"{num_objects}",
limit=STATE_LIST_LIMIT,
)
exiting_actors = [actor.exit.remote() for actor in actors]
for _ in tqdm.trange(len(actors), desc="Destroying actors..."):
        _exited, exiting_actors = ray.wait(exiting_actors)
def test_large_log_file(log_file_size_byte: int):
if log_file_size_byte == 0:
print("Skipping test with 0 log file size")
return
import sys
import string
import random
import hashlib
@ray.remote
class LogActor:
def write_log(self, log_file_size_byte: int):
ctx = hashlib.md5()
prefix = f"{LOG_PREFIX_ACTOR_NAME}LogActor\n"
ctx.update(prefix.encode())
while log_file_size_byte > 0:
n = min(log_file_size_byte, 4 * MiB)
chunk = "".join(random.choices(string.ascii_letters, k=n))
sys.stdout.writelines([chunk])
ctx.update(chunk.encode())
log_file_size_byte -= n
sys.stdout.flush()
return ctx.hexdigest(), ray.get_runtime_context().node_id.hex()
actor = LogActor.remote()
expected_hash, node_id = ray.get(
actor.write_log.remote(log_file_size_byte=log_file_size_byte)
)
assert expected_hash is not None, "Empty checksum from the log actor"
assert node_id is not None, "Empty node id from the log actor"
# Retrieve the log and compare the checksum
ctx = hashlib.md5()
time_taken = 0
t_start = time.perf_counter()
for s in get_log(actor_id=actor._actor_id.hex(), tail=-1):
t_end = time.perf_counter()
time_taken += t_end - t_start
        # Exclude the checksum update from the timing:
        # only the iterator's performance is measured.
        ctx.update(s.encode())
        t_start = time.perf_counter()
    assert expected_hash == ctx.hexdigest(), "Mismatched log file checksum"
metric = StateAPIMetric(time_taken, log_file_size_byte)
GLOBAL_STATE_STATS.calls["get_log"].append(metric)
def _parse_input(
num_tasks_str: str, num_actors_str: str, num_objects_str: str, log_file_sizes: str
):
def _split_to_int(s):
tokens = s.split(",")
return [int(token) for token in tokens]
return (
_split_to_int(num_tasks_str),
_split_to_int(num_actors_str),
_split_to_int(num_objects_str),
_split_to_int(log_file_sizes),
)
def no_resource_leaks():
return test_utils.no_resource_leaks_excluding_node_resources()
@click.command()
@click.option(
"--num-tasks",
required=False,
default="1,100,1000,10000",
type=str,
help="Number of tasks to launch.",
)
@click.option(
"--num-actors",
required=False,
default="1,100,1000,5000",
type=str,
help="Number of actors to launch.",
)
@click.option(
"--num-objects",
required=False,
default="100,1000,10000,50000",
type=str,
help="Number of actors to launch.",
)
@click.option(
"--num-actors-for-objects",
required=False,
default=16,
type=int,
help="Number of actors to use for object creation.",
)
@click.option(
"--log-file-size-byte",
required=False,
default=f"{256*MiB},{1*GiB},{4*GiB}",
type=str,
help="Number of actors to launch.",
)
@click.option(
"--smoke-test",
is_flag=True,
type=bool,
default=False,
help="If set, it's a smoke test",
)
def test(
num_tasks,
num_actors,
num_objects,
num_actors_for_objects,
log_file_size_byte,
smoke_test,
):
ray.init(address="auto", log_to_driver=False)
if smoke_test:
num_tasks = "100"
num_actors = "10"
num_objects = "100"
log_file_size_byte = f"{16*MiB}"
# Parse the input
num_tasks_arr, num_actors_arr, num_objects_arr, log_file_size_arr = _parse_input(
num_tasks, num_actors, num_objects, log_file_size_byte
)
test_utils.wait_for_condition(no_resource_leaks)
monitor_actor = test_utils.monitor_memory_usage()
start_time = time.perf_counter()
# Run some long-running tasks
for n in num_tasks_arr:
print(f"\nRunning with many tasks={n}")
test_many_tasks(num_tasks=n)
print(f"\ntest_many_tasks({n}) PASS")
# Run many actors
for n in num_actors_arr:
print(f"\nRunning with many actors={n}")
test_many_actors(num_actors=n)
print(f"\ntest_many_actors({n}) PASS")
# Create many objects
for n in num_objects_arr:
print(f"\nRunning with many objects={n}")
test_many_objects(num_objects=n, num_actors=num_actors_for_objects)
print(f"\ntest_many_objects({n}) PASS")
# Create large logs
for n in log_file_size_arr:
print(f"\nRunning with large file={n} bytes")
test_large_log_file(log_file_size_byte=n)
print(f"\ntest_large_log_file({n} bytes) PASS")
print("\n\nPASS")
end_time = time.perf_counter()
# Collect mem usage
ray.get(monitor_actor.stop_run.remote())
used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())
print(f"Peak memory usage: {round(used_gb, 2)}GB")
print(f"Peak memory usage per processes:\n {usage}")
del monitor_actor
state_perf_result = aggregate_perf_results()
results = {
"time": end_time - start_time,
"success": "1",
"_peak_memory": round(used_gb, 2),
"_peak_process_memory": usage,
"perf_metrics": [
{
"perf_metric_name": "avg_state_api_latency_sec",
"perf_metric_value": state_perf_result["avg_state_api_latency_sec"],
"perf_metric_type": "LATENCY",
}
],
}
if "TEST_OUTPUT_JSON" in os.environ:
out_file = open(os.environ["TEST_OUTPUT_JSON"], "w")
json.dump(results, out_file)
results.update(state_perf_result)
print(json.dumps(results, indent=2))
if __name__ == "__main__":
test()
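
# Example invocations (illustrative; mirroring the release test configuration below):
#   python test_state_api_scale.py --smoke-test
#   python test_state_api_scale.py --num-tasks 100,1000 --num-actors 10,100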

release/nightly_tests/stress_tests/test_state_api_with_other_tests.py

@@ -0,0 +1,215 @@
import time
from typing import Dict, List
import click
import json
import os
import ray
from ray.experimental.state.api import (
list_actors,
list_nodes,
list_objects,
list_tasks,
summarize_actors,
summarize_objects,
summarize_tasks,
)
import ray._private.test_utils as test_utils
from ray._private.state_api_test_utils import (
StateAPICallSpec,
periodic_invoke_state_apis_with_actor,
STATE_LIST_LIMIT,
)
def download_release_test(test_file_path: str) -> None:
"""Download the release test file from github.
It is currently assumed individual release test is independent from each other,
and isolated in its own file.
This always downloads the file into current working directory so that this
python script could invoke the release test w/o path imports.
Args:
test_file_path: File path relevant to the `/release` folder.
Return:
Basename (file name) of the test file path if download successfully.
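
    Example:
        An illustrative call, using the shuffle test path from the release
        test configuration below:

            download_release_test("nightly_tests/shuffle/shuffle_test.py")
            # returns "shuffle_test.py"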
"""
import urllib.request as rq
import urllib.parse as parse
RAW_RAY_GITHUB_URL = (
"https://raw.githubusercontent.com/ray-project/ray/master/release/"
)
file_name = os.path.basename(test_file_path)
try:
rq.urlretrieve(parse.urljoin(RAW_RAY_GITHUB_URL, test_file_path), file_name)
return file_name
except Exception as e:
print(f"Failed to retrieve :{test_file_path} :\n{e}")
return None
def cleanup_release_test(test_file_name: str) -> bool:
try:
os.remove(test_file_name)
return True
except Exception as e:
print(f"Failed to remove file: {test_file_name}: \n{e}")
return False
def run_release_test_in_subprocess(test_file: str, args: List[str]) -> bool:
import subprocess as sp
# Run the test in subprocess
cmds = ["python", test_file, *args]
print(f"Running: {' '.join(cmds)}")
    try:
        # `check=True` raises CalledProcessError on a non-zero exit code,
        # so no separate return-code check is needed.
        sp.run(cmds, check=True, text=True, capture_output=True)
        return True
    except sp.CalledProcessError as e:
        print(f"Failed to run: {' '.join(cmds)}")
print(e)
print(e.stdout)
print(e.stderr)
return False
def run_test(test_name: str, test_args: List[str]):
monitor_actor = test_utils.monitor_memory_usage()
start = time.perf_counter()
run_release_test_in_subprocess(test_name, test_args)
end = time.perf_counter()
# Collect mem usage
ray.get(monitor_actor.stop_run.remote())
used_gb, usage = ray.get(monitor_actor.get_peak_memory_info.remote())
results = {
"duration": end - start,
"peak_memory": round(used_gb, 2),
"peak_process_memory": usage,
}
return results
def run_test_with_state_api(
test_name: str,
test_args: List[str],
apis: List[StateAPICallSpec],
call_interval_s: int = 3,
print_interval_s: int = 15,
) -> Dict:
start_time = time.perf_counter()
# Stage 1: Run with state APIs
api_caller = periodic_invoke_state_apis_with_actor(
apis=apis, call_interval_s=call_interval_s, print_interval_s=print_interval_s
)
stats_with_state_apis = run_test(test_name, test_args)
ray.get(api_caller.stop.remote())
print(json.dumps(ray.get(api_caller.get_stats.remote()), indent=2))
# Stage 2: Run without API generator
stats_without_state_apis = run_test(test_name, test_args)
end_time = time.perf_counter()
# Dumping results
results = {
"time": end_time - start_time,
"success": "1",
"perf_metrics": [
{
"perf_metric_name": "state_api_extra_latency_sec",
"perf_metric_value": stats_with_state_apis["duration"]
- stats_without_state_apis["duration"],
"perf_metric_type": "LATENCY",
},
{
"perf_metric_name": "state_api_extra_latency_sec_percentage",
"perf_metric_value": (
stats_with_state_apis["duration"]
/ stats_without_state_apis["duration"]
- 1
)
* 100,
"perf_metric_type": "LATENCY",
},
{
"perf_metric_name": "state_api_extra_mem",
"perf_metric_value": stats_with_state_apis["peak_memory"]
- stats_without_state_apis["peak_memory"],
"perf_metric_type": "MEMORY",
},
],
}
return results
@click.command()
@click.argument(
"test_path",
)
@click.option(
"--test-args",
type=str,
)
@click.option(
"--call-interval-s", type=int, default=3, help="interval of state api calls"
)
def test(
test_path,
test_args,
call_interval_s,
):
# Set up state API calling methods
def not_none(res):
return res is not None
apis = [
StateAPICallSpec(list_nodes, not_none, {"limit": STATE_LIST_LIMIT}),
StateAPICallSpec(list_objects, not_none, {"limit": STATE_LIST_LIMIT}),
StateAPICallSpec(list_tasks, not_none, {"limit": STATE_LIST_LIMIT}),
StateAPICallSpec(list_actors, not_none, {"limit": STATE_LIST_LIMIT}),
StateAPICallSpec(summarize_tasks, not_none),
StateAPICallSpec(summarize_actors, not_none),
StateAPICallSpec(summarize_objects, not_none),
]
# Set up benchmark test by downloading the release test file directly
test_name = download_release_test(test_path)
assert test_name is not None, f"Failed to retrieve release test: {test_path}"
ray.init()
results = run_test_with_state_api(
test_name,
test_args.split(),
apis,
call_interval_s=call_interval_s,
)
if "TEST_OUTPUT_JSON" in os.environ:
# This will overwrite all other release tests result
out_file = open(os.environ["TEST_OUTPUT_JSON"], "w")
json.dump(results, out_file)
print(json.dumps(results, indent=2))
assert cleanup_release_test(test_name)
if __name__ == "__main__":
test()
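
# Example invocation (taken from the release test configuration below):
#   python test_state_api_with_other_tests.py nightly_tests/shuffle/shuffle_test.py \
#       --test-args="--num-partitions=100 --partition-size=200e6"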

release/release_tests.yaml

@@ -3572,6 +3572,59 @@
wait_for_nodes:
num_nodes: 5
- name: stress_test_state_api_scale
group: core-daily-test
working_dir: nightly_tests
legacy:
test_name: stress_test_state_api_scale
test_suite: nightly_tests
stable: false
frequency: nightly
team: core
cluster:
cluster_env: stress_tests/stress_tests_app_config.yaml
cluster_compute: stress_tests/stress_tests_compute.yaml
run:
timeout: 3600
script: python stress_tests/test_state_api_scale.py
type: sdk_command
file_manager: sdk
smoke_test:
frequency: multi
cluster:
app_config: stress_tests/stress_tests_app_config.yaml
cluster_compute: stress_tests/smoke_test_compute.yaml
run:
timeout: 3600
script: python stress_tests/test_state_api_scale.py --smoke-test
- name: shuffle_20gb_with_state_api
group: core-daily-test
working_dir: nightly_tests
legacy:
test_name: shuffle_20gb_with_state_api
test_suite: nightly_tests
stable: false
frequency: nightly
team: core
cluster:
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/shuffle_compute_single.yaml
run:
timeout: 1000
    script: python stress_tests/test_state_api_with_other_tests.py nightly_tests/shuffle/shuffle_test.py --test-args="--num-partitions=100 --partition-size=200e6"
type: sdk_command
file_manager: sdk
- name: stress_test_many_tasks
group: core-daily-test
working_dir: nightly_tests