2021-06-16 21:37:17 +01:00
|
|
|
import json
|
|
|
|
import os
|
2020-05-01 21:07:14 -05:00
|
|
|
import random
|
|
|
|
import string
|
|
|
|
import time
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
import ray
|
|
|
|
from ray import serve
|
|
|
|
from ray.cluster_utils import Cluster
|
|
|
|
|
2021-06-28 10:01:55 -07:00
|
|
|
# Module-level constants. Keep them directly below the imports.

# --- Ray Serve deployment settings ---
NUM_REPLICAS = 7
MAX_BATCH_SIZE = 16

# --- Local cluster settings ---
NUM_REDIS_SHARDS = 1
REDIS_MAX_MEMORY = 100_000_000  # == 10**8
OBJECT_STORE_MEMORY = 100_000_000  # == 10**8
NUM_NODES = 4

# --- RandomTest settings ---
CPUS_PER_NODE = 10

# True whenever the IS_SMOKE_TEST environment variable is present, whatever
# its value (an empty string still counts as "set").
IS_SMOKE_TEST = os.environ.get("IS_SMOKE_TEST") is not None
|
|
|
|
|
2021-06-16 21:37:17 +01:00
|
|
|
|
|
|
|
def update_progress(result):
    """
    Write the test result as JSON to the release-test output file.

    The destination path is read from the TEST_OUTPUT_JSON environment
    variable and defaults to /tmp/release_test_output.json. The payload is
    first written to a sibling temporary file and then moved over the
    destination with os.replace, so a process reading the file never sees
    a truncated or half-written document, and a failed write leaves the
    previous contents untouched.

    Args:
        result: JSON-serializable dict describing the current progress.
            It is modified in place: a "last_update" key is set to the
            current time.time() value before writing.

    Raises:
        OSError: if the temporary file cannot be written or moved.
        TypeError: if result holds a value json cannot serialize.
    """
    result["last_update"] = time.time()
    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/release_test_output.json")
    # Include the pid so two processes targeting the same output path do not
    # write through the same temporary file.
    tmp_path = "{}.{}.tmp".format(test_output_json, os.getpid())
    try:
        with open(tmp_path, "wt") as f:
            json.dump(result, f)
        # Same directory as the destination, so this is a plain rename.
        os.replace(tmp_path, test_output_json)
    except BaseException:
        # Do not leave a stray partial file behind; the temp file may not
        # exist at all if open() itself failed.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
|
|
|
|
|
|
|
|
|
2020-05-01 21:07:14 -05:00
|
|
|
# Build a local cluster of NUM_NODES nodes. Only the first node (index 0)
# receives the Redis port and shard count; every other node passes None for
# both. Each node also advertises a custom resource named after its index.
cluster = Cluster()
for node_index in range(NUM_NODES):
    is_first_node = node_index == 0
    cluster.add_node(
        redis_port=6379 if is_first_node else None,
        num_redis_shards=NUM_REDIS_SHARDS if is_first_node else None,
        num_cpus=16,
        num_gpus=0,
        resources={str(node_index): 2},
        object_store_memory=OBJECT_STORE_MEMORY,
        redis_max_memory=REDIS_MAX_MEMORY,
        dashboard_host="0.0.0.0",
    )

# Attach this driver to the cluster created above, then start Serve in
# detached mode.
ray.init(
    address=cluster.address,
    namespace="serve_failure_test",
    dashboard_host="0.0.0.0",
    log_to_driver=True,
)
serve.start(detached=True)
|
2020-05-01 21:07:14 -05:00
|
|
|
|
|
|
|
|
|
|
|
@ray.remote
class RandomKiller:
    """Actor that repeatedly kills one randomly chosen Serve actor.

    Candidates are the HTTP proxies, the Serve controller and every running
    replica. Kills are issued with no_restart=False.
    """

    def __init__(self, kill_period_s=1):
        # Seconds to sleep after each kill.
        self.kill_period_s = kill_period_s

    def _get_all_serve_actors(self):
        """Return a list of handles: proxies, then controller, then replicas."""
        controller = serve.api._get_global_client()._controller
        proxy_handles = ray.get(controller.get_http_proxies.remote()).values()
        # The controller reports a mapping whose values are lists of replica
        # info objects; only their actor handles are needed here.
        replicas_by_key = ray.get(controller._all_running_replicas.remote())
        replica_handles = [
            replica_info.actor_handle
            for replica_info_list in replicas_by_key.values()
            for replica_info in replica_info_list
        ]
        return [*proxy_handles, controller, *replica_handles]

    def run(self):
        """Kill a random Serve actor every kill_period_s seconds, forever."""
        while True:
            candidates = self._get_all_serve_actors()
            chosen = random.choice(candidates)
            print(f"Killing {chosen}")
            ray.kill(chosen, no_restart=False)
            time.sleep(self.kill_period_s)
|
|
|
|
|
|
|
|
|
|
|
|
class RandomTest:
    """Workload that randomly creates and verifies Serve deployments."""

    def __init__(self, max_deployments=1):
        """Record the deployment limit and create that many deployments."""
        self.max_deployments = max_deployments
        # (callable, weight) pairs; run() samples one callable per step with
        # probability proportional to its weight.
        self.weighted_actions = [
            (self.create_deployment, 1),
            (self.verify_deployment, 4),
        ]
        self.deployments = []
        for _ in range(max_deployments):
            self.create_deployment()

    def create_deployment(self):
        """Deploy a handler under a fresh random name.

        When the limit is already reached, the most recently added
        deployment is deleted first to make room.
        """
        if len(self.deployments) == self.max_deployments:
            evicted_name = self.deployments.pop()
            serve.get_deployment(evicted_name).delete()

        # Ten random ASCII letters.
        new_name = "".join(
            random.choice(string.ascii_letters) for _ in range(10))

        @serve.deployment(name=new_name)
        def handler(self, *args):
            # Respond with the deployment's own name so verify_deployment
            # can compare the response body against it.
            return new_name

        handler.deploy()
        self.deployments.append(new_name)

    def verify_deployment(self):
        """Send 100 GET requests to one random deployment.

        A request that raises or returns an unexpected body is reported on
        stdout and followed by a short sleep; it never aborts the loop.
        """
        deployment = random.choice(self.deployments)
        url = "http://127.0.0.1:8000/" + deployment
        for _ in range(100):
            try:
                response = requests.get(url)
                assert response.text == deployment
            except Exception:
                print("Request to {} failed.".format(deployment))
                time.sleep(0.01)

    def run(self):
        """Run batches of 20 random actions and report progress per batch.

        Loops forever unless IS_SMOKE_TEST is true, in which case it stops
        after the first batch.
        """
        iteration = 0
        start_time = previous_time = time.time()
        while True:
            for _ in range(20):
                actions, weights = zip(*self.weighted_actions)
                (action_chosen,) = random.choices(actions, weights=weights)
                print(f"Executing {action_chosen}")
                action_chosen()

            new_time = time.time()
            iteration_time = new_time - previous_time
            elapsed_time = new_time - start_time
            print("Iteration {}:\n"
                  "  - Iteration time: {}.\n"
                  "  - Absolute time: {}.\n"
                  "  - Total elapsed time: {}.".format(
                      iteration, iteration_time, new_time, elapsed_time))
            update_progress({
                "iteration": iteration,
                "iteration_time": iteration_time,
                "absolute_time": new_time,
                "elapsed_time": elapsed_time,
            })
            previous_time = new_time
            iteration += 1

            if IS_SMOKE_TEST:
                break
|
|
|
|
|
2020-05-01 21:07:14 -05:00
|
|
|
|
2021-11-16 08:12:08 -08:00
|
|
|
# Script entry point.
# Build the tester first: its constructor creates the initial deployments,
# so they are created before the killer starts.
deployment_limit = NUM_NODES * CPUS_PER_NODE
tester = RandomTest(max_deployments=deployment_limit)

# Start the killer actor. RandomKiller.run loops forever, so the returned
# reference is deliberately not waited on.
random_killer = RandomKiller.remote()
random_killer.run.remote()

# Blocks this driver; returns only when IS_SMOKE_TEST is set.
tester.run()
|