import subprocess
from kubernetes import client, config, watch
import requests
import random
import uuid
import pathlib
import time
import ray
import os

# Global variables for the cluster information.
cluster_id = str(uuid.uuid4()).split("-")[0]
ray_cluster_name = "cluster-" + cluster_id
ray_service_name = "service-" + cluster_id
locust_id = "ray-locust-" + cluster_id

if os.environ.get("RAY_IMAGE") is not None:
    ray_image = os.environ.get("RAY_IMAGE")
elif ray.__version__ != "3.0.0.dev0":
    ray_image = f"rayproject/ray:{ray.__version__}"
elif ray.__commit__ == "{{RAY_COMMIT_SHA}}":
    ray_image = "rayproject/ray:nightly"
else:
    ray_image = f"rayproject/ray:{ray.__commit__[:6]}"

config.load_kube_config()
cli = client.CoreV1Api()

yaml_path = pathlib.Path("/tmp/ray_v1alpha1_rayservice.yaml")


def check_kuberay_installed():
    # Install the KubeRay operator and make sure its pods are up
    # in the ray-system namespace.
    KUBERAY_VERSION = "v0.3.0"
    uri = (
        "github.com/ray-project/kuberay/manifests"
        f"/base?ref={KUBERAY_VERSION}&timeout=90s"
    )
    print(
        subprocess.check_output(
            [
                "kubectl",
                "apply",
                "-k",
                uri,
            ]
        ).decode()
    )
    pods = subprocess.check_output(
        ["kubectl", "get", "pods", "--namespace", "ray-system", "--no-headers"]
    ).decode()
    assert pods.strip() != ""


def start_rayservice():
    # step-1: generate the yaml file
    print(f"Using ray image: {ray_image}")
    solution = "\n".join(
        [
            f" {line}"
            for line in pathlib.Path("./solution.py").read_text().splitlines()
        ]
    )
    locustfile = "\n".join(
        [
            f" {line}"
            for line in pathlib.Path("./locustfile.py").read_text().splitlines()
        ]
    )
    template = (
        pathlib.Path("ray_v1alpha1_rayservice_template.yaml")
        .read_text()
        .format(
            cluster_id=cluster_id,
            ray_image=ray_image,
            solution=solution,
            locustfile=locustfile,
        )
    )
    print("=== YamlFile ===")
    print(template)
    tmp_yaml = pathlib.Path("/tmp/ray_v1alpha1_rayservice.yaml")
    tmp_yaml.write_text(template)

    print("=== Get Pods from ray-system ===")
    print(
        subprocess.check_output(
            ["kubectl", "get", "pods", "--namespace", "ray-system", "--no-headers"]
        ).decode()
    )

    # step-2: create the cluster
    print(f"Creating cluster with id: {cluster_id}")
    print(subprocess.check_output(["kubectl", "create", "-f", str(tmp_yaml)]).decode())

    # step-3: make sure the ray cluster is up
    w = watch.Watch()
    start_time = time.time()
    head_pod_name = None
    for event in w.stream(
        func=cli.list_namespaced_pod,
        namespace="default",
        label_selector=f"rayCluster={ray_cluster_name},ray.io/node-type=head",
        timeout_seconds=60,
    ):
        if event["object"].status.phase == "Running":
            assert event["object"].kind == "Pod"
            head_pod_name = event["object"].metadata.name
            end_time = time.time()
            print(f"{cluster_id} started in {end_time-start_time} sec")
            print(f"head pod {head_pod_name}")
            break
    assert head_pod_name is not None

    # step-4: e2e check it's alive
    cmd = """
import requests
print(requests.get('http://localhost:8000/?val=123').text)
"""
    while True:
        try:
            resp = (
                subprocess.check_output(
                    f'kubectl exec {head_pod_name} -- python -c "{cmd}"', shell=True
                )
                .decode()
                .strip()
            )
            if resp == "375":
                print("Service is up now!")
                break
            else:
                print(f"Failed with msg {resp}")
        except Exception as e:
            print("Error", e)
        time.sleep(2)


def start_port_forward():
    proc = subprocess.Popen(
        [
            "kubectl",
            "port-forward",
            f"svc/{ray_service_name}-serve-svc",
            "8000:8000",
            "--address=0.0.0.0",
        ]
    )
    while True:
        try:
            resp = requests.get(
                "http://localhost:8000/",
                timeout=1,
                params={
                    "val": 10,
                },
            )
            if resp.status_code == 200:
                print("The ray service is ready!!!")
                break
        except requests.exceptions.Timeout:
            pass
        except requests.exceptions.ConnectionError:
            pass
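        # Not ready yet (timeout, connection refused, or a non-200 response);
        # log and retry after a short sleep.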
print("Waiting for the proxy to be alive") time.sleep(1) return proc def warmup_cluster(num_reqs): for _ in range(num_reqs): resp = requests.get( "http://localhost:8000/", timeout=1, params={ "val": 10, }, ) assert resp.status_code == 200 def start_sending_traffics(duration, users): print("=== Install locust by helm ===") yaml_config = ( pathlib.Path("locust-run.yaml") .read_text() .format(users=users, cluster_id=cluster_id, duration=int(duration)) ) print("=== Locust YAML ===") print(yaml_config) pathlib.Path("/tmp/locust-run-config.yaml").write_text(yaml_config) helm_install_logs = subprocess.check_output( [ "helm", "install", locust_id, "deliveryhero/locust", "-f", "/tmp/locust-run-config.yaml", ] ) print(helm_install_logs) proc = subprocess.Popen( [ "kubectl", "port-forward", f"svc/ray-locust-{cluster_id}", "8080:8089", "--address=0.0.0.0", ] ) return proc def dump_pods_actors(pod_name): print( subprocess.run( f"kubectl exec {pod_name} -- ps -ef | grep ::", shell=True, capture_output=True, ).stdout.decode() ) def kill_header(): pods = cli.list_namespaced_pod( "default", label_selector=f"rayCluster={ray_cluster_name},ray.io/node-type=head", ) if pods.items[0].status.phase == "Running": print(f"Killing header {pods.items[0].metadata.name}") dump_pods_actors(pods.items[0].metadata.name) cli.delete_namespaced_pod(pods.items[0].metadata.name, "default") def kill_worker(): pods = cli.list_namespaced_pod( "default", label_selector=f"rayCluster={ray_cluster_name},ray.io/node-type=worker", ) alive_pods = [ (p.status.start_time, p.metadata.name) for p in pods.items if p.status.phase == "Running" ] # sorted(alive_pods) # We kill the oldest nodes for now given the memory leak in serve. # to_be_killed = alive_pods[-1][1] to_be_killed = random.choice(alive_pods)[1] print(f"Killing worker {to_be_killed}") dump_pods_actors(pods.items[0].metadata.name) cli.delete_namespaced_pod(to_be_killed, "default") def start_killing_nodes(duration, kill_interval, kill_head_every_n): """Kill the nodes in ray cluster. 

    duration: How long we run the test (seconds)
    kill_interval: The interval between two kills (seconds)
    kill_head_every_n: For every n kills, we kill a head node
    """
    for kill_idx in range(1, int(duration / kill_interval)):
        while True:
            try:
                # kill
                if kill_idx % kill_head_every_n == 0:
                    kill_header()
                else:
                    kill_worker()
                break
            except Exception as e:
                print(f"Failed to kill node, retrying in 5 seconds: {e}")
                time.sleep(5)
        time.sleep(kill_interval)


def get_stats():
    labels = [
        f"app.kubernetes.io/instance=ray-locust-{cluster_id}",
        "app.kubernetes.io/name=locust,component=master",
    ]
    pods = cli.list_namespaced_pod("default", label_selector=",".join(labels))
    assert len(pods.items) == 1
    pod_name = pods.items[0].metadata.name
    subprocess.check_output(
        [
            "kubectl",
            "cp",
            f"{pod_name}:/home/locust/test_result_{cluster_id}_stats_history.csv",
            "./stats_history.csv",
        ]
    )
    import csv

    data = []
    with open("stats_history.csv") as f:
        reader = csv.reader(f)
        for d in reader:
            data.append(d)

    # The first 5 minutes (300 rows) are for warming up.
    offset = 300
    start_time = int(data[offset][0])
    end_time = int(data[-1][0])
    # 17 is the index for total requests
    # 18 is the index for total failed requests
    total = float(data[-1][17]) - float(data[offset][17])
    failures = float(data[-1][18]) - float(data[offset][18])
    # Return (availability, throughput).
    return (total - failures) / total, total / (end_time - start_time)


def main():
    procs = []
    try:
        check_kuberay_installed()
        start_rayservice()
        procs.append(start_port_forward())
        warmup_cluster(200)
        users = 60
        duration = 5 * 60 * 60
        procs.append(start_sending_traffics(duration * 1.1, users))
        start_killing_nodes(duration, 60, 6)
        rate, qps = get_stats()
        print("Result:", rate, qps)
        assert rate > 0.9995
        assert qps > users * 10 * 0.8
    except Exception as e:
        print("Experiment failed")
        raise e
    finally:
        print("=== Cleanup ===")
        subprocess.run(
            ["kubectl", "delete", "-f", str(yaml_path)],
            capture_output=True,
        )
        subprocess.run(
            ["helm", "uninstall", locust_id],
            capture_output=True,
        )
        print("Kill processes")
        for p in procs:
            p.kill()


if __name__ == "__main__":
    try:
        # Connect to ray so that autosuspend will not start.
        ray.init("auto")
    except Exception:
        # It doesn't matter if this fails.
        pass
    main()