[nightly] Add serve ha chaos test into nightly test. (#27413)

This PR adds a Serve HA chaos test. The flow of the test is:

1. Check that the KubeRay operator is installed.
2. Start the Ray service.
3. Warm up the cluster.
4. Start killing nodes.
5. Get the stats and make sure they look good.
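
Roughly, these steps map onto the driver script added below (run_gcs_ft_on_k8s.py). The following is a condensed sketch of its main(), assuming the helper functions defined in that script are in scope; the constants are the ones the script actually uses:

# Condensed sketch of the chaos-test driver; see run_gcs_ft_on_k8s.py below for the
# full version (cleanup, error handling, and port-forward process management omitted).
users, duration = 60, 5 * 60 * 60              # 60 Locust users, 5 hours of chaos

check_kuberay_installed()                      # 1. the KubeRay operator is deployed
start_rayservice()                             # 2. render + apply the RayService YAML, wait until it answers
start_port_forward()                           #    expose the Serve service locally
warmup_cluster(200)                            # 3. send 200 requests before measuring
start_sending_traffics(duration * 1.1, users)  #    run Locust slightly longer than the chaos loop
start_killing_nodes(duration, 60, 6)           # 4. kill a pod every 60s; every 6th kill is the head
rate, qps = get_stats()                        # 5. availability and throughput from Locust's CSV
assert rate > 0.9995
assert qps > users * 10 * 0.8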
Yi Cheng 2022-08-29 23:55:36 +00:00 committed by GitHub
parent 8934a8d32b
commit 4d91f516ca
8 changed files with 782 additions and 13 deletions


@@ -20,3 +20,5 @@ post_build_cmds:
- curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -
- sudo apt-get update && sudo apt-get install google-cloud-cli
- sudo apt-get install google-cloud-sdk-gke-gcloud-auth-plugin
- curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
- helm repo add deliveryhero https://charts.deliveryhero.io/


@@ -22,8 +22,3 @@ aws:
          Value: '{{env["ANYSCALE_USER"]}}'
        - Key: anyscale-expiration
          Value: '{{env["EXPIRATION_2D"]}}'
  BlockDeviceMappings:
    - DeviceName: /dev/sda1
      Ebs:
        VolumeSize: 200


@@ -0,0 +1,19 @@
loadtest:
  name: serve_ha_{cluster_id}
  locust_host: "http://service-{cluster_id}-serve-svc:8000"
  locust_locustfile_configmap: locusttest-{cluster_id}
  locust_lib_configmap: locusttest-{cluster_id}
  locust_locustfile: locustfile.py
worker:
  replicas: 4
master:
  args:
    - "--spawn-rate=20"
    - "--users={users}"
    - "--autostart"
    - "--run-time={duration}s"
    - "--csv=test_result_{cluster_id}"
    - "--print-stats"
    - "--csv-full-history"
    - "--reset-stats"
    - "--enable-rebalancing"


@@ -0,0 +1,33 @@
import locust
from locust import task, HttpUser
import logging
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

logger = logging.getLogger(__name__)


class TimeoutHTTPAdapter(HTTPAdapter):
    def __init__(self, timeout, *args, **kwargs):
        self.timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        timeout = kwargs.get("timeout")
        if timeout is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)


class ServeTest(HttpUser):
    wait_time = locust.wait_time.constant_throughput(10)

    def on_start(self):
        retries = Retry(
            total=3, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504]
        )
        self.client.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=1))

    @task
    def index(self):
        self.client.get("/?val=3")
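
Because constant_throughput(10) caps each simulated user at roughly 10 requests per second, the steady-state load is about users * 10 QPS; the driver's final assertion allows a 20% margin on that. A quick check of the arithmetic, using the values from main() in run_gcs_ft_on_k8s.py:

users = 60            # main() starts 60 Locust users
per_user_rps = 10     # constant_throughput(10) above
expected_qps = users * per_user_rps   # ~600 requests/second at steady state
assert expected_qps * 0.8 == 480      # the test asserts qps > users * 10 * 0.8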


@@ -0,0 +1,309 @@
kind: ConfigMap
apiVersion: v1
metadata:
  name: locusttest-{cluster_id}
data:
  locustfile.py: |
{locustfile}
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: script-{cluster_id}
data:
  solution.py: |
{solution}
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: redis-config-{cluster_id}
  labels:
    app: redis
data:
  redis.conf: |-
    dir /data
    port 6379
    bind 0.0.0.0
    appendonly yes
    protected-mode no
    requirepass 5241590000000000
    pidfile /data/redis-6379.pid
---
apiVersion: v1
kind: Service
metadata:
  name: redis-{cluster_id}
  labels:
    app: redis
spec:
  type: ClusterIP
  ports:
    - name: redis
      port: 6379
  selector:
    app: redis
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis-{cluster_id}
  labels:
    app: redis
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
    spec:
      containers:
        - name: redis
          image: redis:5.0.8
          command:
            - "sh"
            - "-c"
            - "redis-server /usr/local/etc/redis/redis.conf"
          ports:
            - containerPort: 6379
          volumeMounts:
            - name: config
              mountPath: /usr/local/etc/redis/redis.conf
              subPath: redis.conf
      volumes:
        - name: config
          configMap:
            name: redis-config-{cluster_id}
---
apiVersion: ray.io/v1alpha1
kind: RayService
metadata:
  name: service-{cluster_id}
  annotations:
    ray.io/ft-enabled: "true"
spec:
  serviceUnhealthySecondThreshold: 300
  deploymentUnhealthySecondThreshold: 300
  serveConfig:
    importPath: solution.serve_entrypoint
    runtimeEnv: |
      env_vars:
        PYTHONPATH: "/tmp/testing/"
    deployments:
      - name: a
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: b
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: c
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: d
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: e
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: DAGDriver
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
  rayClusterConfig:
    rayVersion: '3.0.0.dev0' # should match the Ray version in the image of the containers
    ######################headGroupSpecs#################################
    # head group template and specs, (perhaps 'group' is not needed in the name)
    headGroupSpec:
      # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
      serviceType: ClusterIP
      # the pod replicas in this group typed head (assuming there could be more than 1 in the future)
      replicas: 1
      # logical group name, for this called head-group, also can be functional
      # pod type head or worker
      # rayNodeType: head # Not needed since it is under the headgroup
      # the following params are used to complete the ray start: ray start --head --block --redis-port=6379 ...
      rayStartParams:
        port: '6379' # should match container port named gcs-server
        object-store-memory: '100000000'
        dashboard-host: '0.0.0.0'
        num-cpus: '0' # can be auto-completed from the limits
        node-ip-address: $MY_POD_IP # auto-completed as the head pod IP
        block: 'true'
      #pod template
      template:
        metadata:
          labels:
            rayCluster: cluster-{cluster_id}
            rayNodeType: head # will be injected if missing, must be head or worker
            groupName: headgroup # will be injected if missing
          # annotations for pod
          annotations:
            key: value
        spec:
          volumes:
            - name: script
              configMap:
                name: script-{cluster_id}
            - name: log-volume
              emptyDir: {{}}
          containers:
            - name: ray-head
              image: {ray_image}
              imagePullPolicy: Always
              env:
                - name: MY_POD_IP
                  valueFrom:
                    fieldRef:
                      fieldPath: status.podIP
                - name: RAY_REDIS_ADDRESS
                  value: redis-{cluster_id}:6379
                - name: RAY_gcs_rpc_server_reconnect_timeout_s
                  value: "600"
                - name: RAY_num_heartbeats_timeout
                  value: "120"
                - name: RAY_gcs_failover_worker_reconnect_timeout
                  value: "600"
              resources:
                limits:
                  cpu: 2
                requests:
                  cpu: 2
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265 # Ray dashboard
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              volumeMounts:
                - name: script
                  mountPath: /tmp/testing/solution.py
                  subPath: solution.py
                - mountPath: /tmp/ray/
                  name: log-volume
    workerGroupSpecs:
      # the pod replicas in this group typed worker
      - replicas: 12
        minReplicas: 12
        maxReplicas: 12
        # logical group name, for this called small-group, also can be functional
        groupName: small-group
        # if worker pods need to be added, we can simply increment the replicas
        # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
        # the operator will remove pods from the list until the number of replicas is satisfied
        # when a pod is confirmed to be deleted, its name will be removed from the list below
        #scaleStrategy:
        #  workersToDelete:
        #  - raycluster-complete-worker-small-group-bdtwh
        #  - raycluster-complete-worker-small-group-hv457
        #  - raycluster-complete-worker-small-group-k8tj7
        # the following params are used to complete the ray start: ray start --block --node-ip-address= ...
        rayStartParams:
          node-ip-address: $MY_POD_IP
          block: 'true'
          num-cpus: '4' # can be auto-completed from the limits
        #pod template
        template:
          metadata:
            labels:
              key: value
              rayCluster: cluster-{cluster_id}
            # annotations for pod
            annotations:
              key: value
          spec:
            initContainers:
              # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
              - name: init-myservice
                image: busybox:1.28
                command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
            volumes:
              - name: script
                configMap:
                  name: script-{cluster_id}
              - name: log-volume
                emptyDir: {{}}
            containers:
              - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
                image: {ray_image}
                imagePullPolicy: Always
                livenessProbe:
                  initialDelaySeconds: 30
                  periodSeconds: 5
                  timeoutSeconds: 10
                readinessProbe:
                  initialDelaySeconds: 30
                  periodSeconds: 5
                  timeoutSeconds: 10
                # environment variables to set in the container. Optional.
                # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
                env:
                  - name: RAY_DISABLE_DOCKER_CPU_WARNING
                    value: "1"
                  - name: TYPE
                    value: "worker"
                  - name: CPU_REQUEST
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: requests.cpu
                  - name: CPU_LIMITS
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: limits.cpu
                  - name: MEMORY_LIMITS
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: limits.memory
                  - name: MEMORY_REQUESTS
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: requests.memory
                  - name: MY_POD_NAME
                    valueFrom:
                      fieldRef:
                        fieldPath: metadata.name
                  - name: MY_POD_IP
                    valueFrom:
                      fieldRef:
                        fieldPath: status.podIP
                  - name: RAY_gcs_rpc_server_reconnect_timeout_s
                    value: "600"
                  - name: RAY_num_heartbeats_timeout
                    value: "120"
                  - name: RAY_gcs_failover_worker_reconnect_timeout
                    value: "600"
                  - name: RAY_gcs_server_request_timeout_seconds
                    value: "5"
                ports:
                  - containerPort: 80
                    name: client
                lifecycle:
                  preStop:
                    exec:
                      command: ["/bin/sh","-c","ray stop"]
                resources:
                  limits:
                    cpu: "2"
                  requests:
                    cpu: "2"
                volumeMounts:
                  - name: script
                    mountPath: /tmp/testing/solution.py
                    subPath: solution.py
                  - mountPath: /tmp/ray/
                    name: log-volume


@@ -1,10 +1,371 @@
from kubernetes import client, config
import subprocess
from kubernetes import client, config, watch
import requests
import random
import uuid
import pathlib
import time
import ray
import os

# global variables for the cluster information
cluster_id = str(uuid.uuid4()).split("-")[0]
ray_cluster_name = "cluster-" + cluster_id
ray_service_name = "service-" + cluster_id
locust_id = "ray-locust-" + cluster_id

if os.environ.get("RAY_IMAGE") is not None:
    ray_image = os.environ.get("RAY_IMAGE")
elif ray.__version__ != "3.0.0.dev0":
    ray_image = f"rayproject/ray:{ray.__version__}"
elif ray.__commit__ == "{{RAY_COMMIT_SHA}}":
    ray_image = "rayproject/ray:nightly"
else:
    ray_image = f"rayproject/ray:{ray.__commit__[:6]}"

# Configs can be set in Configuration class directly or using helper utility
config.load_kube_config()
cli = client.CoreV1Api()

v1 = client.CoreV1Api()
print("Listing pods with their IPs:")
ret = v1.list_pod_for_all_namespaces(watch=False)
for i in ret.items:
    print("%s\t%s\t%s" % (i.status.pod_ip, i.metadata.namespace, i.metadata.name))

yaml_path = pathlib.Path("/tmp/ray_v1alpha1_rayservice.yaml")


def check_kuberay_installed():
    # Make sure the ray namespace exists
    KUBERAY_VERSION = "v0.3.0"
    uri = (
        "github.com/ray-project/kuberay/manifests"
        f"/base?ref={KUBERAY_VERSION}&timeout=90s"
    )
    print(
        subprocess.check_output(
            [
                "kubectl",
                "apply",
                "-k",
                uri,
            ]
        ).decode()
    )
    pods = subprocess.check_output(
        ["kubectl", "get", "pods", "--namespace", "ray-system", "--no-headers"]
    ).decode()
    assert pods.strip() != ""  # the KubeRay operator pods should be listed


def start_rayservice():
    # step-1: generate the yaml file
    print(f"Using ray image: {ray_image}")
    # Indent the scripts so they sit inside the ConfigMaps' block scalars in the template.
    solution = "\n".join(
        [
            f"    {line}"
            for line in pathlib.Path("./solution.py").read_text().splitlines()
        ]
    )
    locustfile = "\n".join(
        [
            f"    {line}"
            for line in pathlib.Path("./locustfile.py").read_text().splitlines()
        ]
    )
    template = (
        pathlib.Path("ray_v1alpha1_rayservice_template.yaml")
        .read_text()
        .format(
            cluster_id=cluster_id,
            ray_image=ray_image,
            solution=solution,
            locustfile=locustfile,
        )
    )

    print("=== YamlFile ===")
    print(template)
    tmp_yaml = pathlib.Path("/tmp/ray_v1alpha1_rayservice.yaml")
    tmp_yaml.write_text(template)

    print("=== Get Pods from ray-system ===")
    print(
        subprocess.check_output(
            ["kubectl", "get", "pods", "--namespace", "ray-system", "--no-headers"]
        ).decode()
    )

    # step-2: create the cluster
    print(f"Creating cluster with id: {cluster_id}")
    print(subprocess.check_output(["kubectl", "create", "-f", str(tmp_yaml)]).decode())

    # step-3: make sure the ray cluster is up
    w = watch.Watch()
    start_time = time.time()
    head_pod_name = None
    for event in w.stream(
        func=cli.list_namespaced_pod,
        namespace="default",
        label_selector=f"rayCluster={ray_cluster_name},ray.io/node-type=head",
        timeout_seconds=60,
    ):
        if event["object"].status.phase == "Running":
            assert event["object"].kind == "Pod"
            head_pod_name = event["object"].metadata.name
            end_time = time.time()
            print(f"{cluster_id} started in {end_time-start_time} sec")
            print(f"head pod {head_pod_name}")
            break
    assert head_pod_name is not None

    # step-4: e2e check it's alive
    cmd = """
import requests
print(requests.get('http://localhost:8000/?val=123').text)
"""
    while True:
        try:
            resp = (
                subprocess.check_output(
                    f'kubectl exec {head_pod_name} -- python -c "{cmd}"', shell=True
                )
                .decode()
                .strip()
            )
            if resp == "375":
                print("Service is up now!")
                break
            else:
                print(f"Failed with msg {resp}")
        except Exception as e:
            print("Error", e)
        time.sleep(2)


def start_port_forward():
    proc = None
    proc = subprocess.Popen(
        [
            "kubectl",
            "port-forward",
            f"svc/{ray_service_name}-serve-svc",
            "8000:8000",
            "--address=0.0.0.0",
        ]
    )

    while True:
        try:
            resp = requests.get(
                "http://localhost:8000/",
                timeout=1,
                params={
                    "val": 10,
                },
            )
            if resp.status_code == 200:
                print("The ray service is ready!!!")
                break
        except requests.exceptions.Timeout:
            pass
        except requests.exceptions.ConnectionError:
            pass
        print("Waiting for the proxy to be alive")
        time.sleep(1)

    return proc


def warmup_cluster(num_reqs):
    for _ in range(num_reqs):
        resp = requests.get(
            "http://localhost:8000/",
            timeout=1,
            params={
                "val": 10,
            },
        )
        assert resp.status_code == 200


def start_sending_traffics(duration, users):
    print("=== Install locust by helm ===")
    yaml_config = (
        pathlib.Path("locust-run.yaml")
        .read_text()
        .format(users=users, cluster_id=cluster_id, duration=int(duration))
    )
    print("=== Locust YAML ===")
    print(yaml_config)

    pathlib.Path("/tmp/locust-run-config.yaml").write_text(yaml_config)
    helm_install_logs = subprocess.check_output(
        [
            "helm",
            "install",
            locust_id,
            "deliveryhero/locust",
            "-f",
            "/tmp/locust-run-config.yaml",
        ]
    )
    print(helm_install_logs)

    proc = subprocess.Popen(
        [
            "kubectl",
            "port-forward",
            f"svc/ray-locust-{cluster_id}",
            "8080:8089",
            "--address=0.0.0.0",
        ]
    )
    return proc


def dump_pods_actors(pod_name):
    print(
        subprocess.run(
            f"kubectl exec {pod_name} -- ps -ef | grep ::",
            shell=True,
            capture_output=True,
        ).stdout.decode()
    )


def kill_header():
    pods = cli.list_namespaced_pod(
        "default",
        label_selector=f"rayCluster={ray_cluster_name},ray.io/node-type=head",
    )
    if pods.items[0].status.phase == "Running":
        print(f"Killing header {pods.items[0].metadata.name}")
        dump_pods_actors(pods.items[0].metadata.name)
        cli.delete_namespaced_pod(pods.items[0].metadata.name, "default")


def kill_worker():
    pods = cli.list_namespaced_pod(
        "default",
        label_selector=f"rayCluster={ray_cluster_name},ray.io/node-type=worker",
    )
    alive_pods = [
        (p.status.start_time, p.metadata.name)
        for p in pods.items
        if p.status.phase == "Running"
    ]
    # sorted(alive_pods)
    # We kill the oldest nodes for now given the memory leak in serve.
    # to_be_killed = alive_pods[-1][1]

    to_be_killed = random.choice(alive_pods)[1]
    print(f"Killing worker {to_be_killed}")
    dump_pods_actors(pods.items[0].metadata.name)
    cli.delete_namespaced_pod(to_be_killed, "default")


def start_killing_nodes(duration, kill_interval, kill_head_every_n):
    """Kill the nodes in the ray cluster.

    duration: How long we run the test (seconds)
    kill_interval: The interval between two kills (seconds)
    kill_head_every_n: For every n kills, we kill a head node
    """

    for kill_idx in range(1, int(duration / kill_interval)):
        while True:
            try:
                # kill
                if kill_idx % kill_head_every_n == 0:
                    kill_header()
                else:
                    kill_worker()
                break
            except Exception as e:
                from time import sleep

                print(f"Failed to kill node, retry in 5 seconds: {e}")
                sleep(5)
        time.sleep(kill_interval)


def get_stats():
    labels = [
        f"app.kubernetes.io/instance=ray-locust-{cluster_id}",
        "app.kubernetes.io/name=locust,component=master",
    ]
    pods = cli.list_namespaced_pod("default", label_selector=",".join(labels))
    assert len(pods.items) == 1
    pod_name = pods.items[0].metadata.name
    subprocess.check_output(
        [
            "kubectl",
            "cp",
            f"{pod_name}:/home/locust/test_result_{cluster_id}_stats_history.csv",
            "./stats_history.csv",
        ]
    )
    data = []
    with open("stats_history.csv") as f:
        import csv

        reader = csv.reader(f)
        for d in reader:
            data.append(d)
    # The first 5mins is for warming up
    offset = 300
    start_time = int(data[offset][0])
    end_time = int(data[-1][0])
    # 17 is the index for total requests
    # 18 is the index for total failed requests
    total = float(data[-1][17]) - float(data[offset][17])
    failures = float(data[-1][18]) - float(data[offset][18])

    # Availability, throughput
    return (total - failures) / total, total / (end_time - start_time)


def main():
    procs = []
    try:
        check_kuberay_installed()
        start_rayservice()
        procs.append(start_port_forward())
        warmup_cluster(200)
        users = 60
        duration = 5 * 60 * 60
        procs.append(start_sending_traffics(duration * 1.1, users))

        start_killing_nodes(duration, 60, 6)

        rate, qps = get_stats()

        print("Result:", rate, qps)
        assert rate > 0.9995
        assert qps > users * 10 * 0.8
    except Exception as e:
        print("Experiment failed")
        raise e
    finally:
        print("=== Cleanup ===")
        subprocess.run(
            ["kubectl", "delete", "-f", str(yaml_path)],
            capture_output=True,
        )
        subprocess.run(
            ["helm", "uninstall", locust_id],
            capture_output=True,
        )
        print("Kill processes")
        for p in procs:
            p.kill()


if __name__ == "__main__":
    try:
        # Connect to ray so that the cluster auto-suspend
        # will not start.
        ray.init("auto")
    except Exception:
        # It doesn't matter if it failed.
        pass
    main()


@@ -0,0 +1,50 @@
from ray import serve
from ray.serve.drivers import DAGDriver
from ray.dag.input_node import InputNode

"""
We are building a DAG like this:
A -> B ----> C
 \-> D --/
  \-> E -/
"""


@serve.deployment
def a(val: int):
    return val


@serve.deployment
def b(val: int):
    return val + 1


@serve.deployment
def c(v1: int, v2: int, v3: int):
    return sum([v1, v2, v3])


@serve.deployment
def d(val):
    return val + 2


@serve.deployment
def e(val):
    return val + 3


with InputNode() as user_input:
    oa = a.bind(user_input)
    ob = b.bind(oa)
    od = d.bind(oa)
    oe = e.bind(oa)
    oc = c.bind(ob, od, oe)


def input_adapter(val: int):
    return val


serve_entrypoint = DAGDriver.bind(oc, http_adapter=input_adapter)
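
Tracing a request through this DAG explains the constants used elsewhere in the test: for an input val, the service returns (val + 1) + (val + 2) + (val + 3). A quick check:

def expected(val: int) -> int:
    # a passes val through; b, d, e add 1, 2, 3 to it; c sums the three branches.
    return (val + 1) + (val + 2) + (val + 3)

assert expected(123) == 375  # the readiness check in run_gcs_ft_on_k8s.py waits for "375"
assert expected(3) == 15     # the Locust task hits /?val=3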


@@ -4643,7 +4643,7 @@
    cluster_compute: compute_tpl.yaml
  run:
    timeout: 1800
    timeout: 28800 # 8h
    prepare: bash prepare.sh
    script: python run_gcs_ft_on_k8s.py
    type: sdk_command