Mirror of https://github.com/vale981/ray, synced 2025-03-04 17:41:43 -05:00
[nightly] Add serve ha chaos test into nightly test. (#27413)
This PR adds a Serve HA chaos test. The flow of the test is: 1. check the KubeRay build, 2. start the Ray service, 3. warm up the cluster, 4. start killing nodes, 5. collect the stats and make sure they look good.
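For orientation, here is a condensed sketch of how those five steps map onto the driver script in this diff (the release test runs it as run_gcs_ft_on_k8s.py). The import below is hypothetical and for illustration only; the real entry point is the script's own main() shown later in this commit.

import run_gcs_ft_on_k8s as chaos  # hypothetical import; the module also runs setup code on import

def run_chaos_test():
    chaos.check_kuberay_installed()             # 1. verify the KubeRay operator / build
    chaos.start_rayservice()                    # 2. render the RayService YAML and kubectl create it
    proxy = chaos.start_port_forward()
    chaos.warmup_cluster(200)                   # 3. warm up with 200 requests through the proxy
    users, duration = 60, 5 * 60 * 60
    locust = chaos.start_sending_traffics(duration * 1.1, users)
    chaos.start_killing_nodes(duration, 60, 6)  # 4. kill a pod every 60 s, the head on every 6th kill
    rate, qps = chaos.get_stats()               # 5. availability and throughput from the locust CSV
    assert rate > 0.9995 and qps > users * 10 * 0.8
    for p in (proxy, locust):
        p.kill()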
Parent: 8934a8d32b
Commit: 4d91f516ca
8 changed files with 782 additions and 13 deletions
@@ -20,3 +20,5 @@ post_build_cmds:
  - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -
  - sudo apt-get update && sudo apt-get install google-cloud-cli
  - sudo apt-get install google-cloud-sdk-gke-gcloud-auth-plugin
  - curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
  - helm repo add deliveryhero https://charts.deliveryhero.io/
@@ -22,8 +22,3 @@ aws:
          Value: '{{env["ANYSCALE_USER"]}}'
        - Key: anyscale-expiration
          Value: '{{env["EXPIRATION_2D"]}}'

  BlockDeviceMappings:
    - DeviceName: /dev/sda1
      Ebs:
        VolumeSize: 200
release/k8s_tests/locust-run.yaml (new file, 19 lines)
@@ -0,0 +1,19 @@
loadtest:
  name: serve_ha_{cluster_id}
  locust_host: "http://service-{cluster_id}-serve-svc:8000"
  locust_locustfile_configmap: locusttest-{cluster_id}
  locust_lib_configmap: locusttest-{cluster_id}
  locust_locustfile: locustfile.py
worker:
  replicas: 4
master:
  args:
    - "--spawn-rate=20"
    - "--users={users}"
    - "--autostart"
    - "--run-time={duration}s"
    - "--csv=test_result_{cluster_id}"
    - "--print-stats"
    - "--csv-full-history"
    - "--reset-stats"
    - "--enable-rebalancing"
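How this values file gets used: the driver script fills the {users}, {duration}, and {cluster_id} placeholders with str.format and feeds the result to helm install against the deliveryhero/locust chart (see start_sending_traffics further down). A minimal sketch, with a hypothetical cluster id for illustration:

import pathlib

values = pathlib.Path("locust-run.yaml").read_text().format(
    users=60,                          # the nightly test runs 60 locust users
    duration=int(5 * 60 * 60 * 1.1),   # locust runs ~10% longer than the kill phase
    cluster_id="abc123",               # hypothetical; the real id is the first block of a uuid4
)
pathlib.Path("/tmp/locust-run-config.yaml").write_text(values)
# then: helm install ray-locust-abc123 deliveryhero/locust -f /tmp/locust-run-config.yaml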
release/k8s_tests/locustfile.py (new file, 33 lines)
@@ -0,0 +1,33 @@
import locust
from locust import task, HttpUser
import logging
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

logger = logging.getLogger(__name__)


class TimeoutHTTPAdapter(HTTPAdapter):
    def __init__(self, timeout, *args, **kwargs):
        self.timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        timeout = kwargs.get("timeout")
        if timeout is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)


class ServeTest(HttpUser):
    wait_time = locust.wait_time.constant_throughput(10)

    def on_start(self):
        retries = Retry(
            total=3, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504]
        )
        self.client.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=1))

    @task
    def index(self):
        self.client.get("/?val=3")
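The locustfile above pins each simulated user to constant_throughput(10), i.e. roughly 10 requests per second per user, with a 1-second timeout and up to 3 retries per request. A small back-of-the-envelope check of the offered load under the settings used by this test (the 60-user figure comes from the driver script, not from this file):

USERS = 60            # --users={users} is rendered to 60 by the driver script
PER_USER_RPS = 10     # locust.wait_time.constant_throughput(10)
offered_qps = USERS * PER_USER_RPS   # ~600 req/s against the Serve endpoint
passing_qps = offered_qps * 0.8      # the test later asserts measured qps > 480
print(offered_qps, passing_qps)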
release/k8s_tests/ray_v1alpha1_rayservice_template.yaml (new file, 309 lines)
@@ -0,0 +1,309 @@
kind: ConfigMap
apiVersion: v1
metadata:
  name: locusttest-{cluster_id}
data:
  locustfile.py: |
{locustfile}
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: script-{cluster_id}
data:
  solution.py: |
{solution}
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: redis-config-{cluster_id}
  labels:
    app: redis
data:
  redis.conf: |-
    dir /data
    port 6379
    bind 0.0.0.0
    appendonly yes
    protected-mode no
    requirepass 5241590000000000
    pidfile /data/redis-6379.pid
---
apiVersion: v1
kind: Service
metadata:
  name: redis-{cluster_id}
  labels:
    app: redis
spec:
  type: ClusterIP
  ports:
    - name: redis
      port: 6379
  selector:
    app: redis
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis-{cluster_id}
  labels:
    app: redis
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
    spec:
      containers:
        - name: redis
          image: redis:5.0.8
          command:
            - "sh"
            - "-c"
            - "redis-server /usr/local/etc/redis/redis.conf"
          ports:
            - containerPort: 6379
          volumeMounts:
            - name: config
              mountPath: /usr/local/etc/redis/redis.conf
              subPath: redis.conf
      volumes:
        - name: config
          configMap:
            name: redis-config-{cluster_id}
---
apiVersion: ray.io/v1alpha1
kind: RayService
metadata:
  name: service-{cluster_id}
  annotations:
    ray.io/ft-enabled: "true"
spec:
  serviceUnhealthySecondThreshold: 300
  deploymentUnhealthySecondThreshold: 300
  serveConfig:
    importPath: solution.serve_entrypoint
    runtimeEnv: |
      env_vars:
        PYTHONPATH: "/tmp/testing/"
    deployments:
      - name: a
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: b
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: c
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: d
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: e
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: DAGDriver
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
  rayClusterConfig:
    rayVersion: '3.0.0.dev0' # should match the Ray version in the image of the containers
    ######################headGroupSpecs#################################
    # head group template and specs, (perhaps 'group' is not needed in the name)
    headGroupSpec:
      # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
      serviceType: ClusterIP
      # the pod replicas in this group typed head (assuming there could be more than 1 in the future)
      replicas: 1
      # logical group name, for this called head-group, also can be functional
      # pod type head or worker
      # rayNodeType: head # Not needed since it is under the headgroup
      # the following params are used to complete the ray start: ray start --head --block --redis-port=6379 ...
      rayStartParams:
        port: '6379' # should match container port named gcs-server
        object-store-memory: '100000000'
        dashboard-host: '0.0.0.0'
        num-cpus: '0' # can be auto-completed from the limits
        node-ip-address: $MY_POD_IP # auto-completed as the head pod IP
        block: 'true'
      #pod template
      template:
        metadata:
          labels:
            rayCluster: cluster-{cluster_id}
            rayNodeType: head # will be injected if missing, must be head or worker
            groupName: headgroup # will be injected if missing
          # annotations for pod
          annotations:
            key: value
        spec:
          volumes:
            - name: script
              configMap:
                name: script-{cluster_id}
            - name: log-volume
              emptyDir: {{}}
          containers:
            - name: ray-head
              image: {ray_image}
              imagePullPolicy: Always
              env:
                - name: MY_POD_IP
                  valueFrom:
                    fieldRef:
                      fieldPath: status.podIP
                - name: RAY_REDIS_ADDRESS
                  value: redis-{cluster_id}:6379
                - name: RAY_gcs_rpc_server_reconnect_timeout_s
                  value: "600"
                - name: RAY_num_heartbeats_timeout
                  value: "120"
                - name: RAY_gcs_failover_worker_reconnect_timeout
                  value: "600"
              resources:
                limits:
                  cpu: 2
                requests:
                  cpu: 2
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265 # Ray dashboard
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              volumeMounts:
                - name: script
                  mountPath: /tmp/testing/solution.py
                  subPath: solution.py
                - mountPath: /tmp/ray/
                  name: log-volume
    workerGroupSpecs:
      # the pod replicas in this group typed worker
      - replicas: 12
        minReplicas: 12
        maxReplicas: 12
        # logical group name, for this called small-group, also can be functional
        groupName: small-group
        # if worker pods need to be added, we can simply increment the replicas
        # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
        # the operator will remove pods from the list until the number of replicas is satisfied
        # when a pod is confirmed to be deleted, its name will be removed from the list below
        #scaleStrategy:
        #  workersToDelete:
        #  - raycluster-complete-worker-small-group-bdtwh
        #  - raycluster-complete-worker-small-group-hv457
        #  - raycluster-complete-worker-small-group-k8tj7
        # the following params are used to complete the ray start: ray start --block --node-ip-address= ...
        rayStartParams:
          node-ip-address: $MY_POD_IP
          block: 'true'
          num-cpus: '4' # can be auto-completed from the limits
        #pod template
        template:
          metadata:
            labels:
              key: value
              rayCluster: cluster-{cluster_id}
            # annotations for pod
            annotations:
              key: value
          spec:
            initContainers:
              # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
              - name: init-myservice
                image: busybox:1.28
                command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
            volumes:
              - name: script
                configMap:
                  name: script-{cluster_id}
              - name: log-volume
                emptyDir: {{}}
            containers:
              - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
                image: {ray_image}
                imagePullPolicy: Always
                livenessProbe:
                  initialDelaySeconds: 30
                  periodSeconds: 5
                  timeoutSeconds: 10
                readinessProbe:
                  initialDelaySeconds: 30
                  periodSeconds: 5
                  timeoutSeconds: 10

                # environment variables to set in the container. Optional.
                # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
                env:
                  - name: RAY_DISABLE_DOCKER_CPU_WARNING
                    value: "1"
                  - name: TYPE
                    value: "worker"
                  - name: CPU_REQUEST
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: requests.cpu
                  - name: CPU_LIMITS
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: limits.cpu
                  - name: MEMORY_LIMITS
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: limits.memory
                  - name: MEMORY_REQUESTS
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: requests.memory
                  - name: MY_POD_NAME
                    valueFrom:
                      fieldRef:
                        fieldPath: metadata.name
                  - name: MY_POD_IP
                    valueFrom:
                      fieldRef:
                        fieldPath: status.podIP
                  - name: RAY_gcs_rpc_server_reconnect_timeout_s
                    value: "600"
                  - name: RAY_num_heartbeats_timeout
                    value: "120"
                  - name: RAY_gcs_failover_worker_reconnect_timeout
                    value: "600"
                  - name: RAY_gcs_server_request_timeout_seconds
                    value: "5"
                ports:
                  - containerPort: 80
                    name: client
                lifecycle:
                  preStop:
                    exec:
                      command: ["/bin/sh","-c","ray stop"]
                resources:
                  limits:
                    cpu: "2"
                  requests:
                    cpu: "2"
                volumeMounts:
                  - name: script
                    mountPath: /tmp/testing/solution.py
                    subPath: solution.py
                  - mountPath: /tmp/ray/
                    name: log-volume
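A note on the template syntax: this file is not applied directly; it is rendered with Python's str.format (see start_rayservice below), which is why single-brace tokens like {cluster_id} and {ray_image} are placeholders while emptyDir: {{}} uses doubled braces so that a literal {} survives formatting. A minimal rendering sketch with hypothetical values:

import pathlib

template = pathlib.Path("ray_v1alpha1_rayservice_template.yaml").read_text()
rendered = template.format(
    cluster_id="abc123",                  # hypothetical; the real id is the first block of a uuid4
    ray_image="rayproject/ray:nightly",
    solution="...solution.py contents, indented to sit under the ConfigMap's | block...",
    locustfile="...locustfile.py contents, indented the same way...",
)
assert "emptyDir: {}" in rendered         # doubled braces collapsed to a literal {}
pathlib.Path("/tmp/ray_v1alpha1_rayservice.yaml").write_text(rendered)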
@@ -1,10 +1,371 @@
from kubernetes import client, config
import subprocess
from kubernetes import client, config, watch
import requests
import random
import uuid
import pathlib
import time
import ray
import os

# global variables for the cluster information
cluster_id = str(uuid.uuid4()).split("-")[0]
ray_cluster_name = "cluster-" + cluster_id
ray_service_name = "service-" + cluster_id
locust_id = "ray-locust-" + cluster_id


if os.environ.get("RAY_IMAGE") is not None:
    ray_image = os.environ.get("RAY_IMAGE")
elif ray.__version__ != "3.0.0.dev0":
    ray_image = f"rayproject/ray:{ray.__version__}"
elif ray.__commit__ == "{{RAY_COMMIT_SHA}}":
    ray_image = "rayproject/ray:nightly"
else:
    ray_image = f"rayproject/ray:{ray.__commit__[:6]}"

# Configs can be set in Configuration class directly or using helper utility
config.load_kube_config()
cli = client.CoreV1Api()

v1 = client.CoreV1Api()
print("Listing pods with their IPs:")
ret = v1.list_pod_for_all_namespaces(watch=False)
for i in ret.items:
    print("%s\t%s\t%s" % (i.status.pod_ip, i.metadata.namespace, i.metadata.name))
yaml_path = pathlib.Path("/tmp/ray_v1alpha1_rayservice.yaml")


def check_kuberay_installed():
    # Make sure the ray namespace exists
    KUBERAY_VERSION = "v0.3.0"
    uri = (
        "github.com/ray-project/kuberay/manifests"
        f"/base?ref={KUBERAY_VERSION}&timeout=90s"
    )
    print(
        subprocess.check_output(
            [
                "kubectl",
                "apply",
                "-k",
                uri,
            ]
        ).decode()
    )
    pods = subprocess.check_output(
        ["kubectl", "get", "pods", "--namespace", "ray-system", "--no-headers"]
    ).decode()
    assert len(pods.split("\n")) != 0


def start_rayservice():
    # step-1: generate the yaml file
    print(f"Using ray image: {ray_image}")
    solution = "\n".join(
        [
            f" {line}"
            for line in pathlib.Path("./solution.py").read_text().splitlines()
        ]
    )
    locustfile = "\n".join(
        [
            f" {line}"
            for line in pathlib.Path("./locustfile.py").read_text().splitlines()
        ]
    )
    template = (
        pathlib.Path("ray_v1alpha1_rayservice_template.yaml")
        .read_text()
        .format(
            cluster_id=cluster_id,
            ray_image=ray_image,
            solution=solution,
            locustfile=locustfile,
        )
    )

    print("=== YamlFile ===")
    print(template)
    tmp_yaml = pathlib.Path("/tmp/ray_v1alpha1_rayservice.yaml")
    tmp_yaml.write_text(template)

    print("=== Get Pods from ray-system ===")
    print(
        subprocess.check_output(
            ["kubectl", "get", "pods", "--namespace", "ray-system", "--no-headers"]
        ).decode()
    )

    # step-2: create the cluster
    print(f"Creating cluster with id: {cluster_id}")
    print(subprocess.check_output(["kubectl", "create", "-f", str(tmp_yaml)]).decode())

    # step-3: make sure the ray cluster is up
    w = watch.Watch()
    start_time = time.time()
    head_pod_name = None
    for event in w.stream(
        func=cli.list_namespaced_pod,
        namespace="default",
        label_selector=f"rayCluster={ray_cluster_name},ray.io/node-type=head",
        timeout_seconds=60,
    ):
        if event["object"].status.phase == "Running":
            assert event["object"].kind == "Pod"
            head_pod_name = event["object"].metadata.name
            end_time = time.time()
            print(f"{cluster_id} started in {end_time-start_time} sec")
            print(f"head pod {head_pod_name}")
            break
    assert head_pod_name is not None
    # step-4: e2e check it's alive
    cmd = """
import requests
print(requests.get('http://localhost:8000/?val=123').text)
"""
    while True:
        try:
            resp = (
                subprocess.check_output(
                    f'kubectl exec {head_pod_name} -- python -c "{cmd}"', shell=True
                )
                .decode()
                .strip()
            )
            if resp == "375":
                print("Service is up now!")
                break
            else:
                print(f"Failed with msg {resp}")
        except Exception as e:
            print("Error", e)
        time.sleep(2)


def start_port_forward():
    proc = None
    proc = subprocess.Popen(
        [
            "kubectl",
            "port-forward",
            f"svc/{ray_service_name}-serve-svc",
            "8000:8000",
            "--address=0.0.0.0",
        ]
    )

    while True:
        try:
            resp = requests.get(
                "http://localhost:8000/",
                timeout=1,
                params={
                    "val": 10,
                },
            )
            if resp.status_code == 200:
                print("The ray service is ready!!!")
                break
        except requests.exceptions.Timeout:
            pass
        except requests.exceptions.ConnectionError:
            pass

        print("Waiting for the proxy to be alive")
        time.sleep(1)

    return proc


def warmup_cluster(num_reqs):
    for _ in range(num_reqs):
        resp = requests.get(
            "http://localhost:8000/",
            timeout=1,
            params={
                "val": 10,
            },
        )
        assert resp.status_code == 200


def start_sending_traffics(duration, users):
    print("=== Install locust by helm ===")
    yaml_config = (
        pathlib.Path("locust-run.yaml")
        .read_text()
        .format(users=users, cluster_id=cluster_id, duration=int(duration))
    )
    print("=== Locust YAML ===")
    print(yaml_config)

    pathlib.Path("/tmp/locust-run-config.yaml").write_text(yaml_config)
    helm_install_logs = subprocess.check_output(
        [
            "helm",
            "install",
            locust_id,
            "deliveryhero/locust",
            "-f",
            "/tmp/locust-run-config.yaml",
        ]
    )
    print(helm_install_logs)

    proc = subprocess.Popen(
        [
            "kubectl",
            "port-forward",
            f"svc/ray-locust-{cluster_id}",
            "8080:8089",
            "--address=0.0.0.0",
        ]
    )
    return proc


def dump_pods_actors(pod_name):
    print(
        subprocess.run(
            f"kubectl exec {pod_name} -- ps -ef | grep ::",
            shell=True,
            capture_output=True,
        ).stdout.decode()
    )


def kill_header():
    pods = cli.list_namespaced_pod(
        "default",
        label_selector=f"rayCluster={ray_cluster_name},ray.io/node-type=head",
    )
    if pods.items[0].status.phase == "Running":
        print(f"Killing header {pods.items[0].metadata.name}")
        dump_pods_actors(pods.items[0].metadata.name)
        cli.delete_namespaced_pod(pods.items[0].metadata.name, "default")


def kill_worker():
    pods = cli.list_namespaced_pod(
        "default",
        label_selector=f"rayCluster={ray_cluster_name},ray.io/node-type=worker",
    )
    alive_pods = [
        (p.status.start_time, p.metadata.name)
        for p in pods.items
        if p.status.phase == "Running"
    ]
    # sorted(alive_pods)
    # We kill the oldest nodes for now given the memory leak in serve.
    # to_be_killed = alive_pods[-1][1]

    to_be_killed = random.choice(alive_pods)[1]
    print(f"Killing worker {to_be_killed}")
    dump_pods_actors(pods.items[0].metadata.name)
    cli.delete_namespaced_pod(to_be_killed, "default")


def start_killing_nodes(duration, kill_interval, kill_head_every_n):
    """Kill the nodes in the ray cluster.

    duration: How long we run the test (seconds)
    kill_interval: The interval between two kills (seconds)
    kill_head_every_n: For every n kills, we kill a head node
    """

    for kill_idx in range(1, int(duration / kill_interval)):
        while True:
            try:
                # kill
                if kill_idx % kill_head_every_n == 0:
                    kill_header()
                else:
                    kill_worker()
                break
            except Exception as e:
                from time import sleep

                print(f"Failed to kill node, retry in 5 seconds: {e}")
                sleep(5)

        time.sleep(kill_interval)


def get_stats():
    labels = [
        f"app.kubernetes.io/instance=ray-locust-{cluster_id}",
        "app.kubernetes.io/name=locust,component=master",
    ]
    pods = cli.list_namespaced_pod("default", label_selector=",".join(labels))
    assert len(pods.items) == 1
    pod_name = pods.items[0].metadata.name
    subprocess.check_output(
        [
            "kubectl",
            "cp",
            f"{pod_name}:/home/locust/test_result_{cluster_id}_stats_history.csv",
            "./stats_history.csv",
        ]
    )
    data = []
    with open("stats_history.csv") as f:
        import csv

        reader = csv.reader(f)
        for d in reader:
            data.append(d)
    # The first 5 mins is for warming up
    offset = 300
    start_time = int(data[offset][0])
    end_time = int(data[-1][0])
    # 17 is the index for total requests
    # 18 is the index for total failed requests
    total = float(data[-1][17]) - float(data[offset][17])
    failures = float(data[-1][18]) - float(data[offset][18])

    # Availability, throughput
    return (total - failures) / total, total / (end_time - start_time)


def main():
    procs = []
    try:
        check_kuberay_installed()
        start_rayservice()
        procs.append(start_port_forward())
        warmup_cluster(200)
        users = 60
        duration = 5 * 60 * 60
        procs.append(start_sending_traffics(duration * 1.1, users))
        start_killing_nodes(duration, 60, 6)
        rate, qps = get_stats()

        print("Result:", rate, qps)
        assert rate > 0.9995
        assert qps > users * 10 * 0.8

    except Exception as e:
        print("Experiment failed")
        raise e
    finally:
        print("=== Cleanup ===")

        subprocess.run(
            ["kubectl", "delete", "-f", str(yaml_path)],
            capture_output=True,
        )

        subprocess.run(
            ["helm", "uninstall", locust_id],
            capture_output=True,
        )

        print("Kill processes")
        for p in procs:
            p.kill()


if __name__ == "__main__":
    try:
        # Connect to ray so that the cluster's auto-suspend
        # will not start.
        ray.init("auto")
    except Exception:
        # It doesn't matter if it failed.
        pass
    main()
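For reference, the numbers behind the assertions in main(): 60 users at constant_throughput(10) offer roughly 600 req/s, the 0.9995 availability bar allows at most 0.05% failed requests after the 5-minute warm-up window, and a 5-hour kill phase at one kill per minute amounts to about 300 pod kills, every 6th of which targets the head pod. As arithmetic:

users = 60
offered_qps = users * 10            # constant_throughput(10) per user -> ~600 req/s
min_qps = offered_qps * 0.8         # main() asserts measured qps > 480
max_failure_rate = 1 - 0.9995       # at most 0.05% of requests may fail
duration = 5 * 60 * 60
kills = duration // 60              # ~300 kills, head pod on every 6th
print(min_qps, max_failure_rate, kills)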
release/k8s_tests/solution.py (new file, 50 lines)
@@ -0,0 +1,50 @@
from ray import serve
from ray.serve.drivers import DAGDriver
from ray.dag.input_node import InputNode

"""
We are building a DAG like this:
A -> B ----> C
 \-> D --/
  \-> E -/
"""


@serve.deployment
def a(val: int):
    return val


@serve.deployment
def b(val: int):
    return val + 1


@serve.deployment
def c(v1: int, v2: int, v3: int):
    return sum([v1, v2, v3])


@serve.deployment
def d(val):
    return val + 2


@serve.deployment
def e(val):
    return val + 3


with InputNode() as user_input:
    oa = a.bind(user_input)
    ob = b.bind(oa)
    od = d.bind(oa)
    oe = e.bind(oa)
    oc = c.bind(ob, od, oe)


def input_adapter(val: int):
    return val


serve_entrypoint = DAGDriver.bind(oc, http_adapter=input_adapter)
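The DAG above is simple arithmetic: a returns val, b/d/e add 1/2/3 to it, and c sums those three, so the service returns 3*val + 6. That is why the readiness check in the driver script expects "375" for /?val=123, and why the locustfile's /?val=3 requests should come back as 15. A tiny reference check:

def dag_reference(val: int) -> int:
    # a=val, b=val+1, d=val+2, e=val+3, c=b+d+e
    return 3 * val + 6

assert dag_reference(123) == 375
assert dag_reference(3) == 15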
@@ -4643,7 +4643,7 @@
   cluster_compute: compute_tpl.yaml

   run:
-    timeout: 1800
+    timeout: 28800 # 8h
     prepare: bash prepare.sh
     script: python run_gcs_ft_on_k8s.py
     type: sdk_command