ray/release/k8s_tests/ray_v1alpha1_rayservice_template.yaml
Yi Cheng 4d91f516ca
[nightly] Add serve ha chaos test into nightly test. (#27413)
This PR adds a Serve HA test. The flow of the test is:

1. check the KubeRay build
2. start the Ray service
3. warm up the cluster
4. start killing nodes
5. get the stats and make sure they're good
2022-08-29 16:55:36 -07:00
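Note: the manifest below is a template rather than literal YAML. The {cluster_id},
{locustfile}, {solution}, and {ray_image} tokens look like Python str.format-style
placeholders (the doubled braces in "emptyDir: {{}}" support that reading), and are
presumably filled in by the test harness before the manifest is applied. A minimal
sketch of one rendered stanza, with a made-up cluster_id of "abc123":

kind: ConfigMap
apiVersion: v1
metadata:
  name: locusttest-abc123  # "abc123" is hypothetical, for illustration only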


kind: ConfigMap
apiVersion: v1
metadata:
  name: locusttest-{cluster_id}
data:
  locustfile.py: |
{locustfile}
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: script-{cluster_id}
data:
  solution.py: |
{solution}
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: redis-config-{cluster_id}
  labels:
    app: redis
data:
  redis.conf: |-
    dir /data
    port 6379
    bind 0.0.0.0
    appendonly yes
    protected-mode no
    requirepass 5241590000000000
    pidfile /data/redis-6379.pid
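# This Redis instance appears to back Ray GCS fault tolerance (ray.io/ft-enabled
# below): the head pod's RAY_REDIS_ADDRESS points at the redis-{cluster_id}
# Service on port 6379, and the requirepass value above matches Ray's well-known
# default Redis password, so no extra credential wiring is needed.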
---
apiVersion: v1
kind: Service
metadata:
  name: redis-{cluster_id}
  labels:
    app: redis
spec:
  type: ClusterIP
  ports:
    - name: redis
      port: 6379
  selector:
    app: redis
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis-{cluster_id}
  labels:
    app: redis
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
    spec:
      containers:
        - name: redis
          image: redis:5.0.8
          command:
            - "sh"
            - "-c"
            - "redis-server /usr/local/etc/redis/redis.conf"
          ports:
            - containerPort: 6379
          volumeMounts:
            - name: config
              mountPath: /usr/local/etc/redis/redis.conf
              subPath: redis.conf
      volumes:
        - name: config
          configMap:
            name: redis-config-{cluster_id}
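# Mounting the ConfigMap with subPath drops redis.conf at exactly the path the
# redis-server command reads, without hiding anything else under /usr/local/etc/redis.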
---
apiVersion: ray.io/v1alpha1
kind: RayService
metadata:
  name: service-{cluster_id}
  annotations:
    ray.io/ft-enabled: "true"
spec:
  serviceUnhealthySecondThreshold: 300
  deploymentUnhealthySecondThreshold: 300
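  # Roughly: how long (in seconds) the KubeRay operator tolerates an unhealthy
  # Serve service/deployment status before treating it as failed and recovering.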
  serveConfig:
    importPath: solution.serve_entrypoint
    runtimeEnv: |
      env_vars:
        PYTHONPATH: "/tmp/testing/"
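    # The PYTHONPATH entry makes /tmp/testing/solution.py (mounted from the script
    # ConfigMap below) importable as the solution module named in importPath.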
    deployments:
      - name: a
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: b
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: c
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: d
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: e
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: DAGDriver
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
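    # Back-of-the-envelope capacity: 6 deployments x 6 replicas x 1 CPU = 36 CPUs
    # of Serve replicas, against 12 workers x 4 CPUs = 48 CPUs below (the head
    # advertises num-cpus: '0'), leaving slack while chaos testing kills nodes.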
  rayClusterConfig:
    rayVersion: '3.0.0.dev0' # should match the Ray version in the image of the containers
    ######################headGroupSpecs#################################
    # head group template and specs (perhaps 'group' is not needed in the name)
    headGroupSpec:
      # Kubernetes Service Type; valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
      serviceType: ClusterIP
      # the pod replicas in this group typed head (assuming there could be more than 1 in the future)
      replicas: 1
      # logical group name, for this called head-group, also can be functional
      # pod type head or worker
      # rayNodeType: head # not needed since it is under the headGroupSpec
      # the following params are used to complete the ray start: ray start --head --block --port=6379 ...
      rayStartParams:
        port: '6379' # should match the container port named gcs-server
        object-store-memory: '100000000'
        dashboard-host: '0.0.0.0'
        num-cpus: '0' # can be auto-completed from the limits
        node-ip-address: $MY_POD_IP # auto-completed as the head pod IP
        block: 'true'
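      # Illustratively, the completed command should look something like:
      #   ray start --head --block --port=6379 --object-store-memory=100000000 \
      #     --dashboard-host=0.0.0.0 --num-cpus=0 --node-ip-address=$MY_POD_IP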
      # pod template
      template:
        metadata:
          labels:
            rayCluster: cluster-{cluster_id}
            rayNodeType: head # will be injected if missing, must be head or worker
            groupName: headgroup # will be injected if missing
          # annotations for pod
          annotations:
            key: value
        spec:
          volumes:
            - name: script
              configMap:
                name: script-{cluster_id}
            - name: log-volume
              emptyDir: {{}}
          containers:
            - name: ray-head
              image: {ray_image}
              imagePullPolicy: Always
              env:
                - name: MY_POD_IP
                  valueFrom:
                    fieldRef:
                      fieldPath: status.podIP
                - name: RAY_REDIS_ADDRESS
                  value: redis-{cluster_id}:6379
                - name: RAY_gcs_rpc_server_reconnect_timeout_s
                  value: "600"
                - name: RAY_num_heartbeats_timeout
                  value: "120"
                - name: RAY_gcs_failover_worker_reconnect_timeout
                  value: "600"
              resources:
                limits:
                  cpu: 2
                requests:
                  cpu: 2
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265 # Ray dashboard
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              volumeMounts:
                - name: script
                  mountPath: /tmp/testing/solution.py
                  subPath: solution.py
                - mountPath: /tmp/ray/
                  name: log-volume
    workerGroupSpecs:
      # the pod replicas in this group typed worker
      - replicas: 12
        minReplicas: 12
        maxReplicas: 12
        # logical group name, for this called small-group, also can be functional
        groupName: small-group
        # if worker pods need to be added, we can simply increment the replicas
        # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
        # the operator will remove pods from the list until the number of replicas is satisfied
        # when a pod is confirmed to be deleted, its name will be removed from the list below
        # scaleStrategy:
        #   workersToDelete:
        #     - raycluster-complete-worker-small-group-bdtwh
        #     - raycluster-complete-worker-small-group-hv457
        #     - raycluster-complete-worker-small-group-k8tj7
        # the following params are used to complete the ray start: ray start --block --node-ip-address= ...
        rayStartParams:
          node-ip-address: $MY_POD_IP
          block: 'true'
          num-cpus: '4' # can be auto-completed from the limits
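        # Illustratively, each worker should end up running something like:
        #   ray start --block --node-ip-address=$MY_POD_IP --num-cpus=4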
        # pod template
        template:
          metadata:
            labels:
              key: value
              rayCluster: cluster-{cluster_id}
            # annotations for pod
            annotations:
              key: value
          spec:
            initContainers:
              # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
              - name: init-myservice
                image: busybox:1.28
                command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
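            # For example, with a hypothetical RAY_IP of service-abc123-head-svc in
            # namespace default, the loop waits until
            # service-abc123-head-svc.default.svc.cluster.local resolves, i.e. until
            # the head Service exists, before the Ray worker container starts.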
            volumes:
              - name: script
                configMap:
                  name: script-{cluster_id}
              - name: log-volume
                emptyDir: {{}}
            containers:
              - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name' or '123-abc')
                image: {ray_image}
                imagePullPolicy: Always
                livenessProbe:
                  initialDelaySeconds: 30
                  periodSeconds: 5
                  timeoutSeconds: 10
                readinessProbe:
                  initialDelaySeconds: 30
                  periodSeconds: 5
                  timeoutSeconds: 10
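                # Only probe timings are set here; the probe handlers themselves are
                # presumably injected by the KubeRay operator (a bare probe with no
                # handler would otherwise be rejected by the Kubernetes API).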
                # environment variables to set in the container. Optional.
                # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
                env:
                  - name: RAY_DISABLE_DOCKER_CPU_WARNING
                    value: "1"
                  - name: TYPE
                    value: "worker"
                  - name: CPU_REQUEST
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: requests.cpu
                  - name: CPU_LIMITS
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: limits.cpu
                  - name: MEMORY_LIMITS
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: limits.memory
                  - name: MEMORY_REQUESTS
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: requests.memory
                  - name: MY_POD_NAME
                    valueFrom:
                      fieldRef:
                        fieldPath: metadata.name
                  - name: MY_POD_IP
                    valueFrom:
                      fieldRef:
                        fieldPath: status.podIP
                  - name: RAY_gcs_rpc_server_reconnect_timeout_s
                    value: "600"
                  - name: RAY_num_heartbeats_timeout
                    value: "120"
                  - name: RAY_gcs_failover_worker_reconnect_timeout
                    value: "600"
                  - name: RAY_gcs_server_request_timeout_seconds
                    value: "5"
                ports:
                  - containerPort: 80
                    name: client
                lifecycle:
                  preStop:
                    exec:
                      command: ["/bin/sh","-c","ray stop"]
                resources:
                  limits:
                    cpu: "2"
                  requests:
                    cpu: "2"
                volumeMounts:
                  - name: script
                    mountPath: /tmp/testing/solution.py
                    subPath: solution.py
                  - mountPath: /tmp/ray/
                    name: log-volume
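# Once the placeholders are filled in, the rendered manifest can presumably be
# applied in the usual way, e.g. "kubectl apply -f rendered.yaml" (illustrative
# invocation; the nightly test harness drives this itself).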