mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00

This PR adds a Serve HA (high-availability) test. The flow of the test is:

1. Check the KubeRay build.
2. Start the Ray service.
3. Warm up the cluster.
4. Start killing nodes.
5. Collect the stats and verify they meet expectations.
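Below is a rough Python sketch of that flow. The helper names (kubectl, kill_random_worker_pod, run_chaos_round), the endpoint URL, and the failure-rate threshold are illustrative assumptions, not the actual test code in this PR; in practice the load and the final stats come from Locust via the locusttest ConfigMap in the manifest below, which the plain requests loop here only stands in for.

# Illustrative sketch only; helper names, URL, and threshold are hypothetical.
import random
import subprocess
import time

import requests

SERVE_URL = "http://localhost:8000/"  # assumes a port-forward to the Serve service


def kubectl(*args: str) -> str:
    """Run a kubectl command and return its stdout."""
    return subprocess.check_output(["kubectl", *args], text=True)


def warm_up(num_requests: int = 100) -> None:
    """Step 3: send traffic until the Serve deployments answer successfully."""
    for _ in range(num_requests):
        requests.get(SERVE_URL, timeout=10).raise_for_status()


def kill_random_worker_pod(cluster_id: str) -> None:
    """Step 4: delete one Ray worker pod to simulate a node failure."""
    pods = kubectl(
        "get", "pods",
        "-l", f"rayCluster=cluster-{cluster_id}",
        "-o", "jsonpath={.items[*].metadata.name}",
    ).split()
    workers = [p for p in pods if "head" not in p]
    kubectl("delete", "pod", random.choice(workers))


def run_chaos_round(cluster_id: str, duration_s: int = 300) -> float:
    """Steps 4-5: kill pods while sending traffic, return the failure rate."""
    failures = total = 0
    deadline = time.time() + duration_s
    while time.time() < deadline:
        kill_random_worker_pod(cluster_id)
        for _ in range(20):
            total += 1
            try:
                requests.get(SERVE_URL, timeout=10).raise_for_status()
            except requests.RequestException:
                failures += 1
        time.sleep(10)
    return failures / total


if __name__ == "__main__":
    warm_up()
    failure_rate = run_chaos_round(cluster_id="abc123")  # cluster_id is hypothetical
    assert failure_rate < 0.01, f"too many failed requests: {failure_rate:.2%}"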
309 lines · 10 KiB · YAML
kind: ConfigMap
apiVersion: v1
metadata:
  name: locusttest-{cluster_id}
data:
  locustfile.py: |
{locustfile}
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: script-{cluster_id}
data:
  solution.py: |
{solution}
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: redis-config-{cluster_id}
  labels:
    app: redis
data:
  redis.conf: |-
    dir /data
    port 6379
    bind 0.0.0.0
    appendonly yes
    protected-mode no
    requirepass 5241590000000000
    pidfile /data/redis-6379.pid
---
apiVersion: v1
kind: Service
metadata:
  name: redis-{cluster_id}
  labels:
    app: redis
spec:
  type: ClusterIP
  ports:
    - name: redis
      port: 6379
  selector:
    app: redis
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis-{cluster_id}
  labels:
    app: redis
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
    spec:
      containers:
        - name: redis
          image: redis:5.0.8
          command:
            - "sh"
            - "-c"
            - "redis-server /usr/local/etc/redis/redis.conf"
          ports:
            - containerPort: 6379
          volumeMounts:
            - name: config
              mountPath: /usr/local/etc/redis/redis.conf
              subPath: redis.conf
      volumes:
        - name: config
          configMap:
            name: redis-config-{cluster_id}
---
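# NOTE: the Redis deployment above is the external store backing Ray GCS fault
# tolerance for the RayService below: the head pod points at it through the
# RAY_REDIS_ADDRESS env var, and the ray.io/ft-enabled annotation turns the
# feature on, so cluster state can be recovered after a head/GCS restart.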
apiVersion: ray.io/v1alpha1
kind: RayService
metadata:
  name: service-{cluster_id}
  annotations:
    ray.io/ft-enabled: "true"
spec:
  serviceUnhealthySecondThreshold: 300
  deploymentUnhealthySecondThreshold: 300
  serveConfig:
    importPath: solution.serve_entrypoint
    runtimeEnv: |
      env_vars:
        PYTHONPATH: "/tmp/testing/"
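    # solution.py comes from the script ConfigMap above: it is mounted at
    # /tmp/testing/solution.py in the head and worker pods below, and the
    # PYTHONPATH entry above makes `solution.serve_entrypoint` importable.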
    deployments:
      - name: a
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: b
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: c
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: d
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: e
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: DAGDriver
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
  rayClusterConfig:
    rayVersion: '3.0.0.dev0' # should match the Ray version in the image of the containers
    ######################headGroupSpecs#################################
    # head group template and specs, (perhaps 'group' is not needed in the name)
    headGroupSpec:
      # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
      serviceType: ClusterIP
      # the pod replicas in this group typed head (assuming there could be more than 1 in the future)
      replicas: 1
      # logical group name, for this called head-group, also can be functional
      # pod type head or worker
      # rayNodeType: head # Not needed since it is under the headgroup
      # the following params are used to complete the ray start: ray start --head --block --redis-port=6379 ...
      rayStartParams:
        port: '6379' # should match container port named gcs-server
        object-store-memory: '100000000'
        dashboard-host: '0.0.0.0'
        num-cpus: '0' # can be auto-completed from the limits
        node-ip-address: $MY_POD_IP # auto-completed as the head pod IP
        block: 'true'
      # pod template
      template:
        metadata:
          labels:
            rayCluster: cluster-{cluster_id}
            rayNodeType: head # will be injected if missing, must be head or worker
            groupName: headgroup # will be injected if missing
          # annotations for pod
          annotations:
            key: value
        spec:
          volumes:
            - name: script
              configMap:
                name: script-{cluster_id}
            - name: log-volume
              emptyDir: {{}}
          containers:
            - name: ray-head
              image: {ray_image}
              imagePullPolicy: Always
              env:
                - name: MY_POD_IP
                  valueFrom:
                    fieldRef:
                      fieldPath: status.podIP
                - name: RAY_REDIS_ADDRESS
                  value: redis-{cluster_id}:6379
                - name: RAY_gcs_rpc_server_reconnect_timeout_s
                  value: "600"
                - name: RAY_num_heartbeats_timeout
                  value: "120"
                - name: RAY_gcs_failover_worker_reconnect_timeout
                  value: "600"
              resources:
                limits:
                  cpu: 2
                requests:
                  cpu: 2
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265 # Ray dashboard
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              volumeMounts:
                - name: script
                  mountPath: /tmp/testing/solution.py
                  subPath: solution.py
                - mountPath: /tmp/ray/
                  name: log-volume
    workerGroupSpecs:
      # the pod replicas in this group typed worker
      - replicas: 12
        minReplicas: 12
        maxReplicas: 12
        # logical group name, for this called small-group, also can be functional
        groupName: small-group
        # if worker pods need to be added, we can simply increment the replicas
        # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
        # the operator will remove pods from the list until the number of replicas is satisfied
        # when a pod is confirmed to be deleted, its name will be removed from the list below
        # scaleStrategy:
        #   workersToDelete:
        #     - raycluster-complete-worker-small-group-bdtwh
        #     - raycluster-complete-worker-small-group-hv457
        #     - raycluster-complete-worker-small-group-k8tj7
        # the following params are used to complete the ray start: ray start --block --node-ip-address= ...
        rayStartParams:
          node-ip-address: $MY_POD_IP
          block: 'true'
          num-cpus: '4' # can be auto-completed from the limits
        # pod template
        template:
          metadata:
            labels:
              key: value
              rayCluster: cluster-{cluster_id}
            # annotations for pod
            annotations:
              key: value
          spec:
            initContainers:
              # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
              - name: init-myservice
                image: busybox:1.28
                command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
            volumes:
              - name: script
                configMap:
                  name: script-{cluster_id}
              - name: log-volume
                emptyDir: {{}}
            containers:
              - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
                image: {ray_image}
                imagePullPolicy: Always
                livenessProbe:
                  initialDelaySeconds: 30
                  periodSeconds: 5
                  timeoutSeconds: 10
                readinessProbe:
                  initialDelaySeconds: 30
                  periodSeconds: 5
                  timeoutSeconds: 10

                # environment variables to set in the container. Optional.
                # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
                env:
                  - name: RAY_DISABLE_DOCKER_CPU_WARNING
                    value: "1"
                  - name: TYPE
                    value: "worker"
                  - name: CPU_REQUEST
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: requests.cpu
                  - name: CPU_LIMITS
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: limits.cpu
                  - name: MEMORY_LIMITS
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: limits.memory
                  - name: MEMORY_REQUESTS
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: requests.memory
                  - name: MY_POD_NAME
                    valueFrom:
                      fieldRef:
                        fieldPath: metadata.name
                  - name: MY_POD_IP
                    valueFrom:
                      fieldRef:
                        fieldPath: status.podIP
                  - name: RAY_gcs_rpc_server_reconnect_timeout_s
                    value: "600"
                  - name: RAY_num_heartbeats_timeout
                    value: "120"
                  - name: RAY_gcs_failover_worker_reconnect_timeout
                    value: "600"
                  - name: RAY_gcs_server_request_timeout_seconds
                    value: "5"
                ports:
                  - containerPort: 80
                    name: client
                lifecycle:
                  preStop:
                    exec:
                      command: ["/bin/sh","-c","ray stop"]
                resources:
                  limits:
                    cpu: "2"
                  requests:
                    cpu: "2"
                volumeMounts:
                  - name: script
                    mountPath: /tmp/testing/solution.py
                    subPath: solution.py
                  - mountPath: /tmp/ray/
                    name: log-volume
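The doubled braces (emptyDir: {{}}) and the {cluster_id}, {ray_image}, {locustfile}, and {solution} placeholders show that this manifest is a Python str.format template rather than ready-to-apply YAML. A minimal rendering sketch follows, assuming a template file name, image tag, and kubectl apply step that are not specified here:

# Rendering sketch only; file names, image tag, and the apply step are assumptions.
import pathlib
import subprocess
import textwrap
import uuid

template = pathlib.Path("ray_serve_ha_template.yaml").read_text()

manifest = template.format(
    cluster_id=uuid.uuid4().hex[:8],
    ray_image="rayproject/ray:nightly",
    # the ConfigMap values are YAML literal blocks, so the scripts must be
    # indented to sit under "locustfile.py: |" / "solution.py: |"
    locustfile=textwrap.indent(pathlib.Path("locustfile.py").read_text(), "    "),
    solution=textwrap.indent(pathlib.Path("solution.py").read_text(), "    "),
)

# {{}} in the template becomes {} after formatting, i.e. emptyDir: {}
subprocess.run(["kubectl", "apply", "-f", "-"], input=manifest, text=True, check=True)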