# ray/doc/source/cluster/cluster_under_construction/ray-clusters-on-kubernetes/configs/xgboost-benchmark-autoscaler.yaml
# This is a RayCluster configuration for exploration of the 100Gi XGBoost-Ray benchmark workload.
# This configuration modifies the file xgboost-benchmark.yaml in this directory
# to demonstrate autoscaling.
#
# See the discussion in xgboost-benchmark.yaml for further details.
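#
# A minimal usage sketch (assumes the KubeRay operator is already deployed in your Kubernetes cluster):
#   kubectl apply -f xgboost-benchmark-autoscaler.yaml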
---
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  labels:
    controller-tools.k8s.io: "1.0"
  # A unique identifier for the head node and workers of this cluster.
  name: raycluster-xgboost-benchmark
spec:
  # The KubeRay operator will insert the Ray autoscaler sidecar
  # into the Ray head node's pod config:
  enableInTreeAutoscaling: true
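  # With autoscaling enabled, the autoscaler adjusts the worker group's `replicas` between
  # `minReplicas` and `maxReplicas` (see workerGroupSpecs below) to match the resource demand
  # of the running Ray workload.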
  # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
  rayVersion: '2.0.0'
  headGroupSpec:
    serviceType: ClusterIP
    rayStartParams:
      dashboard-host: '0.0.0.0'
      block: 'true'
    template:
      spec:
        containers:
        # The Ray head container
        - name: ray-head
          image: rayproject/ray-ml:2.0.0
          imagePullPolicy: Always
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          ports:
          - containerPort: 6379
            name: gcs
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
  workerGroupSpecs:
  # Start with 0 workers. Allow scaling up to 9 workers.
  - replicas: 0
    minReplicas: 0
    maxReplicas: 9
    groupName: large-group
    # The following params are used to complete the ray start command: ray start --block --node-ip-address= ...
    rayStartParams:
      block: 'true'
    template:
      spec:
        containers:
        - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
          image: rayproject/ray-ml:2.0.0
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              # Slightly less than 16 to accommodate placement on a 16 vCPU virtual machine.
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        # Waits for availability of the Ray head's GCS service.
        initContainers:
        # The env var $RAY_IP is set by the operator, with the value of the head service name.
        - name: init-myservice
          image: busybox:1.28
          command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
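# To observe autoscaling while the benchmark runs, a rough sketch (the head pod name below is
# illustrative; look up the actual name with `kubectl get pods`):
#   kubectl get pods --watch
#   kubectl exec raycluster-xgboost-benchmark-head-xxxxx -- ray status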