# This is a RayCluster configuration for exploration of the 100Gi Ray-on-XGBoost workload.

# The configuration includes 1 Ray head pod and 9 Ray worker pods.
# Each Ray container requests 54 Gi memory and 14 CPU.

# For the underlying Kubernetes node configuration, we suggest a node group or pool with
# the following features:
# - 10 virtual machines
# - 64 Gi memory and 16 CPU each
#   (AWS: m5.4xlarge, GCP: e2-standard-16, Azure: Standard_D5_v2)
# - Each node should be configured with 1000 Gi of disk space (for data set storage).
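# As an illustrative sketch only: on AWS EKS, a node group along these lines could be created
# with eksctl. The cluster and nodegroup names below are placeholders, and the volume size is
# given in GiB; adjust to your environment.
#
#   eksctl create nodegroup \
#     --cluster <your-cluster> \
#     --name xgboost-benchmark-nodes \
#     --node-type m5.4xlarge \
#     --nodes 10 \
#     --node-volume-size 1000
#
# Equivalent node pools can be created with the GCP or Azure tooling using the machine types
# listed above.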

# One Ray pod will be scheduled per Kubernetes node.

# The suggested gap between the Ray container resource requests and the K8s node's totals accounts
# for K8s control processes and cloud-provider-specific daemons.
# Optimal resource allocation will depend on your Kubernetes infrastructure and might
# require some experimentation.
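# (Concretely, with the figures above: each suggested node offers 16 CPU and 64 Gi of memory,
# each Ray container requests 14 CPU and 54 Gi, leaving roughly 2 CPU and 10 Gi of headroom
# per node for those daemons.)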
#
# A note on autoscaling:
# If you wish to observe Ray autoscaling in this example, make the following modification
# to your Kubernetes configuration:
# - Configure your Kubernetes node group or pool to autoscale with min 1, max 10 nodes.

# Make the following changes to this configuration file (sketched just below):
# 1. Uncomment the line `enableInTreeAutoscaling: true` in this configuration.
# 2. Under `workerGroupSpecs`, set `replicas: 0` and `minReplicas: 0`.
# Alternatively, use the configuration xgboost-benchmark-autoscaler.yaml in this directory,
# which already includes the above modifications.
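#
# For reference, a minimal sketch of those two file changes (field names are taken from this
# file; nothing else in the spec needs to change):
#
#   spec:
#     enableInTreeAutoscaling: true
#     ...
#     workerGroupSpecs:
#     - replicas: 0
#       minReplicas: 0
#       maxReplicas: 9
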
# * The Ray cluster will then start with 0 Ray worker pods. The Ray autoscaler will automatically
#   scale up to 9 worker pods to accommodate the XGBoost-on-Ray workload.
# * The underlying Kubernetes cluster will start with 1 node. The Kubernetes cluster autoscaler will
#   scale up to 9 nodes to accommodate the Ray pods.
#
# Shortly after the job is complete, the Ray worker pods and corresponding Kubernetes nodes will
# be scaled down.
---
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  labels:
    controller-tools.k8s.io: "1.0"
  # A unique identifier for the head node and workers of this cluster.
  name: raycluster-xgboost-benchmark
spec:
  # Uncomment the next line to experiment with autoscaling.
  # enableInTreeAutoscaling: true
  # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
  rayVersion: '2.0.0'
  headGroupSpec:
    # Kubernetes Service Type; valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'.
    serviceType: ClusterIP
    rayStartParams:
      dashboard-host: '0.0.0.0'
      block: 'true'
    template:
      spec:
        containers:
        # The Ray head container
        - name: ray-head
          image: rayproject/ray-ml:2.0.0
          imagePullPolicy: Always
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          ports:
          - containerPort: 6379
            name: gcs
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
  workerGroupSpecs:
  - replicas: 9
    minReplicas: 9
    maxReplicas: 9
    # To experiment with autoscaling,
    # set replicas and minReplicas to 0.
    # replicas: 0
    # minReplicas: 0
    groupName: large-group
    # The following params are used to complete the ray start command: ray start --block
    rayStartParams:
      block: 'true'
    template:
      spec:
        containers:
        - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name' or '123-abc')
          image: rayproject/ray-ml:2.0.0
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              # Slightly less than 16 to accommodate placement on a 16-vCPU virtual machine.
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        # Waits for availability of the Ray head's GCS service.
        initContainers:
        # The env var $RAY_IP is set by the operator to the value of the head service name.
        - name: init-myservice
          image: busybox:1.28
          command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
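
# Usage sketch (not part of the RayCluster spec above). This assumes the KubeRay operator is
# already installed and that this file is saved locally as xgboost-benchmark.yaml. The head
# service name below follows the usual KubeRay convention of <cluster-name>-head-svc, so verify
# it with `kubectl get services`; the benchmark script name is a placeholder for your workload.
#
#   kubectl apply -f xgboost-benchmark.yaml
#   kubectl port-forward service/raycluster-xgboost-benchmark-head-svc 8265:8265
#   ray job submit --address http://localhost:8265 -- python <your_xgboost_benchmark_script.py>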