
Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

125 lines
5.1 KiB
Raw Normal View History

# This is a RayCluster configuration for exploration of the 100Gi XGBoost-on-Ray workload.
# The configuration includes 1 Ray head pod and 9 Ray worker pods.
# Each Ray container requests 54 Gi memory and 14 CPU.
# For underlying Kubernetes node configuration, we suggest a node group or pool with
# the following features:
# - 10 virtual machines
# - 64 Gi memory and 16 CPU each
# (AWS: m5.4xlarge, GCP: e2-standard-16, Azure: Standard_D5_v2)
# - Each node should be configured with 1000 Gi of disk space (for data set storage).
# One Ray pod will be scheduled per Kubernetes node.
# The suggested gap between the Ray container resource requests and the K8s node's totals accounts
# for K8s control processes and cloud-provider-specific daemons.
# Optimal resource allocation will depend on your Kubernetes infrastructure and might
# require some experimentation.
# A note on autoscaling:
# If you wish to observe Ray autoscaling in this example, make the following modification
# to your Kubernetes configuration:
# - Configure your Kubernetes node group or pool to autoscale with min 1, max 10 nodes.
# Make the following changes to this configuration file:
# 1. Uncomment the line `enableInTreeAutoscaling: true` in this configuration.
# 2. Under `workerGroupSpecs` set `replicas: 0` and `minReplicas: 0`.
# Alternatively, use the configuration xgboost-benchmark-autoscaler.yaml in this directory;
# the config xgboost-benchmark-autoscaler.yaml already includes the above modifications.
# * The Ray cluster will then start with 0 Ray worker pods. The Ray autoscaler will automatically
# scale up to 9 worker pods to accommodate the XGBoost-on-Ray workload.
# * The underlying Kubernetes cluster will start with 1 node. The Kubernetes cluster autoscaler will
# scale up to 10 nodes (1 for the Ray head pod, 9 for the Ray worker pods) to accommodate the Ray pods.
# Shortly after the job is complete, the Ray worker pods and corresponding Kubernetes nodes will
# be scaled down.
# RayCluster manifest for the XGBoost benchmark: one Ray head container (ray-head) and one
# worker group (large-group) pinned to 9 replicas.
# NOTE(review): the nesting of the keys below is not visible in this view; verify the
# indentation and the enclosing parent keys against the original manifest before applying.
apiVersion: ray.io/v1alpha1
kind: RayCluster
controller-tools.k8s.io: "1.0"
# A unique identifier for the head node and workers of this cluster.
name: raycluster-xgboost-benchmark
# Uncomment the next line to experiment with autoscaling.
# enableInTreeAutoscaling: true
# The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
rayVersion: '2.0.0'
# Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
serviceType: ClusterIP
dashboard-host: ''
block: 'true'
# The Ray head container
- name: ray-head
image: rayproject/ray-ml:2.0.0
imagePullPolicy: Always
# Optimal resource allocation will depend on your Kubernetes infrastructure and might
# require some experimentation.
# Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
# resource accounting. K8s requests are not used by Ray.
# NOTE(review): cpu/memory appear twice with identical values; presumably one pair is the
# container's limits and the other its requests — the enclosing keys are not visible here, confirm.
cpu: "14"
memory: "54Gi"
cpu: "14"
memory: "54Gi"
# Ports exposed by the head container, named gcs, dashboard and client.
- containerPort: 6379
name: gcs
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
# Runs `ray stop` in a shell.
# NOTE(review): presumably a container shutdown hook; the enclosing keys are not visible — confirm.
command: ["/bin/sh","-c","ray stop"]
# The worker group: replicas, minReplicas and maxReplicas are all 9, so the group size is fixed.
- replicas: 9
minReplicas: 9
maxReplicas: 9
# To experiment with autoscaling,
# set replicas and minReplicas to 0.
# replicas: 0
# minReplicas: 0
groupName: large-group
# The following params are used to complete the ray start command: ray start --block
block: 'true'
- name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
image: rayproject/ray-ml:2.0.0
# Optimal resource allocation will depend on your Kubernetes infrastructure and might
# require some experimentation.
# Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
# resource accounting. K8s requests are not used by Ray.
# Slightly less than 16 to accommodate placement on a 16 vCPU virtual machine.
# NOTE(review): as for the head container, the duplicated cpu/memory pair is presumably
# limits and requests — confirm against the original manifest.
cpu: "14"
memory: "54Gi"
cpu: "14"
memory: "54Gi"
# Runs `ray stop` in a shell.
# NOTE(review): presumably a container shutdown hook; the enclosing keys are not visible — confirm.
command: ["/bin/sh","-c","ray stop"]
# Waits for availability of the Ray head's GCS service.
# The env var $RAY_IP is set by the operator, with the value of the head service name.
# The loop below retries nslookup on the head service's cluster-local DNS name every 2 seconds
# until the lookup succeeds.
- name: init-myservice
image: busybox:1.28
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]