# This is a RayCluster configuration for exploration of the 100Gi Ray-on-XGBoost workload.

# The configuration includes 1 Ray head pod and 9 Ray worker pods.
# Each Ray container requests 54 Gi memory and 14 CPU.

# For underlying Kubernetes node configuration, we suggest a node group or pool with
# the following features:
# - 10 virtual machines
# - 64 Gi memory and 16 CPU each
#   (AWS: m5.4xlarge, GCP: e2-standard-16, Azure: Standard_D5_v2)
# - Each node should be configured with 1000 Gi of disk space (for data set storage).

# One Ray pod will be scheduled per Kubernetes node.

# The suggested gap between the Ray container resource requests and the K8s node's totals accounts
# for K8s control processes and cloud-provider-specific daemons.
# Optimal resource allocation will depend on your Kubernetes infrastructure and might
# require some experimentation.
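
# As an illustration only (not part of this manifest), a node group like the one described
# above could be provisioned on AWS EKS with an eksctl ClusterConfig along these lines;
# the cluster name, region, and node group name below are placeholders:
#
#   apiVersion: eksctl.io/v1alpha5
#   kind: ClusterConfig
#   metadata:
#     name: my-eks-cluster
#     region: us-west-2
#   managedNodeGroups:
#   - name: ray-xgboost-nodes
#     instanceType: m5.4xlarge
#     desiredCapacity: 10
#     volumeSize: 1000  # GiB of disk per node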
#
# A note on autoscaling:
# If you wish to observe Ray autoscaling in this example, make the following modification
# to your Kubernetes configuration:
# - Configure your Kubernetes node group or pool to autoscale with min 1, max 10 nodes.

# Make the following changes to this configuration file:
# 1. Uncomment the line `enableInTreeAutoscaling: true` in this configuration.
# 2. Under `workerGroupSpecs` set `replicas: 0` and `minReplicas: 0`.
# Alternatively, use the configuration xgboost-benchmark-autoscaler.yaml in this directory;
# the config xgboost-benchmark-autoscaler.yaml already includes the above modifications.
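# For reference, a sketch of how those fields would read with both modifications applied
# (illustrative only; fields omitted here are unchanged):
#
#   spec:
#     enableInTreeAutoscaling: true
#     ...
#     workerGroupSpecs:
#     - replicas: 0
#       minReplicas: 0
#       maxReplicas: 9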

# * The Ray cluster will then start with 0 Ray worker pods. The Ray autoscaler will automatically
#   scale up to 9 worker pods to accommodate the XGBoost-on-Ray workload.
# * The underlying Kubernetes cluster will start with 1 node. The Kubernetes cluster autoscaler will
#   add 9 nodes to accommodate the Ray worker pods.
#
# Shortly after the job is complete, the Ray worker pods and corresponding Kubernetes nodes will
# be scaled down.
---
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  labels:
    controller-tools.k8s.io: "1.0"
  name: raycluster-xgboost-benchmark
spec:
  # Uncomment the next line to experiment with autoscaling.
  # enableInTreeAutoscaling: true
  # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
  rayVersion: '2.0.0'
  headGroupSpec:
    # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
    serviceType: ClusterIP
    rayStartParams:
      dashboard-host: '0.0.0.0'
      block: 'true'
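    # The dashboard-host setting above exposes the Ray dashboard on all interfaces of the head pod.
    # A sketch of one way to reach it locally (assumes the head service created by KubeRay follows
    # the usual <cluster name>-head-svc naming convention):
    #   kubectl port-forward service/raycluster-xgboost-benchmark-head-svc 8265:8265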
    template:
      spec:
        containers:
        # The Ray head container
        - name: ray-head
          image: rayproject/ray-ml:2.0.0
          imagePullPolicy: Always
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
  workerGroupSpecs:
  - replicas: 9
    minReplicas: 9
    maxReplicas: 9
    # To experiment with autoscaling,
    # set replicas and minReplicas to 0.
    # replicas: 0
    # minReplicas: 0
    groupName: large-group
    # the following params are used to complete the ray start: ray start --block
    rayStartParams:
      block: 'true'
    template:
      spec:
        containers:
        - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
          image: rayproject/ray-ml:2.0.0
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              # Slightly less than 16 to accommodate placement on a 16 vCPU virtual machine.
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        # Waits for availability of the Ray head's GCS service.
        initContainers:
        # the env var $RAY_IP is set by the operator, with the value of the head service name
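        # The command below polls DNS until the head service's cluster-local name,
        # $RAY_IP.<namespace>.svc.cluster.local, resolves; the worker's Ray container
        # only starts after this init container succeeds.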
        - name: init-myservice
          image: busybox:1.28
          command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]