# This is a RayCluster configuration for exploration of the 100Gi Ray-on-XGBoost workload.

# The configuration includes 1 Ray head pod and 9 Ray worker pods.
# Each Ray container requests 54 Gi memory and 14 CPU.

# For underlying Kubernetes node configuration, we suggest a node group or pool with
# the following features:
# - 10 virtual machines
# - 64 Gi memory and 16 CPU each
#   (AWS: m5.4xlarge, GCP: e2-standard-16, Azure: Standard_D5_v2)
# - Each node should be configured with 1000 Gi of disk space (for data set storage).

# One Ray pod will be scheduled per Kubernetes node.

# The suggested gap between the Ray container resource requests and the K8s node's totals accounts
# for K8s control processes and cloud-provider-specific daemons.
# Optimal resource allocation will depend on your Kubernetes infrastructure and might
# require some experimentation.
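
# As an illustration only (not part of this manifest), a node group like the one described
# above could be provisioned on AWS EKS with an eksctl ClusterConfig along these lines;
# the cluster name, region, and node group name below are placeholders:
#
#   apiVersion: eksctl.io/v1alpha5
#   kind: ClusterConfig
#   metadata:
#     name: my-eks-cluster
#     region: us-west-2
#   managedNodeGroups:
#   - name: ray-xgboost-nodes
#     instanceType: m5.4xlarge
#     desiredCapacity: 10
#     volumeSize: 1000  # GiB of disk per node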
#
# A note on autoscaling:
# If you wish to observe Ray autoscaling in this example, make the following modification
# to your Kubernetes configuration:
# - Configure your Kubernetes node group or pool to autoscale with min 1, max 10 nodes.

# Make the following changes to this configuration file:
# 1. Uncomment the line `enableInTreeAutoscaling: true` in this configuration.
# 2. Under `workerGroupSpecs` set `replicas: 0` and `minReplicas: 0`.
# Alternatively, use the configuration xgboost-benchmark-autoscaler.yaml in this directory;
# the config xgboost-benchmark-autoscaler.yaml already includes the above modifications.
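# For reference, a sketch of how those fields would read with both modifications applied
# (illustrative only; fields omitted here are unchanged):
#
#   spec:
#     enableInTreeAutoscaling: true
#     ...
#     workerGroupSpecs:
#     - replicas: 0
#       minReplicas: 0
#       maxReplicas: 9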

# * The Ray cluster will then start with 0 Ray worker pods. The Ray autoscaler will automatically
#   scale up to 9 worker pods to accommodate the XGBoost-on-Ray workload.
# * The underlying Kubernetes cluster will start with 1 node. The Kubernetes cluster autoscaler will
#   add 9 nodes to accommodate the Ray worker pods.
#
# Shortly after the job is complete, the Ray worker pods and corresponding Kubernetes nodes will
# be scaled down.
---
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  labels:
    controller-tools.k8s.io: "1.0"
  name: raycluster-xgboost-benchmark
spec:
  # Uncomment the next line to experiment with autoscaling.
  # enableInTreeAutoscaling: true
  # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
  rayVersion: '2.0.0'
  headGroupSpec:
    # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
    serviceType: ClusterIP
    rayStartParams:
      dashboard-host: '0.0.0.0'
      block: 'true'
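    # The dashboard-host setting above exposes the Ray dashboard on all interfaces of the head pod.
    # A sketch of one way to reach it locally (assumes the head service created by KubeRay follows
    # the usual <cluster name>-head-svc naming convention):
    #   kubectl port-forward service/raycluster-xgboost-benchmark-head-svc 8265:8265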
    template:
      spec:
        containers:
        # The Ray head container
        - name: ray-head
          image: rayproject/ray-ml:2.0.0
          imagePullPolicy: Always
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
  workerGroupSpecs:
  - replicas: 9
    minReplicas: 9
    maxReplicas: 9
    # To experiment with autoscaling,
    # set replicas and minReplicas to 0.
    # replicas: 0
    # minReplicas: 0
    groupName: large-group
    # the following params are used to complete the ray start: ray start --block
    rayStartParams:
      block: 'true'
    template:
      spec:
        containers:
        - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
          image: rayproject/ray-ml:2.0.0
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              # Slightly less than 16 to accommodate placement on a 16 vCPU virtual machine.
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        # Waits for availability of the Ray head's GCS service.
        initContainers:
        # the env var $RAY_IP is set by the operator, with the value of the head service name
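        # The command below polls DNS until the head service's cluster-local name,
        # $RAY_IP.<namespace>.svc.cluster.local, resolves; the worker's Ray container
        # only starts after this init container succeeds.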
        - name: init-myservice
          image: busybox:1.28
          command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]