[docs][kubernetes] XGBoost ML example (#27313)
Adds a guide on running an XGBoost-Ray workload using KubeRay.

Signed-off-by: Dmitri Gekhtman <dmitri.m.gekhtman@gmail.com>
This commit is contained in:
parent
00d22b6c7c
commit
6efca71c35
9 changed files with 487 additions and 11 deletions

@ -0,0 +1,90 @@
# This is a RayCluster configuration for exploration of the 100Gi XGBoost-on-Ray workload.

# This configuration modifies the file xgboost-benchmark.yaml in this directory
# to demonstrate autoscaling.
#
# See the discussion in xgboost-benchmark.yaml for further details.
---
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  labels:
    controller-tools.k8s.io: "1.0"
  # A unique identifier for the head node and workers of this cluster.
  name: raycluster-xgboost-benchmark
spec:
  # The KubeRay operator will insert the Ray autoscaler sidecar
  # into the Ray head node's pod config:
  enableInTreeAutoscaling: true
  # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
  rayVersion: '2.0.0'
  headGroupSpec:
    serviceType: ClusterIP
    rayStartParams:
      dashboard-host: '0.0.0.0'
      block: 'true'
    template:
      spec:
        containers:
        # The Ray head container.
        - name: ray-head
          image: rayproject/ray-ml:2.0.0
          imagePullPolicy: Always
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          ports:
          - containerPort: 6379
            name: gcs
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
  workerGroupSpecs:
  # Start with 0 workers. Allow scaling up to 9 workers.
  - replicas: 0
    minReplicas: 0
    maxReplicas: 9
    groupName: large-group
    # The following params are used to complete the ray start command: ray start --block --node-ip-address= ...
    rayStartParams:
      block: 'true'
    template:
      spec:
        containers:
        - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
          image: rayproject/ray-ml:2.0.0
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              # Slightly less than 16 to accommodate placement on a 16 vCPU virtual machine.
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        # Waits for availability of the Ray head's GCS service.
        initContainers:
        # The env var $RAY_IP is set by the operator, with the value of the head service name.
        - name: init-myservice
          image: busybox:1.28
          command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]

@ -0,0 +1,124 @@
# This is a RayCluster configuration for exploration of the 100Gi XGBoost-on-Ray workload.

# The configuration includes 1 Ray head pod and 9 Ray worker pods.
# Each Ray container requests 54 Gi memory and 14 CPU.

# For the underlying Kubernetes node configuration, we suggest a node group or pool with
# the following features:
# - 10 virtual machines
# - 64 Gi memory and 16 CPU each
#   (AWS: m5.4xlarge, GCP: e2-standard-16, Azure: Standard_D5_v2)
# - Each node should be configured with 1000 Gi of disk space (for training set storage).

# One Ray pod will be scheduled per Kubernetes node.

# The suggested gap between the Ray container resource requests and the K8s node's totals accounts
# for K8s control processes and cloud-provider-specific daemons.
# Optimal resource allocation will depend on your Kubernetes infrastructure and might
# require some experimentation.
#
# A note on autoscaling:
# If you wish to observe Ray autoscaling in this example, make the following modifications
# to your Kubernetes configuration:
# - Configure your Kubernetes node group or pool to autoscale with min 1, max 10 nodes.
#
# Then make the following changes to this configuration file:
# 1. Uncomment the line `enableInTreeAutoscaling: true` below.
# 2. Under `workerGroupSpecs`, set `replicas: 0` and `minReplicas: 0`.
# Alternatively, use the configuration xgboost-benchmark-autoscaler.yaml in this directory,
# which already includes the above modifications.
#
# * The Ray cluster will then start with 0 Ray worker pods. The Ray autoscaler will automatically
#   scale up to 9 worker pods to accommodate the XGBoost-on-Ray workload.
# * The underlying Kubernetes cluster will start with 1 node. The Kubernetes cluster autoscaler
#   will scale up 9 additional nodes to accommodate the Ray pods.
#
# Shortly after the job is complete, the Ray worker pods and the corresponding Kubernetes nodes
# will be scaled down.
---
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  labels:
    controller-tools.k8s.io: "1.0"
  # A unique identifier for the head node and workers of this cluster.
  name: raycluster-xgboost-benchmark
spec:
  # Uncomment the next line to experiment with autoscaling.
  # enableInTreeAutoscaling: true
  # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
  rayVersion: '2.0.0'
  headGroupSpec:
    # Kubernetes Service Type. Valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'.
    serviceType: ClusterIP
    rayStartParams:
      dashboard-host: '0.0.0.0'
      block: 'true'
    template:
      spec:
        containers:
        # The Ray head container.
        - name: ray-head
          image: rayproject/ray-ml:2.0.0
          imagePullPolicy: Always
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          ports:
          - containerPort: 6379
            name: gcs
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
  workerGroupSpecs:
  - replicas: 9
    minReplicas: 9
    maxReplicas: 9
    # To experiment with autoscaling,
    # set replicas and minReplicas to 0.
    # replicas: 0
    # minReplicas: 0
    groupName: large-group
    # The following params are used to complete the ray start command: ray start --block
    rayStartParams:
      block: 'true'
    template:
      spec:
        containers:
        - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
          image: rayproject/ray-ml:2.0.0
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              # Slightly less than 16 to accommodate placement on a 16 vCPU virtual machine.
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        # Waits for availability of the Ray head's GCS service.
        initContainers:
        # The env var $RAY_IP is set by the operator, with the value of the head service name.
        - name: init-myservice
          image: busybox:1.28
          command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]

@ -0,0 +1,19 @@
from ray.job_submission import JobSubmissionClient

# Connect to the Ray Job server (here reached via the port-forwarded head service).
client = JobSubmissionClient("http://127.0.0.1:8265")

kick_off_xgboost_benchmark = (
    # Clone ray. If ray is already present, don't clone again.
    "git clone https://github.com/ray-project/ray || true;"
    # Run the benchmark.
    " python ray/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py"
    " --size 100G --disable-check"
)

submission_id = client.submit_job(
    entrypoint=kick_off_xgboost_benchmark,
)

print("Use the following command to follow this Job's logs:")
print(f"ray job logs '{submission_id}' --follow")

@ -1,4 +1,11 @@
(kuberay-examples)=

# Examples

:::{note}
To learn the basics of Ray on Kubernetes, we recommend taking a look
at the {ref}`introductory guide<kuberay-quickstart>` first.
:::

This section presents example Ray workloads to try out on your Kubernetes cluster.

- {ref}`kuberay-ml-example`

@ -1,10 +1,235 @@
(kuberay-ml-example)=

# XGBoost-Ray on Kubernetes

:::{note}
To learn the basics of Ray on Kubernetes, we recommend taking a look
at the {ref}`introductory guide<kuberay-quickstart>` first.
:::

In this guide, we show you how to run a sample Ray machine learning
workload on Kubernetes infrastructure.

We will run Ray's {ref}`XGBoost training benchmark<xgboost-benchmark>` with a 100 gigabyte training set.
To learn more about XGBoost-Ray, check out that library's {ref}`documentation<xgboost-ray>`.

## Kubernetes infrastructure setup

### Managed Kubernetes services

Running the example in this guide requires basic Kubernetes infrastructure setup.
We collect helpful links for users who are getting started with a managed Kubernetes service.

:::{tabbed} GKE (Google Cloud)
You can find the landing page for GKE [here](https://cloud.google.com/kubernetes-engine).
If you have an account set up, you can immediately start experimenting with Kubernetes clusters in the provider's console.
Alternatively, check out the [documentation](https://cloud.google.com/kubernetes-engine/docs/) and
[quickstart guides](https://cloud.google.com/kubernetes-engine/docs/deploy-app-cluster). To successfully deploy Ray on Kubernetes,
you will need to configure pools of Kubernetes nodes;
find guidance [here](https://cloud.google.com/kubernetes-engine/docs/concepts/node-pools).
:::

:::{tabbed} EKS (Amazon Web Services)
You can find the landing page for EKS [here](https://aws.amazon.com/eks/).
If you have an account set up, you can immediately start experimenting with Kubernetes clusters in the provider's console.
Alternatively, check out the [documentation](https://docs.aws.amazon.com/eks/latest/userguide/) and
[quickstart guides](https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html). To successfully deploy Ray on Kubernetes,
you will need to configure groups of Kubernetes nodes;
find guidance [here](https://docs.aws.amazon.com/eks/latest/userguide/managed-node-groups.html).
:::

:::{tabbed} AKS (Microsoft Azure)
You can find the landing page for AKS [here](https://azure.microsoft.com/en-us/services/kubernetes-service/).
If you have an account set up, you can immediately start experimenting with Kubernetes clusters in the provider's console.
Alternatively, check out the [documentation](https://docs.microsoft.com/en-us/azure/aks/) and
[quickstart guides](https://docs.microsoft.com/en-us/azure/aks/learn/quick-kubernetes-deploy-portal?tabs=azure-cli). To successfully deploy Ray on Kubernetes,
you will need to configure pools of Kubernetes nodes;
find guidance [here](https://docs.microsoft.com/en-us/azure/aks/use-multiple-node-pools).
:::

```{admonition} Optional: Autoscaling
This guide includes notes on how to deploy the XGBoost benchmark with optional Ray Autoscaler support.
Here are some considerations to keep in mind when choosing whether to use autoscaling.\
**Autoscaling: Pros**\
_Cope with unknown resource requirements._ If you don't know how much compute your Ray
workload will require, autoscaling can adjust your Ray cluster to the right size.\
_Save on costs._ Idle compute is automatically scaled down, potentially leading to cost savings.\
**Autoscaling: Cons**\
_Less predictable when resource requirements are known._ If you already know exactly
how much compute your workload requires, it makes sense to provision a statically sized Ray cluster.
In this guide's example, we know that we need 1 Ray head and 9 Ray workers,
so autoscaling is not strictly required.\
_Longer end-to-end runtime._ Autoscaling entails provisioning compute for Ray workers
while the Ray application is running. On the other hand, if you pre-provision a fixed
number of Ray nodes, all of the Ray nodes can be started in parallel,
potentially reducing your application's runtime.
```

### Set up a node pool for the XGBoost benchmark

For the workload in this guide, it is recommended to use a pool or group of Kubernetes nodes
with the following properties:
- 10 nodes total
- A capacity of 16 CPU and 64 Gi memory per node. For the major cloud providers, suitable instance types include
  * m5.4xlarge (Amazon Web Services)
  * Standard_D5_v2 (Azure)
  * e2-standard-16 (Google Cloud)
- Each node should be configured with 1000 gigabytes of disk space (to store the training set).
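
As an illustration, a node pool with these properties might be provisioned on GKE with the
`gcloud` CLI as sketched below. This is a sketch, not a definitive recipe: the cluster name
`my-cluster` is a hypothetical placeholder, and EKS and AKS expose equivalent settings through
their own tooling.

```shell
# A minimal sketch, assuming an existing GKE cluster named `my-cluster` (hypothetical name).
gcloud container node-pools create xgboost-benchmark-pool \
    --cluster my-cluster \
    --machine-type e2-standard-16 \
    --num-nodes 10 \
    --disk-size 1000
```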

```{admonition} Optional: Set up an autoscaling node pool
**If you would like to try running the workload with autoscaling enabled**, use an autoscaling
node group or pool with a 1 node minimum and a 10 node maximum.
The 1 static node will be used to run the Ray head pod. This node may also host the KubeRay
operator and Kubernetes system components. After the workload is submitted, 9 additional nodes will
scale up to accommodate Ray worker pods. These nodes will scale back down after the workload is complete.
```
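
On GKE, the autoscaling variant of the node pool sketched above might look like the following
(again a sketch with the hypothetical cluster name `my-cluster`; other providers expose
equivalent minimum/maximum settings):

```shell
# A sketch, assuming GKE. Start with 1 node; allow scaling between 1 and 10 nodes.
gcloud container node-pools create xgboost-benchmark-pool \
    --cluster my-cluster \
    --machine-type e2-standard-16 \
    --disk-size 1000 \
    --num-nodes 1 \
    --enable-autoscaling --min-nodes 1 --max-nodes 10
```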

## Deploying the KubeRay operator

Once you have set up your Kubernetes cluster, deploy the KubeRay operator:
```shell
kubectl create -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=v0.3.0-rc.0"
```
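
Before moving on, you can optionally verify the installation. This is a sketch; the operator's
namespace depends on the kustomize configuration, so searching all namespaces is a safe default:

```shell
# Confirm that the RayCluster custom resource definition is registered.
kubectl get crd rayclusters.ray.io
# Confirm that the operator pod is running (typically placed in the `ray-system` namespace).
kubectl get pods -A | grep kuberay-operator
```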

## Deploying a Ray cluster

Now we're ready to deploy the Ray cluster that will execute our workload.

:::{tip}
The Ray cluster we'll deploy is configured such that one Ray pod will be scheduled
per 16-CPU Kubernetes node. The pattern of one Ray pod per Kubernetes node is encouraged, but not required.
Broadly speaking, it is more efficient to use a few large Ray pods than many small ones.
:::

We recommend taking a look at the config file applied in the following command.
```shell
# Starting from the parent directory of cloned Ray master,
pushd ray/doc/source/cluster/cluster_under_construction/ray-clusters-on-kubernetes/configs/
kubectl apply -f xgboost-benchmark.yaml
popd
```

A Ray head pod and 9 Ray worker pods will be created.
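
To confirm, you can list the RayCluster custom resource and the pods it manages:

```shell
# The RayCluster created by the config above.
kubectl get raycluster
# The head and worker pods; all 10 should eventually reach the Running state.
kubectl get pods
```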

```{admonition} Optional: Deploying an autoscaling Ray cluster
If you've set up an autoscaling node group or pool, you may wish to deploy
an autoscaling cluster by applying the config `xgboost-benchmark-autoscaler.yaml`.
One Ray head pod will be created. Once the workload starts, the Ray autoscaler will trigger
creation of Ray worker pods. Kubernetes autoscaling will then create nodes to place the Ray pods.
```

## Running the workload

To observe the startup progress of the Ray head pod, run the following command.

```shell
# If you're on macOS, first `brew install watch`.
watch -n 1 kubectl get pod
```

Once the Ray head pod enters the `Running` state, we are ready to execute the XGBoost workload.
We will use {ref}`Ray Job Submission<jobs-overview>` to kick off the workload.

### Connect to the cluster.

First, we connect to the Job server. Run the following blocking command
in a separate shell.
```shell
kubectl port-forward service/raycluster-xgboost-benchmark-head-svc 8265:8265
```
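
With the port-forward in place, you can optionally verify that the Job server is reachable
before submitting anything; the `/api/version` endpoint is served by the Ray dashboard:

```shell
# Should return a small JSON payload containing the Ray version.
curl http://127.0.0.1:8265/api/version
```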

### Submit the workload.

We'll use the {ref}`Ray Job Python SDK<ray-job-sdk>` to submit the XGBoost workload.

```{literalinclude} ../doc_code/xgboost_submit.py
:language: python
```

To submit the workload, run the above Python script.
The script is available in the Ray repository.

```shell
# From the parent directory of cloned Ray master.
pushd ray/doc/source/cluster/cluster_under_construction/ray-clusters-on-kubernetes/doc_code/
python xgboost_submit.py
popd
```
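
The Python SDK is one of several equivalent interfaces to Ray Job Submission. As a sketch,
the submitted job can also be inspected from the Ray Jobs CLI, assuming the port-forward from
the previous step is still running and substituting the submission id printed by the script:

```shell
# Check the job's status (PENDING, RUNNING, SUCCEEDED, ...).
ray job status --address http://127.0.0.1:8265 'raysubmit_xxxxxxxxxxxxxxxx'
```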

### Observe progress.

The benchmark may take up to 30 minutes to run.
Use the following tools to observe its progress.

#### Job logs

To follow the job's logs, use the command printed by the above submission script.
```shell
# Substitute the Ray Job's submission id.
ray job logs 'raysubmit_xxxxxxxxxxxxxxxx' --follow
```

#### Kubectl

Observe the pods in your cluster with
```shell
# If you're on macOS, first `brew install watch`.
watch -n 1 kubectl get pod
```

#### Ray Dashboard

View `localhost:8265` in your browser to access the Ray Dashboard.

#### Ray Status

Observe autoscaling status and Ray resource usage with
```shell
# Substitute the name of your Ray cluster's head pod.
watch -n 1 kubectl exec -it raycluster-xgboost-benchmark-head-xxxxx -- ray status
```

:::{note}
Under some circumstances and for certain cloud providers,
the K8s API server may become briefly unavailable during Kubernetes
cluster resizing events.

Don't worry if that happens -- the Ray workload should be uninterrupted.
For the example in this guide, wait until the API server is back up, restart the port-forwarding process,
and re-run the job log command.
:::

### Job completion

#### Benchmark results

Once the benchmark is complete, the job log will display the results:

```
Results: {'training_time': 1338.488839321999, 'prediction_time': 403.36653568099973}
```

The performance of the benchmark is sensitive to the underlying cloud infrastructure --
you might not match {ref}`the numbers quoted in the benchmark docs<xgboost-benchmark>`.

#### Model parameters

The file `model.json` in the Ray head pod contains the parameters for the trained model.
Other result data will be available in the directory `ray_results` in the head pod.
Refer to the {ref}`XGBoost-Ray documentation<xgboost-ray>` for details.
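
To inspect the model locally, you can copy it out of the head pod with `kubectl cp`.
This is a sketch: substitute the head pod's name, and note that the exact location of
`model.json` depends on the job's working directory (check with `kubectl exec <head-pod> -- ls`).

```shell
# Substitute the name of your Ray cluster's head pod; adjust the path as needed.
kubectl cp raycluster-xgboost-benchmark-head-xxxxx:model.json ./model.json
```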

```{admonition} Scale-down
If autoscaling is enabled, Ray worker pods will scale down after 60 seconds of idleness.
After the Ray worker pods are gone, your Kubernetes infrastructure should scale down
the nodes that hosted these pods.
```

#### Clean-up

Delete your Ray cluster with the following command:
```shell
kubectl delete raycluster raycluster-xgboost-benchmark
```
If you're on a public cloud, don't forget to clean up the underlying
node group and/or Kubernetes cluster.
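
If you also want to remove the KubeRay operator itself, it can be deleted with the same
kustomize reference used to create it:

```shell
kubectl delete -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=v0.3.0-rc.0"
```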

@ -85,8 +85,7 @@
"metadata": {},
"outputs": [],
"source": [
"! git clone https://github.com/ray-project/kuberay -b release-0.3\n",
"\n",
"# This creates the KubeRay operator and all of the resources it needs.\n",
"! kubectl create -k kuberay/ray-operator/config/default\n",

@ -29,6 +29,17 @@ The Ray docs present all the information you need to start running Ray workloads
    :type: ref
    :text: Get Started with Ray on Kubernetes
    :classes: btn-outline-info btn-block
---
**Getting started**
^^^

Try example Ray workloads on Kubernetes.

+++
.. link-button:: kuberay-examples
    :type: ref
    :text: Example workloads
    :classes: btn-outline-info btn-block
```
## The KubeRay project

@ -74,6 +74,7 @@ We test out the performance across different cluster sizes and data sizes.
- 331 s (786k rows/s)
- `python xgboost_benchmark.py --size 100GB`

.. _xgboost-benchmark:

XGBoost training
----------------

@ -255,4 +256,4 @@ Performance may vary greatly across different model, hardware, and cluster configurations.
.. _`Pytorch comparison GPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4.yaml
.. _`Tensorflow comparison training script`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py
.. _`Tensorflow comparison CPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_cpu_4.yaml
.. _`Tensorflow comparison GPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4.yaml

@ -5,7 +5,7 @@ More Ray ML Libraries
Going forward, make sure that all "Ray Lightning" and XGBoost topics are in one document or group,
and not next to each other.

Ray has a variety of additional integrations with ecosystem libraries.

- :ref:`ray-joblib`
- :ref:`lightgbm-ray`