[docs][kubernetes] XGBoost ML example (#27313)
Adds a guide on running an XGBoost-Ray workload using KubeRay. Signed-off-by: Dmitri Gekhtman <dmitri.m.gekhtman@gmail.com>
This commit is contained in:
parent 00d22b6c7c
commit 6efca71c35
9 changed files with 487 additions and 11 deletions
@ -0,0 +1,90 @@
# This is a RayCluster configuration for exploration of the 100Gi XGBoost-on-Ray workload.

# This configuration here modifies the file xgboost-benchmark.yaml in this directory
# to demonstrate autoscaling.
#
# See the discussion in xgboost-benchmark.yaml for further details.
---
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  labels:
    controller-tools.k8s.io: "1.0"
  # A unique identifier for the head node and workers of this cluster.
  name: raycluster-xgboost-benchmark
spec:
  # The KubeRay operator will insert the Ray autoscaler sidecar
  # into the Ray head node's pod config:
  enableInTreeAutoscaling: true
  # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
  rayVersion: '2.0.0'
  headGroupSpec:
    serviceType: ClusterIP
    rayStartParams:
      dashboard-host: '0.0.0.0'
      block: 'true'
    template:
      spec:
        containers:
        # The Ray head container
        - name: ray-head
          image: rayproject/ray-ml:2.0.0
          imagePullPolicy: Always
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          ports:
          - containerPort: 6379
            name: gcs
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
  workerGroupSpecs:
  # Start with 0 workers. Allow scaling up to 9 workers.
  - replicas: 0
    minReplicas: 0
    maxReplicas: 9
    groupName: large-group
    # The following params are used to complete the ray start command: ray start --block --node-ip-address= ...
    rayStartParams:
      block: 'true'
    template:
      spec:
        containers:
        - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
          image: rayproject/ray-ml:2.0.0
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              # Slightly less than 16 to accommodate placement on a 16 vCPU virtual machine.
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        # Waits for availability of the Ray head's GCS service.
        initContainers:
        # The env var $RAY_IP is set by the operator, with the value of the head service name.
        - name: init-myservice
          image: busybox:1.28
          command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
@ -0,0 +1,124 @@
# This is a RayCluster configuration for exploration of the 100Gi XGBoost-on-Ray workload.

# The configuration includes 1 Ray head pod and 9 Ray worker pods.
# Each Ray container requests 54 Gi memory and 14 CPU.

# For underlying Kubernetes node configuration, we suggest a node group or pool with
# the following features:
# - 10 virtual machines
# - 64 Gi memory and 16 CPU each
#   (AWS: m5.4xlarge, GCP: e2-standard-16, Azure: Standard_D5_v2)
# - Each node should be configured with 1000 Gi of disk space (for data set storage).

# One Ray pod will be scheduled per Kubernetes node.

# The suggested gap between the Ray container resource requests and the K8s node's totals accounts
# for K8s control processes and cloud-provider-specific daemons.
# Optimal resource allocation will depend on your Kubernetes infrastructure and might
# require some experimentation.
#
# A note on autoscaling:
# If you wish to observe Ray autoscaling in this example, make the following modifications
# to your Kubernetes configuration:
# - Configure your Kubernetes node group or pool to autoscale with min 1, max 10 nodes.

# Make the following changes to this configuration file:
# 1. Uncomment the line `enableInTreeAutoscaling: true` in this configuration.
# 2. Under `workerGroupSpecs` set `replicas: 0` and `minReplicas: 0`.
# Alternatively, use the configuration xgboost-benchmark-autoscaler.yaml in this directory;
# the config xgboost-benchmark-autoscaler.yaml already includes the above modifications.

# * The Ray cluster will then start with 0 Ray worker pods. The Ray autoscaler will automatically
#   scale up to 9 worker pods to accommodate the XGBoost-on-Ray workload.
# * The underlying Kubernetes cluster will start with 1 node. The Kubernetes cluster autoscaler will
#   add up to 9 nodes to accommodate the Ray pods.
#
# Shortly after the job is complete, the Ray worker pods and corresponding Kubernetes nodes will
# be scaled down.
---
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  labels:
    controller-tools.k8s.io: "1.0"
  # A unique identifier for the head node and workers of this cluster.
  name: raycluster-xgboost-benchmark
spec:
  # Uncomment the next line to experiment with autoscaling.
  # enableInTreeAutoscaling: true
  # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
  rayVersion: '2.0.0'
  headGroupSpec:
    # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
    serviceType: ClusterIP
    rayStartParams:
      dashboard-host: '0.0.0.0'
      block: 'true'
    template:
      spec:
        containers:
        # The Ray head container
        - name: ray-head
          image: rayproject/ray-ml:2.0.0
          imagePullPolicy: Always
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          ports:
          - containerPort: 6379
            name: gcs
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
  workerGroupSpecs:
  - replicas: 9
    minReplicas: 9
    maxReplicas: 9
    # To experiment with autoscaling,
    # set replicas and minReplicas to 0.
    # replicas: 0
    # minReplicas: 0
    groupName: large-group
    # The following params are used to complete the ray start command: ray start --block
    rayStartParams:
      block: 'true'
    template:
      spec:
        containers:
        - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
          image: rayproject/ray-ml:2.0.0
          # Optimal resource allocation will depend on your Kubernetes infrastructure and might
          # require some experimentation.
          # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
          # resource accounting. K8s requests are not used by Ray.
          resources:
            limits:
              # Slightly less than 16 to accommodate placement on a 16 vCPU virtual machine.
              cpu: "14"
              memory: "54Gi"
            requests:
              cpu: "14"
              memory: "54Gi"
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
        # Waits for availability of the Ray head's GCS service.
        initContainers:
        # The env var $RAY_IP is set by the operator, with the value of the head service name.
        - name: init-myservice
          image: busybox:1.28
          command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
@ -0,0 +1,19 @@
from ray.job_submission import JobSubmissionClient

client = JobSubmissionClient("http://127.0.0.1:8265")

kick_off_xgboost_benchmark = (
    # Clone ray. If ray is already present, don't clone again.
    "git clone https://github.com/ray-project/ray || true;"
    # Run the benchmark.
    " python ray/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py"
    " --size 100G --disable-check"
)


submission_id = client.submit_job(
    entrypoint=kick_off_xgboost_benchmark,
)

print("Use the following command to follow this Job's logs:")
print(f"ray job logs '{submission_id}' --follow")
@ -1,4 +1,11 @@
(kuberay-examples)=

# Examples

:::{note}
To learn the basics of Ray on Kubernetes, we recommend taking a look
at the {ref}`introductory guide<kuberay-quickstart>` first.
:::

This section presents example Ray workloads to try out on your Kubernetes cluster.

- {ref}`kuberay-ml-example`
@ -1,10 +1,235 @@
(kuberay-ml-example)=

# XGBoost-Ray on Kubernetes

:::{note}
To learn the basics of Ray on Kubernetes, we recommend taking a look
at the {ref}`introductory guide<kuberay-quickstart>` first.
:::

In this guide, we show you how to run a sample Ray machine learning
workload on Kubernetes infrastructure.

We will run Ray's {ref}`XGBoost training benchmark<xgboost-benchmark>` with a 100 gigabyte training set.
To learn more about XGBoost-Ray, check out that library's {ref}`documentation<xgboost-ray>`.

## Kubernetes infrastructure setup

### Managed Kubernetes services

Running the example in this guide requires basic Kubernetes infrastructure set-up.
We collect helpful links for users who are getting started with a managed Kubernetes service.

:::{tabbed} GKE (Google Cloud)
You can find the landing page for GKE [here](https://cloud.google.com/kubernetes-engine).
If you have an account set up, you can immediately start experimenting with Kubernetes clusters in the provider's console.
Alternatively, check out the [documentation](https://cloud.google.com/kubernetes-engine/docs/) and
[quickstart guides](https://cloud.google.com/kubernetes-engine/docs/deploy-app-cluster). To successfully deploy Ray on Kubernetes,
you will need to configure pools of Kubernetes nodes;
find guidance [here](https://cloud.google.com/kubernetes-engine/docs/concepts/node-pools).
:::

:::{tabbed} EKS (Amazon Web Services)
You can find the landing page for EKS [here](https://aws.amazon.com/eks/).
If you have an account set up, you can immediately start experimenting with Kubernetes clusters in the provider's console.
Alternatively, check out the [documentation](https://docs.aws.amazon.com/eks/latest/userguide/) and
[quickstart guides](https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html). To successfully deploy Ray on Kubernetes,
you will need to configure groups of Kubernetes nodes;
find guidance [here](https://docs.aws.amazon.com/eks/latest/userguide/managed-node-groups.html).
:::

:::{tabbed} AKS (Microsoft Azure)
You can find the landing page for AKS [here](https://azure.microsoft.com/en-us/services/kubernetes-service/).
If you have an account set up, you can immediately start experimenting with Kubernetes clusters in the provider's console.
Alternatively, check out the [documentation](https://docs.microsoft.com/en-us/azure/aks/) and
[quickstart guides](https://docs.microsoft.com/en-us/azure/aks/learn/quick-kubernetes-deploy-portal?tabs=azure-cli). To successfully deploy Ray on Kubernetes,
you will need to configure pools of Kubernetes nodes;
find guidance [here](https://docs.microsoft.com/en-us/azure/aks/use-multiple-node-pools).
:::
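
Whichever provider you use, make sure `kubectl` is pointed at your new cluster before proceeding. Assuming your cloud CLI has already written the cluster's credentials to your kubeconfig, a quick sanity check looks like this:

```shell
# Show the cluster kubectl is currently targeting.
kubectl config current-context

# List the nodes; they should match the node pool or group you configured.
kubectl get nodes
```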

```{admonition} Optional: Autoscaling
This guide includes notes on how to deploy the XGBoost benchmark with optional Ray Autoscaler support.
Here are some considerations to keep in mind when choosing whether to use autoscaling.\
**Autoscaling: Pros**\
_Cope with unknown resource requirements._ If you don't know how much compute your Ray
workload will require, autoscaling can adjust your Ray cluster to the right size.\
_Save on costs._ Idle compute is automatically scaled down, potentially leading to cost savings.\
**Autoscaling: Cons**\
_Less predictable when resource requirements are known._ If you already know exactly
how much compute your workload requires, it makes sense to provision a statically-sized Ray cluster.
In this guide's example, we know that we need 1 Ray head and 9 Ray workers,
so autoscaling is not strictly required.\
_Longer end-to-end runtime._ Autoscaling entails provisioning compute for Ray workers
while the Ray application is running. On the other hand, if you pre-provision a fixed
number of Ray nodes,
all of the Ray nodes can be started in parallel, potentially reducing your application's
runtime.
```

### Set up a node pool for the XGBoost benchmark

For the workload in this guide, it is recommended to use a pool or group of Kubernetes nodes
with the following properties:
- 10 nodes total
- A capacity of 16 CPU and 64 Gi memory per node. For the major cloud providers, suitable instance types include
  * m5.4xlarge (Amazon Web Services)
  * Standard_D5_v2 (Azure)
  * e2-standard-16 (Google Cloud)
- Each node should be configured with 1000 gigabytes of disk space (to store the training set).

```{admonition} Optional: Set up an autoscaling node pool
**If you would like to try running the workload with autoscaling enabled**, use an autoscaling
node group or pool with a 1 node minimum and a 10 node maximum.
The 1 static node will be used to run the Ray head pod. This node may also host the KubeRay
operator and Kubernetes system components. After the workload is submitted, 9 additional nodes will
scale up to accommodate Ray worker pods. These nodes will scale back down after the workload is complete.
```
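
As a concrete sketch, here is one way such a node pool might be created on GKE. The cluster name is a placeholder, and flags such as zone, project, and disk type will vary with your environment, so treat this as a starting point rather than a prescription:

```shell
# Hypothetical GKE cluster with 10 e2-standard-16 nodes and 1000 GB of disk per node.
gcloud container clusters create ray-xgboost-cluster \
  --num-nodes=10 \
  --machine-type=e2-standard-16 \
  --disk-size=1000

# For the optional autoscaling setup, create the cluster with a 1 node minimum
# and a 10 node maximum instead:
# gcloud container clusters create ray-xgboost-cluster \
#   --num-nodes=1 --enable-autoscaling --min-nodes=1 --max-nodes=10 \
#   --machine-type=e2-standard-16 --disk-size=1000
```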

## Deploying the KubeRay operator

Once you have set up your Kubernetes cluster, deploy the KubeRay operator:
```shell
kubectl create -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=v0.3.0-rc.0"
```
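
Before moving on, you can confirm that the operator is running. The namespace and deployment name depend on the KubeRay version and kustomize overlay, so we simply search across all namespaces:

```shell
# The KubeRay operator deployment and pod should appear and reach Ready/Running.
kubectl get deployments --all-namespaces | grep -i kuberay
kubectl get pods --all-namespaces | grep -i kuberay
```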

## Deploying a Ray cluster

Now we're ready to deploy the Ray cluster that will execute our workload.

:::{tip}
The Ray cluster we'll deploy is configured such that one Ray pod will be scheduled
per 16-CPU Kubernetes node. The pattern of one Ray pod per Kubernetes node is encouraged, but not required.
Broadly speaking, it is more efficient to use a few large Ray pods than many small ones.
:::

We recommend taking a look at the config file applied in the following command.
```shell
# Starting from the parent directory of cloned Ray master,
pushd ray/doc/source/cluster/cluster_under_construction/ray-clusters-on-kubernetes/configs/
kubectl apply -f xgboost-benchmark.yaml
popd
```

A Ray head pod and 9 Ray worker pods will be created.
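
You can confirm this by listing the RayCluster resource and its pods; the pod name suffixes are generated, so yours will differ:

```shell
# The RayCluster custom resource created by the command above.
kubectl get raycluster raycluster-xgboost-benchmark

# One head pod and nine worker pods should eventually reach the Running state.
kubectl get pods
```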

```{admonition} Optional: Deploying an autoscaling Ray cluster
If you've set up an autoscaling node group or pool, you may wish to deploy
an autoscaling cluster by applying the config `xgboost-benchmark-autoscaler.yaml`.
One Ray head pod will be created. Once the workload starts, the Ray autoscaler will trigger
creation of Ray worker pods. Kubernetes autoscaling will then create nodes to place the Ray pods.
```

## Running the workload

To observe the startup progress of the Ray head pod, run the following command.

```shell
# If you're on MacOS, first `brew install watch`.
watch -n 1 kubectl get pod
```

Once the Ray head pod enters `Running` state, we are ready to execute the XGBoost workload.
We will use {ref}`Ray Job Submission<jobs-overview>` to kick off the workload.

### Connect to the cluster.

First, we connect to the Job server. Run the following blocking command
in a separate shell.
```shell
kubectl port-forward service/raycluster-xgboost-benchmark-head-svc 8265:8265
```
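
To check that the Job server is reachable through the port-forward, you can hit the Ray dashboard's version endpoint from another shell (this assumes the REST API is served on the forwarded port 8265, as configured above):

```shell
# Should return a small JSON payload containing the Ray version if the tunnel is up.
curl http://127.0.0.1:8265/api/version
```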

### Submit the workload.

We'll use the {ref}`Ray Job Python SDK<ray-job-sdk>` to submit the XGBoost workload.

```{literalinclude} ../doc_code/xgboost_submit.py
:language: python
```

To submit the workload, run the above Python script.
The script is available in the Ray repository.

```shell
# From the parent directory of cloned Ray master.
pushd ray/doc/source/cluster/cluster_under_construction/ray-clusters-on-kubernetes/doc_code/
python xgboost_submit.py
popd
```
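
If you prefer the Ray Jobs CLI to the Python SDK, an equivalent submission might look like the following sketch; the entrypoint mirrors the one in `xgboost_submit.py`, and you may need to adjust it if the benchmark script moves:

```shell
ray job submit --address http://127.0.0.1:8265 -- bash -c \
  "git clone https://github.com/ray-project/ray || true; \
   python ray/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py --size 100G --disable-check"
```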

### Observe progress.

The benchmark may take up to 30 minutes to run.
Use the following tools to observe its progress.

#### Job logs

To follow the job's logs, use the command printed by the above submission script.
```shell
# Substitute the Ray Job's submission id.
ray job logs 'raysubmit_xxxxxxxxxxxxxxxx' --follow
```
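
You can also poll the job's overall state (for example `PENDING`, `RUNNING`, or `SUCCEEDED`) instead of streaming logs:

```shell
# Substitute the Ray Job's submission id.
ray job status 'raysubmit_xxxxxxxxxxxxxxxx'
```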

#### Kubectl

Observe the pods in your cluster with
```shell
# If you're on MacOS, first `brew install watch`.
watch -n 1 kubectl get pod
```

#### Ray Dashboard

View `localhost:8265` in your browser to access the Ray Dashboard.

#### Ray Status

Observe autoscaling status and Ray resource usage with
```shell
# Substitute the name of your Ray cluster's head pod.
watch -n 1 kubectl exec -it raycluster-xgboost-benchmark-head-xxxxx -- ray status
```
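
If you'd rather not copy the head pod's name by hand, you can look it up with a label selector. The label keys below (`ray.io/cluster`, `ray.io/node-type`) are what KubeRay is expected to set on the pods it creates; they may differ across KubeRay versions, so verify them with `kubectl get pods --show-labels` first:

```shell
# Capture the head pod's name, then run `ray status` inside it.
HEAD_POD=$(kubectl get pod \
  -l ray.io/cluster=raycluster-xgboost-benchmark,ray.io/node-type=head \
  -o jsonpath='{.items[0].metadata.name}')
watch -n 1 kubectl exec -it "$HEAD_POD" -- ray status
```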

:::{note}
Under some circumstances and for certain cloud providers,
the K8s API server may become briefly unavailable during Kubernetes
cluster resizing events.

Don't worry if that happens -- the Ray workload should be uninterrupted.
For the example in this guide, wait until the API server is back up, restart the port-forwarding process,
and re-run the job log command.
:::

### Job completion

#### Benchmark results

Once the benchmark is complete, the job log will display the results:

```
Results: {'training_time': 1338.488839321999, 'prediction_time': 403.36653568099973}
```

The performance of the benchmark is sensitive to the underlying cloud infrastructure --
you might not match {ref}`the numbers quoted in the benchmark docs<xgboost-benchmark>`.

#### Model parameters

The file `model.json` in the Ray head pod contains the parameters for the trained model.
Other result data will be available in the directory `ray_results` in the head pod.
Refer to the {ref}`XGBoost-Ray documentation<xgboost-ray>` for details.
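
To inspect the trained model locally, you can copy it out of the head pod. The path below assumes `model.json` lands in the Ray container's default working directory (`/home/ray` in the `rayproject/ray-ml` image); adjust it if the file ends up elsewhere:

```shell
# Substitute the name of your Ray cluster's head pod.
kubectl cp raycluster-xgboost-benchmark-head-xxxxx:/home/ray/model.json ./model.json
```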

```{admonition} Scale-down
If autoscaling is enabled, Ray worker pods will scale down after 60 seconds of idleness.
After the Ray worker pods are gone, your Kubernetes infrastructure should scale down
the nodes that hosted these pods.
```

#### Clean-up

Delete your Ray cluster with the following command:
```shell
kubectl delete raycluster raycluster-xgboost-benchmark
```
If you're on a public cloud, don't forget to clean up the underlying
node group and/or Kubernetes cluster.
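
If you also want to remove the KubeRay operator itself, you can delete the same kustomize resources that were created during setup:

```shell
kubectl delete -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=v0.3.0-rc.0"
```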
@ -85,8 +85,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
    "! git clone https://github.com/ray-project/kuberay -b release-0.3\n",
    "\n",
    "# This creates the KubeRay operator and all of the resources it needs.\n",
    "! kubectl create -k kuberay/ray-operator/config/default\n",
@ -29,6 +29,17 @@ The Ray docs present all the information you need to start running Ray workloads
       :type: ref
       :text: Get Started with Ray on Kubernetes
       :classes: btn-outline-info btn-block
    ---
    **Getting started**
    ^^^

    Try example Ray workloads on Kubernetes.

    +++
    .. link-button:: kuberay-examples
       :type: ref
       :text: Example workloads
       :classes: btn-outline-info btn-block
```

## The KubeRay project
@ -74,6 +74,7 @@ We test out the performance across different cluster sizes and data sizes.
     - 331 s (786k rows/s)
     - `python xgboost_benchmark.py --size 100GB`

.. _xgboost-benchmark:

XGBoost training
----------------
@ -255,4 +256,4 @@ Performance may vary greatly across different model, hardware, and cluster configurations.
.. _`Pytorch comparison GPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4.yaml
.. _`Tensorflow comparison training script`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py
.. _`Tensorflow comparison CPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_cpu_4.yaml
.. _`Tensorflow comparison GPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4.yaml
@ -5,7 +5,7 @@ More Ray ML Libraries
Going forward, make sure that all "Ray Lightning" and XGBoost topics are in one document or group,
and not next to each other.

Ray has a variety of additional integrations with ecosystem libraries.

- :ref:`ray-joblib`
- :ref:`lightgbm-ray`